Implements a better re-do of OCR by making the document archiver function common. Actually creates updated file now

2025-12-16 01:31:09 -06:00 · 2022-08-21 18:20:59 -07:00
parent 87cf215f3e
commit 059242b7dc
5 changed files with 70 additions and 146 deletions
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -1,6 +1,8 @@
+import hashlib
 import logging
 import os
 import shutil
+import uuid
 from pathlib import Path
 from typing import Type

@@ -8,7 +10,7 @@ import tqdm
 from asgiref.sync import async_to_sync
 from channels.layers import get_channel_layer
 from django.conf import settings
-from django.core.exceptions import ObjectDoesNotExist
+from django.db import transaction
 from django.db.models.signals import post_save
 from documents import barcodes
 from documents import index
@@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
 from documents.consumer import Consumer
 from documents.consumer import ConsumerError
+from documents.file_handling import create_source_path_directory
+from documents.file_handling import generate_unique_filename
 from documents.models import Correspondent
 from documents.models import Document
 from documents.models import DocumentType
@@ -24,8 +28,8 @@ from documents.models import StoragePath
 from documents.models import Tag
 from documents.parsers import DocumentParser
 from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
 from documents.sanity_checker import SanityCheckFailedException
+from filelock import FileLock
 from whoosh.writing import AsyncWriter


@@ -213,44 +217,62 @@ def bulk_update_documents(document_ids):
            index.update_document(writer, doc)


-def redo_ocr(document_ids):
-    all_docs = Document.objects.all()
+def update_document_archive_file(document_id):
+    """
+    Re-creates the archive file of a document, including new OCR content and thumbnail
+    """
+    document = Document.objects.get(id=document_id)

-    for doc_pk in document_ids:
-        try:
-            logger.info(f"Parsing document {doc_pk}")
-            doc: Document = all_docs.get(pk=doc_pk)
-        except ObjectDoesNotExist:
-            logger.error(f"Document {doc_pk} does not exist")
-            continue
+    mime_type = document.mime_type

-        # Get the correct parser for this mime type
-        parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
-            doc.mime_type,
+    parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
+
+    if not parser_class:
+        logger.error(
+            f"No parser found for mime type {mime_type}, cannot "
+            f"archive document {document} (ID: {document_id})",
        )
-        document_parser: DocumentParser = parser_class(
-            "redo-ocr",
+        return
+
+    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
+
+    try:
+        parser.parse(document.source_path, mime_type, document.get_public_filename())
+
+        thumbnail = parser.get_thumbnail(
+            document.source_path,
+            mime_type,
+            document.get_public_filename(),
        )

-        # Create a file path to copy the original file to for working on
-        temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
+        if parser.get_archive_path():
+            with transaction.atomic():
+                with open(parser.get_archive_path(), "rb") as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+                # I'm going to save first so that in case the file move
+                # fails, the database is rolled back.
+                # We also don't use save() since that triggers the filehandling
+                # logic, and we don't want that yet (file not yet in place)
+                document.archive_filename = generate_unique_filename(
+                    document,
+                    archive_filename=True,
+                )
+                Document.objects.filter(pk=document.pk).update(
+                    archive_checksum=checksum,
+                    content=parser.get_text(),
+                    archive_filename=document.archive_filename,
+                )
+                with FileLock(settings.MEDIA_LOCK):
+                    create_source_path_directory(document.archive_path)
+                    shutil.move(parser.get_archive_path(), document.archive_path)
+                    shutil.move(thumbnail, document.thumbnail_path)

-        shutil.copy(doc.source_path, temp_file)
+            with index.open_index_writer() as writer:
+                index.update_document(writer, document)

-        try:
-            logger.info(
-                f"Using {type(document_parser).__name__} for document",
-            )
-            # Try to re-parse the document into text
-            document_parser.parse(str(temp_file), doc.mime_type)
-
-            doc.content = document_parser.get_text()
-            doc.save()
-            logger.info("Document OCR updated")
-
-        except ParseError as e:
-            logger.error(f"Error parsing document: {e}")
-        finally:
-            # Remove the file path if it was created
-            if temp_file.exists() and temp_file.is_file():
-                temp_file.unlink()
+    except Exception:
+        logger.exception(
+            f"Error while parsing document {document} " f"(ID: {document_id})",
+        )
+    finally:
+        parser.cleanup()