Merge pull request #1451 from paperless-ngx/feature-better-redo-ocr

Feature: Even better re-do of OCR
2026-02-03 23:22:42 -06:00 · 2022-08-25 17:01:54 -07:00
parent 4f6ad5369b 059242b7dc
commit 75991f4268
5 changed files with 70 additions and 146 deletions
--- a/src/documents/management/commands/document_archiver.py
+++ b/src/documents/management/commands/document_archiver.py
@@ -1,85 +1,18 @@
-import hashlib
 import logging
 import multiprocessing
 import os
-import shutil
-import uuid

 import tqdm
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand
-from django.db import transaction
 from documents.models import Document
-from filelock import FileLock
-
-from ... import index
-from ...file_handling import create_source_path_directory
-from ...file_handling import generate_unique_filename
-from ...parsers import get_parser_class_for_mime_type
+from documents.tasks import update_document_archive_file


 logger = logging.getLogger("paperless.management.archiver")


-def handle_document(document_id):
-    document = Document.objects.get(id=document_id)
-
-    mime_type = document.mime_type
-
-    parser_class = get_parser_class_for_mime_type(mime_type)
-
-    if not parser_class:
-        logger.error(
-            f"No parser found for mime type {mime_type}, cannot "
-            f"archive document {document} (ID: {document_id})",
-        )
-        return
-
-    parser = parser_class(logging_group=uuid.uuid4())
-
-    try:
-        parser.parse(document.source_path, mime_type, document.get_public_filename())
-
-        thumbnail = parser.get_thumbnail(
-            document.source_path,
-            mime_type,
-            document.get_public_filename(),
-        )
-
-        if parser.get_archive_path():
-            with transaction.atomic():
-                with open(parser.get_archive_path(), "rb") as f:
-                    checksum = hashlib.md5(f.read()).hexdigest()
-                # I'm going to save first so that in case the file move
-                # fails, the database is rolled back.
-                # We also don't use save() since that triggers the filehandling
-                # logic, and we don't want that yet (file not yet in place)
-                document.archive_filename = generate_unique_filename(
-                    document,
-                    archive_filename=True,
-                )
-                Document.objects.filter(pk=document.pk).update(
-                    archive_checksum=checksum,
-                    content=parser.get_text(),
-                    archive_filename=document.archive_filename,
-                )
-                with FileLock(settings.MEDIA_LOCK):
-                    create_source_path_directory(document.archive_path)
-                    shutil.move(parser.get_archive_path(), document.archive_path)
-                    shutil.move(thumbnail, document.thumbnail_path)
-
-            with index.open_index_writer() as writer:
-                index.update_document(writer, document)
-
-    except Exception:
-        logger.exception(
-            f"Error while parsing document {document} " f"(ID: {document_id})",
-        )
-    finally:
-        parser.cleanup()
-
-
 class Command(BaseCommand):

    help = """
@@ -146,7 +79,7 @@ class Command(BaseCommand):
            with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                list(
                    tqdm.tqdm(
-                        pool.imap_unordered(handle_document, document_ids),
+                        pool.imap_unordered(update_document_archive_file, document_ids),
                        total=len(document_ids),
                        disable=options["no_progress_bar"],
                    ),
--- a/src/documents/management/commands/document_redo_ocr.py
+++ b/src/documents/management/commands/document_redo_ocr.py
@@ -1,35 +0,0 @@
-import tqdm
-from django.core.management.base import BaseCommand
-from documents.tasks import redo_ocr
-
-
-class Command(BaseCommand):
-
-    help = """
-        This will rename all documents to match the latest filename format.
-    """.replace(
-        "    ",
-        "",
-    )
-
-    def add_arguments(self, parser):
-
-        parser.add_argument(
-            "--no-progress-bar",
-            default=False,
-            action="store_true",
-            help="If set, the progress bar will not be shown",
-        )
-
-        parser.add_argument(
-            "documents",
-            nargs="+",
-            help="Document primary keys for re-processing OCR on",
-        )
-
-    def handle(self, *args, **options):
-        doc_pks = tqdm.tqdm(
-            options["documents"],
-            disable=options["no_progress_bar"],
-        )
-        redo_ocr(doc_pks)