From ab761e837c4be4974f699c8c97560a4291a8d298 Mon Sep 17 00:00:00 2001 From: Trenton Holmes Date: Sun, 21 Aug 2022 18:20:59 -0700 Subject: [PATCH] Implements a better re-do of OCR by making the document archiver function common. Actually creates updated file now --- src/documents/bulk_edit.py | 6 +- .../management/commands/document_archiver.py | 71 +------------- .../management/commands/document_redo_ocr.py | 35 ------- src/documents/tasks.py | 94 ++++++++++++------- src/documents/tests/test_management.py | 10 +- 5 files changed, 70 insertions(+), 146 deletions(-) delete mode 100644 src/documents/management/commands/document_redo_ocr.py diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py index babd5f3b4..0cf0daf3e 100644 --- a/src/documents/bulk_edit.py +++ b/src/documents/bulk_edit.py @@ -122,6 +122,10 @@ def delete(doc_ids): def redo_ocr(doc_ids): - async_task("documents.tasks.redo_ocr", document_ids=doc_ids) + for document_id in doc_ids: + async_task( + "documents.tasks.update_document_archive_file", + document_id=document_id, + ) return "OK" diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index c51f1baeb..fa78a1963 100644 --- a/src/documents/management/commands/document_archiver.py +++ b/src/documents/management/commands/document_archiver.py @@ -1,85 +1,18 @@ -import hashlib import logging import multiprocessing import os -import shutil -import uuid import tqdm from django import db from django.conf import settings from django.core.management.base import BaseCommand -from django.db import transaction from documents.models import Document -from filelock import FileLock - -from ... import index -from ...file_handling import create_source_path_directory -from ...file_handling import generate_unique_filename -from ...parsers import get_parser_class_for_mime_type +from documents.tasks import update_document_archive_file logger = logging.getLogger("paperless.management.archiver") -def handle_document(document_id): - document = Document.objects.get(id=document_id) - - mime_type = document.mime_type - - parser_class = get_parser_class_for_mime_type(mime_type) - - if not parser_class: - logger.error( - f"No parser found for mime type {mime_type}, cannot " - f"archive document {document} (ID: {document_id})", - ) - return - - parser = parser_class(logging_group=uuid.uuid4()) - - try: - parser.parse(document.source_path, mime_type, document.get_public_filename()) - - thumbnail = parser.get_thumbnail( - document.source_path, - mime_type, - document.get_public_filename(), - ) - - if parser.get_archive_path(): - with transaction.atomic(): - with open(parser.get_archive_path(), "rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() - # I'm going to save first so that in case the file move - # fails, the database is rolled back. - # We also don't use save() since that triggers the filehandling - # logic, and we don't want that yet (file not yet in place) - document.archive_filename = generate_unique_filename( - document, - archive_filename=True, - ) - Document.objects.filter(pk=document.pk).update( - archive_checksum=checksum, - content=parser.get_text(), - archive_filename=document.archive_filename, - ) - with FileLock(settings.MEDIA_LOCK): - create_source_path_directory(document.archive_path) - shutil.move(parser.get_archive_path(), document.archive_path) - shutil.move(thumbnail, document.thumbnail_path) - - with index.open_index_writer() as writer: - index.update_document(writer, document) - - except Exception: - logger.exception( - f"Error while parsing document {document} " f"(ID: {document_id})", - ) - finally: - parser.cleanup() - - class Command(BaseCommand): help = """ @@ -146,7 +79,7 @@ class Command(BaseCommand): with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: list( tqdm.tqdm( - pool.imap_unordered(handle_document, document_ids), + pool.imap_unordered(update_document_archive_file, document_ids), total=len(document_ids), disable=options["no_progress_bar"], ), diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py deleted file mode 100644 index 1e44e6134..000000000 --- a/src/documents/management/commands/document_redo_ocr.py +++ /dev/null @@ -1,35 +0,0 @@ -import tqdm -from django.core.management.base import BaseCommand -from documents.tasks import redo_ocr - - -class Command(BaseCommand): - - help = """ - This will rename all documents to match the latest filename format. - """.replace( - " ", - "", - ) - - def add_arguments(self, parser): - - parser.add_argument( - "--no-progress-bar", - default=False, - action="store_true", - help="If set, the progress bar will not be shown", - ) - - parser.add_argument( - "documents", - nargs="+", - help="Document primary keys for re-processing OCR on", - ) - - def handle(self, *args, **options): - doc_pks = tqdm.tqdm( - options["documents"], - disable=options["no_progress_bar"], - ) - redo_ocr(doc_pks) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 35404587d..b1793e760 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -1,6 +1,8 @@ +import hashlib import logging import os import shutil +import uuid from pathlib import Path from typing import Type @@ -8,7 +10,7 @@ import tqdm from asgiref.sync import async_to_sync from channels.layers import get_channel_layer from django.conf import settings -from django.core.exceptions import ObjectDoesNotExist +from django.db import transaction from django.db.models.signals import post_save from documents import barcodes from documents import index @@ -17,6 +19,8 @@ from documents.classifier import DocumentClassifier from documents.classifier import load_classifier from documents.consumer import Consumer from documents.consumer import ConsumerError +from documents.file_handling import create_source_path_directory +from documents.file_handling import generate_unique_filename from documents.models import Correspondent from documents.models import Document from documents.models import DocumentType @@ -24,8 +28,8 @@ from documents.models import StoragePath from documents.models import Tag from documents.parsers import DocumentParser from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import ParseError from documents.sanity_checker import SanityCheckFailedException +from filelock import FileLock from whoosh.writing import AsyncWriter @@ -213,44 +217,62 @@ def bulk_update_documents(document_ids): index.update_document(writer, doc) -def redo_ocr(document_ids): - all_docs = Document.objects.all() +def update_document_archive_file(document_id): + """ + Re-creates the archive file of a document, including new OCR content and thumbnail + """ + document = Document.objects.get(id=document_id) - for doc_pk in document_ids: - try: - logger.info(f"Parsing document {doc_pk}") - doc: Document = all_docs.get(pk=doc_pk) - except ObjectDoesNotExist: - logger.error(f"Document {doc_pk} does not exist") - continue + mime_type = document.mime_type - # Get the correct parser for this mime type - parser_class: Type[DocumentParser] = get_parser_class_for_mime_type( - doc.mime_type, + parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type) + + if not parser_class: + logger.error( + f"No parser found for mime type {mime_type}, cannot " + f"archive document {document} (ID: {document_id})", ) - document_parser: DocumentParser = parser_class( - "redo-ocr", + return + + parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) + + try: + parser.parse(document.source_path, mime_type, document.get_public_filename()) + + thumbnail = parser.get_thumbnail( + document.source_path, + mime_type, + document.get_public_filename(), ) - # Create a file path to copy the original file to for working on - temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve() + if parser.get_archive_path(): + with transaction.atomic(): + with open(parser.get_archive_path(), "rb") as f: + checksum = hashlib.md5(f.read()).hexdigest() + # I'm going to save first so that in case the file move + # fails, the database is rolled back. + # We also don't use save() since that triggers the filehandling + # logic, and we don't want that yet (file not yet in place) + document.archive_filename = generate_unique_filename( + document, + archive_filename=True, + ) + Document.objects.filter(pk=document.pk).update( + archive_checksum=checksum, + content=parser.get_text(), + archive_filename=document.archive_filename, + ) + with FileLock(settings.MEDIA_LOCK): + create_source_path_directory(document.archive_path) + shutil.move(parser.get_archive_path(), document.archive_path) + shutil.move(thumbnail, document.thumbnail_path) - shutil.copy(doc.source_path, temp_file) + with index.open_index_writer() as writer: + index.update_document(writer, document) - try: - logger.info( - f"Using {type(document_parser).__name__} for document", - ) - # Try to re-parse the document into text - document_parser.parse(str(temp_file), doc.mime_type) - - doc.content = document_parser.get_text() - doc.save() - logger.info("Document OCR updated") - - except ParseError as e: - logger.error(f"Error parsing document: {e}") - finally: - # Remove the file path if it was created - if temp_file.exists() and temp_file.is_file(): - temp_file.unlink() + except Exception: + logger.exception( + f"Error while parsing document {document} " f"(ID: {document_id})", + ) + finally: + parser.cleanup() diff --git a/src/documents/tests/test_management.py b/src/documents/tests/test_management.py index 76a5459b5..fe217676b 100644 --- a/src/documents/tests/test_management.py +++ b/src/documents/tests/test_management.py @@ -10,8 +10,8 @@ from django.core.management import call_command from django.test import override_settings from django.test import TestCase from documents.file_handling import generate_filename -from documents.management.commands.document_archiver import handle_document from documents.models import Document +from documents.tasks import update_document_archive_file from documents.tests.utils import DirectoriesMixin @@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, TestCase): os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"), ) - handle_document(doc.pk) + update_document_archive_file(doc.pk) doc = Document.objects.get(id=doc.id) @@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, TestCase): doc.save() shutil.copy(sample_file, doc.source_path) - handle_document(doc.pk) + update_document_archive_file(doc.pk) doc = Document.objects.get(id=doc.id) @@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, TestCase): os.path.join(self.dirs.originals_dir, f"document_01.pdf"), ) - handle_document(doc2.pk) - handle_document(doc1.pk) + update_document_archive_file(doc2.pk) + update_document_archive_file(doc1.pk) doc1 = Document.objects.get(id=doc1.id) doc2 = Document.objects.get(id=doc2.id)