diff --git a/src/documents/management/commands/document_redo_ocr.py b/src/documents/management/commands/document_redo_ocr.py
index 3ead5a485..1e44e6134 100644
--- a/src/documents/management/commands/document_redo_ocr.py
+++ b/src/documents/management/commands/document_redo_ocr.py
@@ -1,14 +1,6 @@
-import shutil
-from pathlib import Path
-from typing import Type
-
 import tqdm
-from django.core.exceptions import ObjectDoesNotExist
 from django.core.management.base import BaseCommand
-from documents.models import Document
-from documents.parsers import DocumentParser
-from documents.parsers import get_parser_class_for_mime_type
-from documents.parsers import ParseError
+from documents.tasks import redo_ocr
 
 
 class Command(BaseCommand):
@@ -36,47 +28,8 @@ class Command(BaseCommand):
         )
 
     def handle(self, *args, **options):
-
-        all_docs = Document.objects.all()
-
-        for doc_pk in tqdm.tqdm(
+        doc_pks = tqdm.tqdm(
             options["documents"],
             disable=options["no_progress_bar"],
-        ):
-            try:
-                self.stdout.write(f"Parsing document {doc_pk}")
-                doc: Document = all_docs.get(pk=doc_pk)
-            except ObjectDoesNotExist:
-                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
-                continue
-
-            # Get the correct parser for this mime type
-            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
-                doc.mime_type,
-            )
-            document_parser: DocumentParser = parser_class(
-                "redo-ocr",
-            )
-
-            # Create a file path to copy the original file to for working on
-            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
-
-            shutil.copy(doc.source_path, temp_file)
-
-            try:
-                self.stdout.write(
-                    f"Using {type(document_parser).__name__} for document",
-                )
-                # Try to re-parse the document into text
-                document_parser.parse(str(temp_file), doc.mime_type)
-
-                doc.content = document_parser.get_text()
-                doc.save()
-                self.stdout.write("Document OCR updated")
-
-            except ParseError as e:
-                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
-            finally:
-                # Remove the file path if it was created
-                if temp_file.exists() and temp_file.is_file():
-                    temp_file.unlink()
+        )
+        redo_ocr(doc_pks)
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 4c57b2eee..1070471ba 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -2,13 +2,17 @@ import logging
 import os
 import shutil
 import tempfile
+from pathlib import Path
 from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used
+from typing import Optional
+from typing import Type
 
 import magic
 import tqdm
 from asgiref.sync import async_to_sync
 from channels.layers import get_channel_layer
 from django.conf import settings
+from django.core.exceptions import ObjectDoesNotExist
 from django.db.models.signals import post_save
 from documents import index
 from documents import sanity_checker
@@ -21,6 +25,9 @@ from documents.models import Document
 from documents.models import DocumentType
 from documents.models import StoragePath
 from documents.models import Tag
+from documents.parsers import DocumentParser
+from documents.parsers import get_parser_class_for_mime_type
+from documents.parsers import ParseError
 from documents.sanity_checker import SanityCheckFailedException
 from pdf2image import convert_from_path
 from pikepdf import Pdf
@@ -359,3 +366,69 @@ def bulk_update_documents(document_ids):
     with AsyncWriter(ix) as writer:
         for doc in documents:
             index.update_document(writer, doc)
+
+
+def redo_ocr(document_ids):
+    """
+    Re-parse the given documents and store the freshly extracted text.
+
+    Each document's original file is copied into its parser's scratch
+    directory and parsed again; the resulting text replaces
+    ``Document.content``.  A problem with one document (unknown id, no
+    parser for its mime type, unreadable file, parse failure) is logged
+    and the remaining documents are still processed.
+
+    :param document_ids: iterable of Document primary keys
+    """
+    all_docs = Document.objects.all()
+
+    for doc_pk in document_ids:
+        try:
+            logger.info(f"Parsing document {doc_pk}")
+            doc: Document = all_docs.get(pk=doc_pk)
+        except ObjectDoesNotExist:
+            logger.error(f"Document {doc_pk} does not exist")
+            continue
+
+        # Get the correct parser for this mime type; None means no parser
+        # is registered for it
+        parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
+            doc.mime_type,
+        )
+        if parser_class is None:
+            logger.error(
+                f"No parser available for document {doc_pk} ({doc.mime_type})",
+            )
+            continue
+
+        document_parser: DocumentParser = parser_class(
+            "redo-ocr",
+        )
+
+        try:
+            # Create a file path to copy the original file to for working on
+            temp_file = (
+                Path(document_parser.tempdir) / Path("new-ocr-file")
+            ).resolve()
+
+            try:
+                shutil.copy(doc.source_path, temp_file)
+            except OSError as e:
+                logger.error(f"Error copying document {doc_pk}: {e}")
+                continue
+
+            logger.info(
+                f"Using {type(document_parser).__name__} for document",
+            )
+            # Try to re-parse the document into text
+            document_parser.parse(str(temp_file), doc.mime_type)
+
+            doc.content = document_parser.get_text()
+            doc.save()
+            logger.info("Document OCR updated")
+
+        except ParseError as e:
+            logger.error(f"Error parsing document: {e}")
+        finally:
+            # Remove the parser's scratch directory, which also holds the copy
+            document_parser.cleanup()