import logging
import shutil
from pathlib import Path
from typing import Type

from django.core.exceptions import ObjectDoesNotExist
from django.core.management.base import BaseCommand
from documents.models import Document
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError


class Command(BaseCommand):
    """Re-parse selected documents and store the freshly extracted text.

    For every primary key given on the command line the document's source
    file is copied into the parser's temporary directory, parsed again, and
    the resulting text is written to ``Document.content``.
    """

    help = (
        "Re-run parsing (OCR) for the given documents and replace their "
        "stored content with the newly extracted text."
    )

    def add_arguments(self, parser):
        """Register the positional list of document primary keys."""
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Re-parse each requested document and save its new content.

        Documents that do not exist, have no usable parser, or fail to parse
        are reported on stdout and skipped; the remaining ones are still
        processed.
        """
        # Quieten the first root handler so only errors reach the console.
        # The root logger may have no handlers configured, so guard the index.
        root_handlers = logging.getLogger().handlers
        if root_handlers:
            root_handlers[0].setLevel(logging.ERROR)

        all_docs = Document.objects.all()

        # Parsed command-line values are delivered in ``options``; ``args`` is
        # a plain tuple and has no ``documents`` attribute.
        for doc_pk in options["documents"]:
            self.stdout.write(f"Parsing document {doc_pk}")

            try:
                doc: Document = all_docs.get(pk=doc_pk)
            except ObjectDoesNotExist:
                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
                continue

            # Get the correct parser for this mime type
            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
                doc.mime_type,
            )
            # NOTE(review): presumably the lookup yields None when no parser is
            # registered for the mime type - confirm against documents.parsers.
            if parser_class is None:
                self.stdout.write(
                    self.style.ERROR(
                        f"No parser available for document {doc_pk} "
                        f"({doc.mime_type})",
                    ),
                )
                continue

            document_parser: DocumentParser = parser_class(
                "redo-ocr",
            )

            # Work on a copy inside the parser's temp dir so the original file
            # is never touched by the parser.
            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()

            try:
                # Copy inside the try so a partially written copy is still
                # removed by the finally clause below.
                shutil.copy(doc.source_path, temp_file)

                # Try to re-parse the document into text
                document_parser.parse(str(temp_file), doc.mime_type)

                doc.content = document_parser.get_text()
                doc.save()

            except ParseError as e:
                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
            finally:
                # Remove the working copy if it was created; is_file() is
                # already False for a path that does not exist.
                # NOTE(review): the parser's temp directory itself is left in
                # place here - confirm whether DocumentParser offers a cleanup
                # hook that should be called as well.
                if temp_file.is_file():
                    temp_file.unlink()