Saves work on a new management command to re-OCR a file

2026-01-30 23:08:59 -06:00 · 2022-05-31 09:33:09 -07:00
parent 4f176682dc
commit dfd16c5187
1 changed files with 69 additions and 0 deletions
--- a/src/documents/management/commands/document_redo_ocr.py
+++ b/src/documents/management/commands/document_redo_ocr.py
@@ -0,0 +1,69 @@
 import logging
 import shutil
 from pathlib import Path
 from typing import Type
 from django.core.exceptions import ObjectDoesNotExist
 from django.core.management.base import BaseCommand
 from documents.models import Document
 from documents.parsers import DocumentParser
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import ParseError
 class Command(BaseCommand):
    """Management command that re-parses selected documents and stores the new text.

    For every primary key given on the command line the original file is
    copied into the parser's temporary directory, parsed again, and the
    resulting text is written back to ``Document.content``.
    """

    help = """
        This will re-run OCR on the given documents and replace their
        stored content with the newly extracted text.
    """.replace(
        "    ",
        "",
    )

    def add_arguments(self, parser):
        """Register the positional list of document primary keys."""
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Re-parse each requested document and save its refreshed content.

        Documents that do not exist, have no usable parser, cannot be copied
        or fail to parse are reported on stdout and skipped; the remaining
        documents are still processed.
        """
        # Only show errors from the first root handler; guard against a
        # logging setup that has no root handlers at all.
        root_handlers = logging.getLogger().handlers
        if root_handlers:
            root_handlers[0].level = logging.ERROR
        all_docs = Document.objects.all()
        # Parsed command-line arguments arrive in ``options``; ``args`` is a
        # plain tuple and has no ``documents`` attribute.
        for doc_pk in options["documents"]:
            try:
                self.stdout.write(f"Parsing document {doc_pk}")
                doc: Document = all_docs.get(pk=doc_pk)
            except ObjectDoesNotExist:
                self.stdout.write(self.style.ERROR(f"Document {doc_pk} does not exist"))
                continue
            # Get the correct parser for this mime type
            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
                doc.mime_type,
            )
            # NOTE(review): the lookup presumably yields None when no parser
            # handles the mime type - confirm; guarding avoids calling None.
            if parser_class is None:
                self.stdout.write(
                    self.style.ERROR(
                        f"No parser available for document {doc_pk} "
                        f"({doc.mime_type})",
                    ),
                )
                continue
            document_parser: DocumentParser = parser_class(
                "redo-ocr",
            )
            # Create a file path to copy the original file to for working on
            temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
            try:
                shutil.copy(doc.source_path, temp_file)
            except OSError as e:
                # A missing/unreadable original must not abort the whole batch
                self.stdout.write(
                    self.style.ERROR(f"Error copying document {doc_pk}: {e}"),
                )
                continue
            try:
                # Try to re-parse the document into text
                document_parser.parse(str(temp_file), doc.mime_type)
                doc.content = document_parser.get_text()
                doc.save()
            except ParseError as e:
                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
            finally:
                # Remove the working copy if it was created
                if temp_file.exists() and temp_file.is_file():
                    temp_file.unlink()