Saves work on a new management command to re-OCR a file

This commit is contained in:
Trenton Holmes 2022-05-31 09:33:09 -07:00 committed by Michael Shamoon
parent 4f176682dc
commit dfd16c5187

View File

@ -0,0 +1,69 @@
import logging
import shutil
from pathlib import Path
from typing import Type
from django.core.exceptions import ObjectDoesNotExist
from django.core.management.base import BaseCommand
from documents.models import Document
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
class Command(BaseCommand):
    """Management command that re-parses (re-OCRs) the given documents.

    For every primary key passed on the command line, the document's source
    file is copied into the parser's temporary directory, parsed again, and
    the extracted text is saved back to the document's ``content`` field.
    """

    help = (
        "This will re-run OCR on the given documents and update their "
        "stored content with the newly parsed text."
    )

    def add_arguments(self, parser):
        """Register the positional list of document primary keys."""
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Re-parse each requested document and store the new text.

        Documents that do not exist, or that fail with ``ParseError``, are
        reported on stdout and skipped; the remaining documents are still
        processed.
        """
        # Only let errors through the first root handler while this runs.
        # Guarded so an unconfigured root logger does not raise IndexError.
        root_handlers = logging.getLogger().handlers
        if root_handlers:
            root_handlers[0].setLevel(logging.ERROR)

        all_docs = Document.objects.all()

        # Parsed command-line values arrive in ``options``; ``args`` is a
        # plain tuple and has no ``documents`` attribute.
        for doc_pk in options["documents"]:
            self.stdout.write(f"Parsing document {doc_pk}")
            try:
                doc: Document = all_docs.get(pk=doc_pk)
            except ObjectDoesNotExist:
                self.stdout.write(
                    self.style.ERROR(f"Document {doc_pk} does not exist"),
                )
                continue

            # Get the correct parser for this mime type
            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
                doc.mime_type,
            )
            # NOTE(review): defensive guard - presumably the lookup can yield
            # None for an unsupported mime type; confirm against the helper.
            if parser_class is None:
                self.stdout.write(
                    self.style.ERROR(
                        f"No parser available for document {doc_pk} "
                        f"({doc.mime_type})",
                    ),
                )
                continue

            document_parser: DocumentParser = parser_class(
                "redo-ocr",
            )

            # Work on a copy inside the parser's temp directory so the
            # original file is never handed to the parser directly.
            # NOTE(review): only the copied file is removed below; the temp
            # directory itself is left in place - confirm whether the parser
            # offers a clean-up hook that should be called here.
            temp_file = (
                Path(document_parser.tempdir) / Path("new-ocr-file")
            ).resolve()

            try:
                # The copy is inside the try so a partial copy is cleaned up
                shutil.copy(doc.source_path, temp_file)

                # Try to re-parse the document into text
                document_parser.parse(str(temp_file), doc.mime_type)
                doc.content = document_parser.get_text()
                doc.save()
            except ParseError as e:
                self.stdout.write(
                    self.style.ERROR(f"Error parsing document: {e}"),
                )
            finally:
                # Remove the working copy if it was created
                if temp_file.is_file():
                    temp_file.unlink()