mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Saves work on a new management command to re-ocr a file
This commit is contained in:
parent
4f176682dc
commit
dfd16c5187
69
src/documents/management/commands/document_redo_ocr.py
Normal file
69
src/documents/management/commands/document_redo_ocr.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
|
from django.core.exceptions import ObjectDoesNotExist
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from documents.models import Document
|
||||||
|
from documents.parsers import DocumentParser
|
||||||
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
|
from documents.parsers import ParseError
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command that re-runs parsing (OCR) for selected documents.

    For every primary key given on the command line, the document's source
    file is copied into the parser's temporary directory, parsed again, and
    the text returned by the parser is saved to the document's ``content``.
    """

    # The previous help text was copied from a different command and described
    # renaming files, which this command does not do.
    help = (
        "Re-runs OCR on the documents with the given primary keys and "
        "updates their stored content with the newly extracted text."
    )

    def add_arguments(self, parser):
        """Register the positional ``documents`` argument (one or more keys)."""
        parser.add_argument(
            "documents",
            nargs="+",
            help="Document primary keys for re-processing OCR on",
        )

    def handle(self, *args, **options):
        """Re-parse each requested document and store the extracted text.

        Documents that do not exist, have no usable parser, or fail to parse
        are reported on stdout and skipped; processing continues with the
        next primary key.
        """
        # Only raise the level of the first root handler, and only if one is
        # actually configured (indexing an empty handler list would raise).
        root_handlers = logging.getLogger().handlers
        if root_handlers:
            root_handlers[0].setLevel(logging.ERROR)

        all_docs = Document.objects.all()

        # Parsed command-line arguments arrive in ``options``; ``args`` is a
        # plain tuple and has no ``documents`` attribute.
        for doc_pk in options["documents"]:
            self.stdout.write(f"Parsing document {doc_pk}")
            try:
                doc: Document = all_docs.get(pk=doc_pk)
            except ObjectDoesNotExist:
                self.stdout.write(
                    self.style.ERROR(f"Document {doc_pk} does not exist"),
                )
                continue

            # Get the correct parser for this mime type
            parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
                doc.mime_type,
            )
            # NOTE(review): guards against the lookup yielding no parser class;
            # confirm that the helper signals "unsupported" by returning None.
            if parser_class is None:
                self.stdout.write(
                    self.style.ERROR(
                        f"No parser available for document {doc_pk} "
                        f"(mime type {doc.mime_type})",
                    ),
                )
                continue

            document_parser: DocumentParser = parser_class("redo-ocr")

            # Work on a copy inside the parser's temporary directory so the
            # original file is never touched by the parser.
            temp_file = (Path(document_parser.tempdir) / "new-ocr-file").resolve()

            try:
                # Copy inside the try so that a partially written copy is
                # still removed by the finally block below.
                shutil.copy(doc.source_path, temp_file)

                # Try to re-parse the document into text
                document_parser.parse(str(temp_file), doc.mime_type)

                doc.content = document_parser.get_text()
                doc.save()
            except ParseError as e:
                self.stdout.write(self.style.ERROR(f"Error parsing document: {e}"))
            finally:
                # Remove the working copy if it was created; is_file() is
                # already False for a path that does not exist.
                # NOTE(review): the parser's temporary directory itself is not
                # removed here — check whether the parser offers a cleanup hook.
                if temp_file.is_file():
                    temp_file.unlink()
|
Loading…
x
Reference in New Issue
Block a user