A handy script to redo OCR on all documents

2026-02-11 23:59:31 -06:00 · 2020-11-03 14:04:11 +01:00
parent 5686f76ef2
commit 9757e261f2
3 changed files with 95 additions and 41 deletions
--- a/src/documents/management/commands/document_rerun_ocr.py
+++ b/src/documents/management/commands/document_rerun_ocr.py
@@ -0,0 +1,60 @@
+import argparse
+import threading
+from multiprocessing import Pool
+from multiprocessing.pool import ThreadPool
+
+from django.core.management.base import BaseCommand
+
+from documents.consumer import Consumer
+from documents.models import Log, Document
+from documents.parsers import get_parser_class
+
+
+def process_document(doc):
+    parser_class = get_parser_class(doc.file_name)
+    if not parser_class:
+        print("no parser available")
+    else:
+        print("Parser: {}".format(parser_class.__name__))
+        parser = parser_class(doc.source_path, None)
+        try:
+            text = parser.get_text()
+            doc.content = text
+            doc.save()
+        finally:
+            parser.cleanup()
+
+
+def document_index(value):
+    ivalue = int(value)
+    if not (1 <= ivalue <= Document.objects.count()):
+        raise argparse.ArgumentTypeError(
+            "{} is not a valid document index (out of range)".format(value))
+
+    return ivalue
+
+
+class Command(BaseCommand):
+
+    help = "Performs OCR on all documents again!"
+
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "-s", "--start_index",
+            default=None,
+            type=document_index
+        )
+
+    def handle(self, *args, **options):
+
+        docs = Document.objects.all().order_by("added")
+
+        indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
+
+        for i in indices:
+            doc = docs[i]
+            print("==================================")
+            print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
+            print("==================================")
+            process_document(doc)