A handy script to redo OCR on all documents

Jonas Winkler 2020-11-03 14:04:11 +01:00
parent 6f3d25d7b1
commit f4cebda085
3 changed files with 95 additions and 41 deletions

View File

@@ -12,9 +12,8 @@ from django.utils import timezone
 from paperless.db import GnuPG
 from .classifier import DocumentClassifier
 from .models import Document, FileInfo
-from .parsers import ParseError
+from .parsers import ParseError, get_parser_class
 from .signals import (
-    document_consumer_declaration,
     document_consumption_finished,
     document_consumption_started
 )
@@ -61,15 +60,6 @@ class Consumer:
             raise ConsumerError(
                 "Consumption directory {} does not exist".format(self.consume))
 
-        self.parsers = []
-        for response in document_consumer_declaration.send(self):
-            self.parsers.append(response[1])
-
-        if not self.parsers:
-            raise ConsumerError(
-                "No parsers could be found, not even the default. "
-                "This is a problem."
-            )
 
     def log(self, level, message):
         getattr(self.logger, level)(message, extra={
@@ -82,6 +72,8 @@ class Consumer:
         Return True if file was consumed
         """
 
+        self.logging_group = uuid.uuid4()
+
         if not re.match(FileInfo.REGEXES["title"], file):
             return False
@@ -96,13 +88,13 @@ class Consumer:
         self.log("info", "Consuming {}".format(doc))
 
-        parser_class = self._get_parser_class(doc)
+        parser_class = get_parser_class(doc)
         if not parser_class:
             self.log(
                 "error", "No parsers could be found for {}".format(doc))
             return False
-        else:
-            self.logging_group = uuid.uuid4()
+
+        self.log("info", "Parser: {}".format(parser_class.__name__))
 
         document_consumption_started.send(
@@ -114,6 +106,7 @@ class Consumer:
         document_parser = parser_class(doc, self.logging_group)
 
         try:
+            self.log("info", "Generating thumbnail for {}...".format(doc))
             thumbnail = document_parser.get_optimised_thumbnail()
             date = document_parser.get_date()
             document = self._store(
@@ -154,31 +147,6 @@ class Consumer:
         )
 
         return True
 
-    def _get_parser_class(self, doc):
-        """
-        Determine the appropriate parser class based on the file
-        """
-
-        options = []
-        for parser in self.parsers:
-            result = parser(doc)
-            if result:
-                options.append(result)
-
-        self.log(
-            "info",
-            "Parsers available: {}".format(
-                ", ".join([str(o["parser"].__name__) for o in options])
-            )
-        )
-
-        if not options:
-            return None
-
-        # Return the parser with the highest weight.
-        return sorted(
-            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
-
     def _store(self, text, doc, thumbnail, date):
         file_info = FileInfo.from_path(doc)
@@ -211,10 +179,9 @@ class Consumer:
         self._write(document, doc, document.source_path)
         self._write(document, thumbnail, document.thumbnail_path)
 
+        #TODO: why do we need to save the document again?
         document.save()
-        self.log("debug", "Completed")
 
         return document
 
     def _write(self, document, source, target):

View File

@@ -0,0 +1,60 @@
+import argparse
+import threading
+from multiprocessing import Pool
+from multiprocessing.pool import ThreadPool
+
+from django.core.management.base import BaseCommand
+
+from documents.consumer import Consumer
+from documents.models import Log, Document
+from documents.parsers import get_parser_class
+
+
+def process_document(doc):
+    parser_class = get_parser_class(doc.file_name)
+
+    if not parser_class:
+        print("no parser available")
+    else:
+        print("Parser: {}".format(parser_class.__name__))
+
+        parser = parser_class(doc.source_path, None)
+
+        try:
+            text = parser.get_text()
+            doc.content = text
+            doc.save()
+        finally:
+            parser.cleanup()
+
+
+def document_index(value):
+    ivalue = int(value)
+    if not (1 <= ivalue <= Document.objects.count()):
+        raise argparse.ArgumentTypeError(
+            "{} is not a valid document index (out of range)".format(value))
+    return ivalue
+
+
+class Command(BaseCommand):
+
+    help = "Performs OCR on all documents again!"
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "-s", "--start_index",
+            default=None,
+            type=document_index
+        )
+
+    def handle(self, *args, **options):
+        docs = Document.objects.all().order_by("added")
+
+        indices = range(options['start_index']-1, len(docs)) if options['start_index'] else range(len(docs))
+
+        for i in indices:
+            doc = docs[i]
+            print("==================================")
+            print("{} out of {}: {}".format(i+1, len(docs), doc.file_name))
+            print("==================================")
+            process_document(doc)
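
For context, the new file above is a Django management command, so once deployed it can be run through manage.py or invoked programmatically. The sketch below is illustrative only: the command name "redo_ocr" is a placeholder, because the real name is taken from the new file's name under documents/management/commands/, which this view does not show.

# Illustrative sketch, not part of the commit. Assumes the paperless settings
# module is importable and that the new command file is named redo_ocr.py
# (placeholder -- substitute the actual file name).
import os

import django
from django.core.management import call_command

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
django.setup()

# Redo OCR on every document, resuming at the 100th one (1-based, as enforced
# by the document_index validator above).
call_command("redo_ocr", start_index=100)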

View File

@@ -20,6 +20,8 @@ from django.utils import timezone
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
+from documents.signals import document_consumer_declaration
+
 DATE_REGEX = re.compile(
     r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
     r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
@@ -32,6 +34,31 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger(__name__)
 
+def get_parser_class(doc):
+    """
+    Determine the appropriate parser class based on the file
+    """
+
+    parsers = []
+    for response in document_consumer_declaration.send(None):
+        parsers.append(response[1])
+
+    #TODO: add a check that checks parser availability.
+
+    options = []
+    for parser in parsers:
+        result = parser(doc)
+        if result:
+            options.append(result)
+
+    if not options:
+        return None
+
+    # Return the parser with the highest weight.
+    return sorted(
+        options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
+
+
 def run_convert(input, output, density=None, scale=None, alpha=None, strip=False, trim=False, type=None, depth=None, extra=None, logging_group=None):
     environment = os.environ.copy()
     if settings.CONVERT_MEMORY_LIMIT:
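
As a side note, the weight-based selection performed by get_parser_class is easy to exercise outside Django: every consumer-declaration handler returns either None or a dict with "parser" and "weight" keys, and the option with the highest weight wins. A minimal self-contained sketch follows; the handler and parser names are illustrative stand-ins, not names taken from this commit.

# Stand-alone sketch of the selection logic used by get_parser_class.
class ImageParser:       # illustrative stand-in
    pass


class PlainTextParser:   # illustrative stand-in
    pass


def image_declaration(path):
    # Claims common scan formats with a low weight.
    if path.lower().endswith((".pdf", ".png", ".jpg")):
        return {"parser": ImageParser, "weight": 0}
    return None


def text_declaration(path):
    # Claims plain-text files and outbids the image parser for them.
    if path.lower().endswith((".txt", ".md")):
        return {"parser": PlainTextParser, "weight": 10}
    return None


def pick_parser(path, declarations):
    options = [o for o in (d(path) for d in declarations) if o]
    if not options:
        return None
    # Highest weight wins, exactly as in get_parser_class above.
    return sorted(options, key=lambda o: o["weight"], reverse=True)[0]["parser"]


print(pick_parser("scan.pdf", [image_declaration, text_declaration]))   # ImageParser
print(pick_parser("notes.txt", [image_declaration, text_declaration]))  # PlainTextParser
print(pick_parser("video.mp4", [image_declaration, text_declaration]))  # None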