Don't use the sidecar file when redoing the OCR, it only contains new text

2025-12-24 02:05:48 -06:00 · 2022-11-21 14:45:20 -08:00
parent 54f20b381e
commit b897d6de2e
1 changed files with 12 additions and 4 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -95,7 +95,13 @@ class RasterisedDocumentParser(DocumentParser):
            return None
    def extract_text(self, sidecar_file, pdf_file):
-        if sidecar_file and os.path.isfile(sidecar_file):
+        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
            sidecar_file is not None
            and os.path.isfile(sidecar_file)
            and settings.OCR_MODE != "redo"
        ):
            with open(sidecar_file) as f:
                text = f.read()
@@ -142,7 +148,7 @@ class RasterisedDocumentParser(DocumentParser):
            "input_file": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
-            # processes by django-q.
+            # processes via the task library.
            "use_threads": True,
            "jobs": settings.THREADS_PER_WORKER,
            "language": settings.OCR_LANGUAGE,
@@ -165,9 +171,11 @@ class RasterisedDocumentParser(DocumentParser):
            if settings.OCR_MODE == "redo":
                ocrmypdf_args["clean"] = True
            else:
                # --clean-final is not compatible with --redo-ocr
                ocrmypdf_args["clean_final"] = True
-        if settings.OCR_DESKEW and not settings.OCR_MODE == "redo":
+        if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
            # --deskew is not compatible with --redo-ocr
            ocrmypdf_args["deskew"] = True
        if settings.OCR_ROTATE_PAGES:
@@ -263,7 +271,7 @@ class RasterisedDocumentParser(DocumentParser):
        # Either no text was in the original or there should be an archive
        # file created, so OCR the file and create an archive with any
-        # test located via OCR
+        # text located via OCR
        import ocrmypdf
        from ocrmypdf import InputFileError, EncryptedPdfError