Reverts the change around skip_noarchive to align with how it is documented to work

2025-12-20 01:45:58 -06:00 · 2022-10-19 11:37:47 -07:00
parent 53e8d84af2
commit d1aa08850d
2 changed files with 44 additions and 7 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -249,16 +249,22 @@ class RasterisedDocumentParser(DocumentParser):

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
-            original_has_text = text_original and len(text_original) > 50
+            original_has_text = text_original is not None and len(text_original) > 50
        else:
            text_original = None
            original_has_text = False

+        # If the original has text, and the user doesn't want an archive,
+        # we're done here
        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return

+        # Either no text was in the original or there should be an archive
+        # file created, so OCR the file and create an archive with any
+        # text located via OCR
+
        import ocrmypdf
        from ocrmypdf import InputFileError, EncryptedPdfError

@@ -276,9 +282,7 @@ class RasterisedDocumentParser(DocumentParser):
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

-            # Only create archive file if archiving isn't being skipped
-            if settings.OCR_MODE != "skip_noarchive":
-                self.archive_path = archive_path
+            self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)