Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

2026-01-26 22:49:01 -06:00 · 2023-02-23 22:42:57 -05:00
parent 8a89f5ae27
commit ca412e0184
8 changed files with 185 additions and 14 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):

        # If the original has text, and the user doesn't want an archive,
        # we're done here
-        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
+        skip_archive_for_text = (
+            settings.OCR_MODE == "skip_noarchive"
+            or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
+        )
+        if skip_archive_for_text and original_has_text:
            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return
@@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

-            self.archive_path = archive_path
+            if settings.OCR_SKIP_ARCHIVE_FILE != "always":
+                self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)

--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
            ["page 1", "page 2", "page 3"],
        )

-    @override_settings(OOCR_MODE="skip")
+    @override_settings(OCR_MODE="skip")
    def test_multi_page_analog_pages_skip(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(
@@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        self.assertIsNotNone(parser.archive_path)

+    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+    def test_skip_archive_never_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to never
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
+    def test_skip_archive_never_withimages(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to never
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+    def test_skip_archive_withtext_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to with_text
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
+    def test_skip_archive_withtext_withimages(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to with_text
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - Archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNotNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+    def test_skip_archive_always_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR_SKIP_ARCHIVE_FILE set to always
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from text layer is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
+    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
+    def test_skip_archive_always_withimages(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR_SKIP_ARCHIVE_FILE set to always
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
+        parser = RasterisedDocumentParser(None)
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
+            "application/pdf",
+        )
+        self.assertIsNone(parser.archive_path)
+        self.assertContainsStrings(
+            parser.get_text().lower(),
+            ["page 1", "page 2", "page 3"],
+        )
+
    @override_settings(OCR_MODE="skip")
    def test_multi_page_mixed(self):
        """