Fixes an archive file being created even when the skip_noarchive OCR mode was specified

2026-02-07 23:42:46 -06:00 · 2022-08-20 13:47:56 -07:00
parent 0878a199f4
commit b3b2519bf0
2 changed files with 37 additions and 2 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -276,7 +276,10 @@ class RasterisedDocumentParser(DocumentParser):
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

-            self.archive_path = archive_path
+            # Only set the archive path if archiving isn't being skipped
+            if settings.OCR_MODE != "skip_noarchive":
+                self.archive_path = archive_path
+
            self.text = self.extract_text(sidecar_file, archive_path)

            if not self.text:
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -364,6 +364,16 @@ class TestParser(DirectoriesMixin, TestCase):

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_withtext(self):
+        """
+        GIVEN:
+            - File with existing text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from the existing text layer is extracted
+            - No archive file is created
+        """
        parser = RasterisedDocumentParser(None)
        parser.parse(
            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
@@ -377,17 +387,29 @@ class TestParser(DirectoriesMixin, TestCase):

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_notext(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
        parser = RasterisedDocumentParser(None)
        parser.parse(
            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3"],
        )

+        self.assertIsNone(parser.archive_path)
+
    @override_settings(OCR_MODE="skip")
    def test_multi_page_mixed(self):
        parser = RasterisedDocumentParser(None)
@@ -408,6 +430,16 @@ class TestParser(DirectoriesMixin, TestCase):

    @override_settings(OCR_MODE="skip_noarchive")
    def test_multi_page_mixed_no_archive(self):
+        """
+        GIVEN:
+            - File with some text contained in images and some in text layer
+            - OCR mode set to skip_noarchive
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - No archive file is created
+        """
        parser = RasterisedDocumentParser(None)
        parser.parse(
            os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),