From d1aa08850d066952194bee59042f11edb93e348c Mon Sep 17 00:00:00 2001
From: Trenton Holmes <holmes.trenton@gmail.com>
Date: Wed, 19 Oct 2022 11:37:47 -0700
Subject: [PATCH] Reverts the change around skip_noarchive to align with how it
 is documented to work

---
 src/paperless_tesseract/parsers.py           | 12 ++++--
 src/paperless_tesseract/tests/test_parser.py | 39 ++++++++++++++++++--
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index abb3d3dfe..405df07ce 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -249,16 +249,22 @@ class RasterisedDocumentParser(DocumentParser):
 
         if mime_type == "application/pdf":
             text_original = self.extract_text(None, document_path)
-            original_has_text = text_original and len(text_original) > 50
+            original_has_text = text_original is not None and len(text_original) > 50
         else:
             text_original = None
             original_has_text = False
 
+        # If the original has text, and the user doesn't want an archive,
+        # we're done here
         if settings.OCR_MODE == "skip_noarchive" and original_has_text:
             self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
             self.text = text_original
             return
 
+        # Either no text was in the original or there should be an archive
+        # file created, so OCR the file and create an archive with any
+        # text located via OCR
+
         import ocrmypdf
         from ocrmypdf import InputFileError, EncryptedPdfError
 
@@ -276,9 +282,7 @@ class RasterisedDocumentParser(DocumentParser):
             self.log("debug", f"Calling OCRmyPDF with args: {args}")
             ocrmypdf.ocr(**args)
 
-            # Only create archive file if archiving isn't being skipped
-            if settings.OCR_MODE != "skip_noarchive":
-                self.archive_path = archive_path
+            self.archive_path = archive_path
 
             self.text = self.extract_text(sidecar_file, archive_path)
 
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py
index 700782a92..858cc7701 100644
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -341,6 +341,17 @@ class TestParser(DirectoriesMixin, TestCase):
 
     @override_settings(OCR_PAGES=2, OCR_MODE="redo")
     def test_multi_page_analog_pages_redo(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR of only pages 1 and 2 requested
+            - OCR mode set to redo
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text of page 1 and 2 extracted
+            - An archive file is created
+        """
         parser = RasterisedDocumentParser(None)
         parser.parse(
             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
@@ -352,6 +363,17 @@ class TestParser(DirectoriesMixin, TestCase):
 
     @override_settings(OCR_PAGES=1, OCR_MODE="force")
     def test_multi_page_analog_pages_force(self):
+        """
+        GIVEN:
+            - File with text contained in images but no text layer
+            - OCR of only page 1 requested
+            - OCR mode set to force
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Only text of page 1 is extracted
+            - An archive file is created
+        """
         parser = RasterisedDocumentParser(None)
         parser.parse(
             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
@@ -395,7 +417,7 @@ class TestParser(DirectoriesMixin, TestCase):
             - Document is parsed
         THEN:
             - Text from images is extracted
-            - No archive file is created
+            - An archive file is created with the OCRd text
         """
         parser = RasterisedDocumentParser(None)
         parser.parse(
@@ -408,15 +430,26 @@ class TestParser(DirectoriesMixin, TestCase):
             ["page 1", "page 2", "page 3"],
         )
 
-        self.assertIsNone(parser.archive_path)
+        self.assertIsNotNone(parser.archive_path)
 
     @override_settings(OCR_MODE="skip")
     def test_multi_page_mixed(self):
+        """
+        GIVEN:
+            - File with some text contained in images and some in text layer
+            - OCR mode set to skip
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from images is extracted
+            - An archive file is created with the OCRd text and the original text
+        """
         parser = RasterisedDocumentParser(None)
         parser.parse(
             os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
             "application/pdf",
         )
+        self.assertIsNotNone(parser.archive_path)
         self.assertTrue(os.path.isfile(parser.archive_path))
         self.assertContainsStrings(
             parser.get_text().lower(),
@@ -438,7 +471,7 @@ class TestParser(DirectoriesMixin, TestCase):
             - Document is parsed
         THEN:
             - Text from images is extracted
-            - No archive file is created
+            - No archive file is created as original file contains text
         """
         parser = RasterisedDocumentParser(None)
         parser.parse(