From d1aa08850d066952194bee59042f11edb93e348c Mon Sep 17 00:00:00 2001 From: Trenton Holmes Date: Wed, 19 Oct 2022 11:37:47 -0700 Subject: [PATCH] Reverts the change around skip_noarchive to align with how it is documented to work --- src/paperless_tesseract/parsers.py | 12 ++++-- src/paperless_tesseract/tests/test_parser.py | 39 ++++++++++++++++++-- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index abb3d3dfe..405df07ce 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -249,16 +249,22 @@ class RasterisedDocumentParser(DocumentParser): if mime_type == "application/pdf": text_original = self.extract_text(None, document_path) - original_has_text = text_original and len(text_original) > 50 + original_has_text = text_original is not None and len(text_original) > 50 else: text_original = None original_has_text = False + # If the original has text, and the user doesn't want an archive, + # we're done here if settings.OCR_MODE == "skip_noarchive" and original_has_text: self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.text = text_original return + # Either no text was in the original or there should be an archive + # file created, so OCR the file and create an archive with any + # text located via OCR + import ocrmypdf from ocrmypdf import InputFileError, EncryptedPdfError @@ -276,9 +282,7 @@ class RasterisedDocumentParser(DocumentParser): self.log("debug", f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - # Only create archive file if archiving isn't being skipped - if settings.OCR_MODE != "skip_noarchive": - self.archive_path = archive_path + self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 700782a92..858cc7701 100644 --- 
a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -341,6 +341,17 @@ class TestParser(DirectoriesMixin, TestCase): @override_settings(OCR_PAGES=2, OCR_MODE="redo") def test_multi_page_analog_pages_redo(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR of only pages 1 and 2 requested + - OCR mode set to redo + WHEN: + - Document is parsed + THEN: + - Text of page 1 and 2 extracted + - An archive file is created + """ parser = RasterisedDocumentParser(None) parser.parse( os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), @@ -352,6 +363,17 @@ class TestParser(DirectoriesMixin, TestCase): @override_settings(OCR_PAGES=1, OCR_MODE="force") def test_multi_page_analog_pages_force(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR of only page 1 requested + - OCR mode set to force + WHEN: + - Document is parsed + THEN: + - Only text of page 1 is extracted + - An archive file is created + """ parser = RasterisedDocumentParser(None) parser.parse( os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), @@ -395,7 +417,7 @@ class TestParser(DirectoriesMixin, TestCase): - Document is parsed THEN: - Text from images is extracted - - No archive file is created + - An archive file is created with the OCRd text """ parser = RasterisedDocumentParser(None) parser.parse( @@ -408,15 +430,26 @@ class TestParser(DirectoriesMixin, TestCase): ["page 1", "page 2", "page 3"], ) - self.assertIsNone(parser.archive_path) + self.assertIsNotNone(parser.archive_path) @override_settings(OCR_MODE="skip") def test_multi_page_mixed(self): + """ + GIVEN: + - File with some text contained in images and some in text layer + - OCR mode set to skip + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - An archive file is created with the OCRd text and the original text + """ parser = RasterisedDocumentParser(None) parser.parse( 
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), "application/pdf", ) + self.assertIsNotNone(parser.archive_path) self.assertTrue(os.path.isfile(parser.archive_path)) self.assertContainsStrings( parser.get_text().lower(), @@ -438,7 +471,7 @@ class TestParser(DirectoriesMixin, TestCase): - Document is parsed THEN: - Text from images is extracted - - No archive file is created + - No archive file is created as original file contains text """ parser = RasterisedDocumentParser(None) parser.parse(