diff --git a/docs/configuration.md b/docs/configuration.md index 6c233c2e6..5cf0022f3 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -419,7 +419,10 @@ modes are available: an archived version of your documents when it finds any text in them. This is useful if you don't want to have two almost-identical versions of your digital documents in the media - folder. This is the fastest option. + folder. + + - `skip_neverarchive`: In addition to skip, paperless will never + create an archive version of your documents. This is the fastest option. - `redo`: Paperless will OCR all pages of your documents and attempt to replace any existing text layers with new text. This diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 845ff2d0b..53972bc21 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -127,7 +127,13 @@ def settings_values_check(app_configs, **kwargs): Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), ) - if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: + if settings.OCR_MODE not in { + "force", + "skip", + "redo", + "skip_noarchive", + "skip_neverarchive", + }: msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 4227583f8..3a91e3390 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser): if settings.OCR_MODE == "force" or safe_fallback: ocrmypdf_args["force_ocr"] = True - elif settings.OCR_MODE in ["skip", "skip_noarchive"]: + elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]: ocrmypdf_args["skip_text"] = True elif settings.OCR_MODE == "redo": ocrmypdf_args["redo_ocr"] = True @@ -294,7 +294,10 @@ class RasterisedDocumentParser(DocumentParser): # If the original has text, and 
the user doesn't want an archive, # we're done here - if settings.OCR_MODE == "skip_noarchive" and original_has_text: + if ( + settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"] + and original_has_text + ): self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.text = text_original return @@ -320,7 +323,9 @@ class RasterisedDocumentParser(DocumentParser): self.log("debug", f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - self.archive_path = archive_path + # Only create archive file if archiving isn't being skipped + if settings.OCR_MODE != "skip_neverarchive": + self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 94b72a0ee..de0c3ce38 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -438,6 +438,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertIsNotNone(parser.archive_path) + @override_settings(OCR_MODE="skip_neverarchive") + def test_skip_neverarchive_withtext(self): + """ + GIVEN: + - File with existing text layer + - OCR mode set to skip_neverarchive + WHEN: + - Document is parsed + THEN: + - Text from the existing text layer is extracted + - No archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), + "application/pdf", + ) + self.assertIsNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_MODE="skip_neverarchive") + def test_skip_neverarchive_notext(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR mode set to skip_neverarchive + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - No archive file is created + """ + parser = 
RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), + "application/pdf", + ) + self.assertIsNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + @override_settings(OCR_MODE="skip") def test_multi_page_mixed(self): """