diff --git a/docs/configuration.md b/docs/configuration.md index 5cf0022f3..d3b391f1a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -415,15 +415,6 @@ modes are available: - `skip`: Paperless skips all pages and will perform ocr only on pages where no text is present. This is the safest option. - - `skip_noarchive`: In addition to skip, paperless won't create - an archived version of your documents when it finds any text in - them. This is useful if you don't want to have two - almost-identical versions of your digital documents in the media - folder. - - - `skip_neverarchive`: In addition to skip, paperless will never - create an archive version of your documents. This is the fastest option. - - `redo`: Paperless will OCR all pages of your documents and attempt to replace any existing text layers with new text. This will be useful for documents from scanners that already @@ -446,6 +437,19 @@ modes are available: Read more about this in the [OCRmyPDF documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped). +`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=` + +: Specify when you would like paperless to skip creating an archived +version of your documents. This is useful if you don't want to have two +almost-identical versions of your documents in the media folder. + + - `never`: Never skip creating an archived version. + - `with_text`: Skip creating an archived version for documents + that already have embedded text. + - `always`: Always skip creating an archived version. + + The default is `never`. + `PAPERLESS_OCR_CLEAN=` : Tells paperless to use `unpaper` to clean any input document before diff --git a/docs/setup.md b/docs/setup.md index 7eaaf69f7..425448ff6 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -818,9 +818,10 @@ performance immensely: other tasks). - Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider OCR'ing your documents before feeding them into paperless. Some - scanners are able to do this! 
You might want to even specify - `skip_noarchive` to skip archive file generation for already ocr'ed - documents entirely. + scanners are able to do this! +- Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive + file generation for already OCR'ed documents, or `always` to skip it + for all documents. - If you want to perform OCR on the device, consider using `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use less memory at the expense of slightly worse OCR results. diff --git a/docs/usage.md b/docs/usage.md index e162e6e3a..14adef26b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -60,8 +60,8 @@ following operations on your documents: This process can be configured to fit your needs. If you don't want paperless to create archived versions for digital documents, you can - configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`. - Please read the + configure that by setting + `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the [relevant section in the documentation](/configuration#ocr). !!! 
note diff --git a/paperless.conf.example b/paperless.conf.example index 524d9264e..6bd70697e 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -42,6 +42,7 @@ #PAPERLESS_OCR_LANGUAGE=eng #PAPERLESS_OCR_MODE=skip +#PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never #PAPERLESS_OCR_OUTPUT_TYPE=pdfa #PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_IMAGE_DPI=300 diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 53972bc21..658ec9d31 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -127,15 +127,26 @@ def settings_values_check(app_configs, **kwargs): Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), ) - if settings.OCR_MODE not in { - "force", - "skip", - "redo", - "skip_noarchive", - "skip_neverarchive", - }: + if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) + if settings.OCR_MODE == "skip_noarchive": + msgs.append( + Warning( + 'OCR output mode "skip_noarchive" is deprecated and will be ' + "removed in a future version. Please use " + "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.", + ), + ) + + if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}: + msgs.append( + Error( + "OCR_SKIP_ARCHIVE_FILE setting " + f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid', + ), + ) + if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid')) return msgs diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 41f08f3e2..44e843a9c 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") # skip. 
redo, force OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") +OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never") + OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") diff --git a/src/paperless/tests/test_checks.py b/src/paperless/tests/test_checks.py index b2d8b5810..3740d2f8a 100644 --- a/src/paperless/tests/test_checks.py +++ b/src/paperless/tests/test_checks.py @@ -105,6 +105,42 @@ class TestSettingsChecks(DirectoriesMixin, TestCase): self.assertIn('OCR output mode "makeitso"', msg.msg) + @override_settings(OCR_MODE="skip_noarchive") + def test_deprecated_ocr_type(self): + """ + GIVEN: + - Default settings + - OCR type is deprecated + WHEN: + - Settings are validated + THEN: + - deprecation warning reported for OCR type + """ + msgs = settings_values_check(None) + self.assertEqual(len(msgs), 1) + + msg = msgs[0] + + self.assertIn("deprecated", msg.msg) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="invalid") + def test_invalid_ocr_skip_archive_file(self): + """ + GIVEN: + - Default settings + - OCR_SKIP_ARCHIVE_FILE is invalid + WHEN: + - Settings are validated + THEN: + - system check error reported for OCR_SKIP_ARCHIVE_FILE + """ + msgs = settings_values_check(None) + self.assertEqual(len(msgs), 1) + + msg = msgs[0] + + self.assertIn('OCR_SKIP_ARCHIVE_FILE setting "invalid"', msg.msg) + @override_settings(OCR_CLEAN="cleanme") def test_invalid_ocr_clean(self): """ diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 3a91e3390..bbb25feb9 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser): if settings.OCR_MODE == "force" or safe_fallback: ocrmypdf_args["force_ocr"] = True - elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]: + elif settings.OCR_MODE in ["skip", "skip_noarchive"]: ocrmypdf_args["skip_text"] = True elif 
settings.OCR_MODE == "redo": ocrmypdf_args["redo_ocr"] = True @@ -294,10 +294,11 @@ class RasterisedDocumentParser(DocumentParser): # If the original has text, and the user doesn't want an archive, # we're done here - if ( - settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"] - and original_has_text - ): + skip_archive_for_text = ( + settings.OCR_MODE == "skip_noarchive" + or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"] + ) + if skip_archive_for_text and original_has_text: self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.text = text_original return @@ -323,8 +324,7 @@ class RasterisedDocumentParser(DocumentParser): self.log("debug", f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - # Only create archive file if archiving isn't being skipped - if settings.OCR_MODE != "skip_neverarchive": + if settings.OCR_SKIP_ARCHIVE_FILE != "always": self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index de0c3ce38..5cbbc4d55 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ["page 1", "page 2", "page 3"], ) - @override_settings(OOCR_MODE="skip") + @override_settings(OCR_MODE="skip") def test_multi_page_analog_pages_skip(self): parser = RasterisedDocumentParser(None) parser.parse( @@ -438,16 +438,62 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertIsNotNone(parser.archive_path) - @override_settings(OCR_MODE="skip_neverarchive") - def test_skip_neverarchive_withtext(self): + @override_settings(OCR_SKIP_ARCHIVE_FILE="never") + def test_skip_archive_never_withtext(self): """ GIVEN: - File with existing text layer - - OCR mode set to skip_neverarchive + - OCR_SKIP_ARCHIVE_FILE set to never + WHEN: + - 
Document is parsed + THEN: + - Text from text layer is extracted + - Archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="never") + def test_skip_archive_never_withimages(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR_SKIP_ARCHIVE_FILE set to never WHEN: - Document is parsed THEN: - Text from images is extracted + - Archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text") + def test_skip_archive_withtext_withtext(self): + """ + GIVEN: + - File with existing text layer + - OCR_SKIP_ARCHIVE_FILE set to with_text + WHEN: + - Document is parsed + THEN: + - Text from text layer is extracted - No archive file is created """ parser = RasterisedDocumentParser(None) @@ -461,12 +507,58 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ["page 1", "page 2", "page 3"], ) - @override_settings(OCR_MODE="skip_neverarchive") - def test_skip_neverarchive_notext(self): + @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text") + def test_skip_archive_withtext_withimages(self): """ GIVEN: - File with text contained in images but no text layer - - OCR mode set to skip_neverarchive + - OCR_SKIP_ARCHIVE_FILE set to with_text + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - Archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + 
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="always") + def test_skip_archive_always_withtext(self): + """ + GIVEN: + - File with existing text layer + - OCR_SKIP_ARCHIVE_FILE set to always + WHEN: + - Document is parsed + THEN: + - Text from text layer is extracted + - No archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), + "application/pdf", + ) + self.assertIsNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="always") + def test_skip_archive_always_withimages(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR_SKIP_ARCHIVE_FILE set to always WHEN: - Document is parsed THEN: