From ca412e018462fb4e327e94df19d80f4d965ad64d Mon Sep 17 00:00:00 2001 From: Brandon Rothweiler Date: Thu, 23 Feb 2023 22:42:57 -0500 Subject: [PATCH] Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting --- docs/configuration.md | 19 ++- docs/setup.md | 7 +- docs/usage.md | 4 +- paperless.conf.example | 1 + src/paperless/checks.py | 17 +++ src/paperless/settings.py | 2 + src/paperless_tesseract/parsers.py | 9 +- src/paperless_tesseract/tests/test_parser.py | 140 ++++++++++++++++++- 8 files changed, 185 insertions(+), 14 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 6c233c2e6..d3b391f1a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -415,12 +415,6 @@ modes are available: - `skip`: Paperless skips all pages and will perform ocr only on pages where no text is present. This is the safest option. - - `skip_noarchive`: In addition to skip, paperless won't create - an archived version of your documents when it finds any text in - them. This is useful if you don't want to have two - almost-identical versions of your digital documents in the media - folder. This is the fastest option. - - `redo`: Paperless will OCR all pages of your documents and attempt to replace any existing text layers with new text. This will be useful for documents from scanners that already @@ -443,6 +437,19 @@ modes are available: Read more about this in the [OCRmyPDF documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped). +`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=` + +: Specify when you would like paperless to skip creating an archived +version of your documents. This is useful if you don't want to have two +almost-identical versions of your documents in the media folder. + + - `never`: Never skip creating an archived version. + - `with_text`: Skip creating an archived version for documents + that already have embedded text. + - `always`: Always skip creating an archived version. + + The default is `never`. 
+ `PAPERLESS_OCR_CLEAN=` : Tells paperless to use `unpaper` to clean any input document before diff --git a/docs/setup.md b/docs/setup.md index 7eaaf69f7..425448ff6 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -818,9 +818,10 @@ performance immensely: other tasks). - Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider OCR'ing your documents before feeding them into paperless. Some - scanners are able to do this! You might want to even specify - `skip_noarchive` to skip archive file generation for already ocr'ed - documents entirely. + scanners are able to do this! +- Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive + file generation for already ocr'ed documents, or `always` to skip it + for all documents. - If you want to perform OCR on the device, consider using `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use less memory at the expense of slightly worse OCR results. diff --git a/docs/usage.md b/docs/usage.md index e162e6e3a..14adef26b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -60,8 +60,8 @@ following operations on your documents: This process can be configured to fit your needs. If you don't want paperless to create archived versions for digital documents, you can - configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`. - Please read the + do so by setting + `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the [relevant section in the documentation](/configuration#ocr). !!! 
note diff --git a/paperless.conf.example b/paperless.conf.example index 524d9264e..6bd70697e 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -42,6 +42,7 @@ #PAPERLESS_OCR_LANGUAGE=eng #PAPERLESS_OCR_MODE=skip +#PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never #PAPERLESS_OCR_OUTPUT_TYPE=pdfa #PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_IMAGE_DPI=300 diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 845ff2d0b..8988798a0 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -130,6 +130,23 @@ def settings_values_check(app_configs, **kwargs): if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) + if settings.OCR_MODE == "skip_noarchive": + msgs.append( + Warning( + 'OCR output mode "skip_noarchive" is deprecated and will be ' + "removed in a future version. Please use " + "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.", + ), + ) + + if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}: + msgs.append( + Error( + "OCR_SKIP_ARCHIVE_FILE setting " + f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid', + ), + ) + if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid')) return msgs diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 41f08f3e2..44e843a9c 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") # skip. 
redo, force OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") +OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never") + OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 4227583f8..bbb25feb9 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser): # If the original has text, and the user doesn't want an archive, # we're done here - if settings.OCR_MODE == "skip_noarchive" and original_has_text: + skip_archive_for_text = ( + settings.OCR_MODE == "skip_noarchive" + or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"] + ) + if skip_archive_for_text and original_has_text: self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.text = text_original return @@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser): self.log("debug", f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - self.archive_path = archive_path + if settings.OCR_SKIP_ARCHIVE_FILE != "always": + self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 94b72a0ee..5cbbc4d55 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ["page 1", "page 2", "page 3"], ) - @override_settings(OOCR_MODE="skip") + @override_settings(OCR_MODE="skip") def test_multi_page_analog_pages_skip(self): parser = RasterisedDocumentParser(None) parser.parse( @@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertIsNotNone(parser.archive_path) + 
@override_settings(OCR_SKIP_ARCHIVE_FILE="never") + def test_skip_archive_never_withtext(self): + """ + GIVEN: + - File with existing text layer + - OCR_SKIP_ARCHIVE_FILE set to never + WHEN: + - Document is parsed + THEN: + - Text from text layer is extracted + - Archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="never") + def test_skip_archive_never_withimages(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR_SKIP_ARCHIVE_FILE set to never + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - Archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text") + def test_skip_archive_withtext_withtext(self): + """ + GIVEN: + - File with existing text layer + - OCR_SKIP_ARCHIVE_FILE set to with_text + WHEN: + - Document is parsed + THEN: + - Text from text layer is extracted + - No archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), + "application/pdf", + ) + self.assertIsNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text") + def test_skip_archive_withtext_withimages(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR_SKIP_ARCHIVE_FILE set to 
with_text + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - Archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="always") + def test_skip_archive_always_withtext(self): + """ + GIVEN: + - File with existing text layer + - OCR_SKIP_ARCHIVE_FILE set to always + WHEN: + - Document is parsed + THEN: + - Text from text layer is extracted + - No archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), + "application/pdf", + ) + self.assertIsNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + + @override_settings(OCR_SKIP_ARCHIVE_FILE="always") + def test_skip_archive_always_withimages(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR_SKIP_ARCHIVE_FILE set to always + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - No archive file is created + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), + "application/pdf", + ) + self.assertIsNone(parser.archive_path) + self.assertContainsStrings( + parser.get_text().lower(), + ["page 1", "page 2", "page 3"], + ) + @override_settings(OCR_MODE="skip") def test_multi_page_mixed(self): """