Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

2025-07-24 18:04:39 -05:00 · 2023-02-23 22:42:57 -05:00 · 2023-02-23 22:42:57 -05:00 · ca412e0184
commit ca412e0184
parent 8a89f5ae27
8 changed files with 185 additions and 14 deletions
--- a/docs/configuration.md
+++ b/docs/configuration.md
@ -415,12 +415,6 @@ modes are available:
    -   `skip`: Paperless skips all pages that already contain text and
        will perform OCR only on pages where no text is present. This is
        the safest option.
    -   `skip_noarchive`: In addition to skip, paperless won't create
        an archived version of your documents when it finds any text in
        them. This is useful if you don't want to have two
        almost-identical versions of your digital documents in the media
        folder. This is the fastest option.
    -   `redo`: Paperless will OCR all pages of your documents and
        attempt to replace any existing text layers with new text. This
        will be useful for documents from scanners that already
@ -443,6 +437,19 @@ modes are available:
    Read more about this in the [OCRmyPDF
    documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
 `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`
 : Specify when you would like paperless to skip creating an archived
 version of your documents. This is useful if you don't want to have two
 almost-identical versions of your documents in the media folder.
    -   `never`: Never skip creating an archived version.
    -   `with_text`: Skip creating an archived version for documents
    that already have embedded text.
    -   `always`: Always skip creating an archived version.
    The default is `never`.
 `PAPERLESS_OCR_CLEAN=<mode>`
 : Tells paperless to use `unpaper` to clean any input document before
--- a/docs/setup.md
+++ b/docs/setup.md
@ -818,9 +818,10 @@ performance immensely:
  other tasks).
 - Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider
  OCR'ing your documents before feeding them into paperless. Some
-  scanners are able to do this! You might want to even specify
+  scanners are able to do this!
-  `skip_noarchive` to skip archive file generation for already ocr'ed
+- Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive
-  documents entirely.
+  file generation for already ocr'ed documents, or `always` to skip it
  for all documents.
 - If you want to perform OCR on the device, consider using
  `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
  less memory at the expense of slightly worse OCR results.
--- a/docs/usage.md
+++ b/docs/usage.md
@ -60,8 +60,8 @@ following operations on your documents:
    This process can be configured to fit your needs. If you don't want
    paperless to create archived versions for digital documents, you can
-    configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`.
+    disable that by setting
-    Please read the
+    `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
    [relevant section in the documentation](/configuration#ocr).
 !!! note
--- a/paperless.conf.example
+++ b/paperless.conf.example
@ -42,6 +42,7 @@
 #PAPERLESS_OCR_LANGUAGE=eng
 #PAPERLESS_OCR_MODE=skip
 #PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never
 #PAPERLESS_OCR_OUTPUT_TYPE=pdfa
 #PAPERLESS_OCR_PAGES=1
 #PAPERLESS_OCR_IMAGE_DPI=300
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@ -130,6 +130,23 @@ def settings_values_check(app_configs, **kwargs):
        if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
            msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
        if settings.OCR_MODE == "skip_noarchive":
            # Adjacent string literals are joined verbatim, so every fragment
            # but the last must end with a space to keep the words separated.
            msgs.append(
                Warning(
                    'OCR output mode "skip_noarchive" is deprecated and will be '
                    "removed in a future version. Please use "
                    "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
                ),
            )
        if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
            msgs.append(
                Error(
                    "OCR_SKIP_ARCHIVE_FILE setting "
                    f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
                ),
            )
        if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
            msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
        return msgs
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
 # skip, redo, force
 OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
 OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
 OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
 OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
        # If the original has text, and the user doesn't want an archive,
        # we're done here
-        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
+        skip_archive_for_text = (
            settings.OCR_MODE == "skip_noarchive"
            or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
        )
        if skip_archive_for_text and original_has_text:
            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return
@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)
-            self.archive_path = archive_path
+            if settings.OCR_SKIP_ARCHIVE_FILE != "always":
                self.archive_path = archive_path
            self.text = self.extract_text(sidecar_file, archive_path)
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
            ["page 1", "page 2", "page 3"],
        )
-    @override_settings(OOCR_MODE="skip")
+    @override_settings(OCR_MODE="skip")
    def test_multi_page_analog_pages_skip(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(
@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.assertIsNotNone(parser.archive_path)
    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
    def test_skip_archive_never_withtext(self):
        """
        GIVEN:
            - A PDF that already carries a text layer
            - OCR_SKIP_ARCHIVE_FILE is "never"
        WHEN:
            - The parser processes the document
        THEN:
            - The content of the text layer is returned
            - An archive file is produced
        """
        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
        expected_pages = ["page 1", "page 2", "page 3"]

        parser = RasterisedDocumentParser(None)
        parser.parse(sample, "application/pdf")

        self.assertIsNotNone(parser.archive_path)
        extracted = parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
    @override_settings(OCR_SKIP_ARCHIVE_FILE="never")
    def test_skip_archive_never_withimages(self):
        """
        GIVEN:
            - A PDF whose text exists only inside images (no text layer)
            - OCR_SKIP_ARCHIVE_FILE is "never"
        WHEN:
            - The parser processes the document
        THEN:
            - The text found in the images is returned
            - An archive file is produced
        """
        sample = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
        expected_pages = ["page 1", "page 2", "page 3"]

        parser = RasterisedDocumentParser(None)
        parser.parse(sample, "application/pdf")

        self.assertIsNotNone(parser.archive_path)
        extracted = parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
    def test_skip_archive_withtext_withtext(self):
        """
        GIVEN:
            - A PDF that already carries a text layer
            - OCR_SKIP_ARCHIVE_FILE is "with_text"
        WHEN:
            - The parser processes the document
        THEN:
            - The content of the text layer is returned
            - No archive file is produced
        """
        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
        expected_pages = ["page 1", "page 2", "page 3"]

        parser = RasterisedDocumentParser(None)
        parser.parse(sample, "application/pdf")

        self.assertIsNone(parser.archive_path)
        extracted = parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
    @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
    def test_skip_archive_withtext_withimages(self):
        """
        GIVEN:
            - A PDF whose text exists only inside images (no text layer)
            - OCR_SKIP_ARCHIVE_FILE is "with_text"
        WHEN:
            - The parser processes the document
        THEN:
            - The text found in the images is returned
            - An archive file is produced
        """
        sample = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
        expected_pages = ["page 1", "page 2", "page 3"]

        parser = RasterisedDocumentParser(None)
        parser.parse(sample, "application/pdf")

        self.assertIsNotNone(parser.archive_path)
        extracted = parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
    def test_skip_archive_always_withtext(self):
        """
        GIVEN:
            - A PDF that already carries a text layer
            - OCR_SKIP_ARCHIVE_FILE is "always"
        WHEN:
            - The parser processes the document
        THEN:
            - The content of the text layer is returned
            - No archive file is produced
        """
        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
        expected_pages = ["page 1", "page 2", "page 3"]

        parser = RasterisedDocumentParser(None)
        parser.parse(sample, "application/pdf")

        self.assertIsNone(parser.archive_path)
        extracted = parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
    @override_settings(OCR_SKIP_ARCHIVE_FILE="always")
    def test_skip_archive_always_withimages(self):
        """
        GIVEN:
            - A PDF whose text exists only inside images (no text layer)
            - OCR_SKIP_ARCHIVE_FILE is "always"
        WHEN:
            - The parser processes the document
        THEN:
            - The text found in the images is returned
            - No archive file is produced
        """
        sample = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
        expected_pages = ["page 1", "page 2", "page 3"]

        parser = RasterisedDocumentParser(None)
        parser.parse(sample, "application/pdf")

        self.assertIsNone(parser.archive_path)
        extracted = parser.get_text().lower()
        self.assertContainsStrings(extracted, expected_pages)
    @override_settings(OCR_MODE="skip")
    def test_multi_page_mixed(self):
        """