diff --git a/docs/configuration.md b/docs/configuration.md index 5cf0022f3..6c233c2e6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -419,10 +419,7 @@ modes are available: an archived version of your documents when it finds any text in them. This is useful if you don't want to have two almost-identical versions of your digital documents in the media - folder. - - - `skip_neverarchive`: In addition to skip, paperless will never - create an archive version of your documents. This is the fastest option. + folder. This is the fastest option. - `redo`: Paperless will OCR all pages of your documents and attempt to replace any existing text layers with new text. This diff --git a/src/paperless/checks.py b/src/paperless/checks.py index 53972bc21..845ff2d0b 100644 --- a/src/paperless/checks.py +++ b/src/paperless/checks.py @@ -127,13 +127,7 @@ def settings_values_check(app_configs, **kwargs): Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), ) - if settings.OCR_MODE not in { - "force", - "skip", - "redo", - "skip_noarchive", - "skip_neverarchive", - }: + if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 3a91e3390..4227583f8 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser): if settings.OCR_MODE == "force" or safe_fallback: ocrmypdf_args["force_ocr"] = True - elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]: + elif settings.OCR_MODE in ["skip", "skip_noarchive"]: ocrmypdf_args["skip_text"] = True elif settings.OCR_MODE == "redo": ocrmypdf_args["redo_ocr"] = True @@ -294,10 +294,7 @@ class RasterisedDocumentParser(DocumentParser): # If the original has text, and the user doesn't want an archive, # we're done here - if ( - settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"] - and original_has_text - ): + if settings.OCR_MODE == "skip_noarchive" and original_has_text: self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.text = text_original return @@ -323,9 +320,7 @@ class RasterisedDocumentParser(DocumentParser): self.log("debug", f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - # Only create archive file if archiving isn't being skipped - if settings.OCR_MODE != "skip_neverarchive": - self.archive_path = archive_path + self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index de0c3ce38..94b72a0ee 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -438,52 +438,6 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertIsNotNone(parser.archive_path) - @override_settings(OCR_MODE="skip_neverarchive") - def test_skip_neverarchive_withtext(self): - """ - GIVEN: - - File with existing text layer - - OCR mode set to skip_neverarchive - WHEN: - - Document is parsed - THEN: - - Text from images is extracted - - No archive file is created - """ - parser = RasterisedDocumentParser(None) - parser.parse( - os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), - "application/pdf", - ) - self.assertIsNone(parser.archive_path) - self.assertContainsStrings( - parser.get_text().lower(), - ["page 1", "page 2", "page 3"], - ) - - @override_settings(OCR_MODE="skip_neverarchive") - def test_skip_neverarchive_notext(self): - """ - GIVEN: - - File with text contained in images but no text layer - - OCR mode set to skip_neverarchive - WHEN: - - Document is parsed - THEN: - - Text from images is extracted - - No archive file is created - """ - parser = RasterisedDocumentParser(None) - parser.parse( - os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), - "application/pdf", - ) - self.assertIsNone(parser.archive_path) - self.assertContainsStrings( - parser.get_text().lower(), - ["page 1", "page 2", "page 3"], - ) - @override_settings(OCR_MODE="skip") def test_multi_page_mixed(self): """