mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-28 03:46:06 -05:00 
			
		
		
		
	Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting
This commit is contained in:
		| @@ -415,12 +415,6 @@ modes are available: | |||||||
|     -   `skip`: Paperless skips all pages and will perform ocr only on |     -   `skip`: Paperless skips all pages and will perform ocr only on | ||||||
|         pages where no text is present. This is the safest option. |         pages where no text is present. This is the safest option. | ||||||
|  |  | ||||||
|     -   `skip_noarchive`: In addition to skip, paperless won't create |  | ||||||
|         an archived version of your documents when it finds any text in |  | ||||||
|         them. This is useful if you don't want to have two |  | ||||||
|         almost-identical versions of your digital documents in the media |  | ||||||
|         folder. This is the fastest option. |  | ||||||
|  |  | ||||||
|     -   `redo`: Paperless will OCR all pages of your documents and |     -   `redo`: Paperless will OCR all pages of your documents and | ||||||
|         attempt to replace any existing text layers with new text. This |         attempt to replace any existing text layers with new text. This | ||||||
|         will be useful for documents from scanners that already |         will be useful for documents from scanners that already | ||||||
| @@ -443,6 +437,19 @@ modes are available: | |||||||
|     Read more about this in the [OCRmyPDF |     Read more about this in the [OCRmyPDF | ||||||
|     documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped). |     documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped). | ||||||
|  |  | ||||||
|  | `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>` | ||||||
|  |  | ||||||
|  | : Specify when you would like paperless to skip creating an archived | ||||||
|  | version of your documents. This is useful if you don't want to have two | ||||||
|  | almost-identical versions of your documents in the media folder. | ||||||
|  |  | ||||||
|  |     -   `never`: Never skip creating an archived version. | ||||||
|  |     -   `with_text`: Skip creating an archived version for documents | ||||||
|  |     that already have embedded text. | ||||||
|  |     -   `always`: Always skip creating an archived version. | ||||||
|  |  | ||||||
|  |     The default is `never`. | ||||||
|  |  | ||||||
| `PAPERLESS_OCR_CLEAN=<mode>` | `PAPERLESS_OCR_CLEAN=<mode>` | ||||||
|  |  | ||||||
| : Tells paperless to use `unpaper` to clean any input document before | : Tells paperless to use `unpaper` to clean any input document before | ||||||
|   | |||||||
| @@ -818,9 +818,10 @@ performance immensely: | |||||||
|   other tasks). |   other tasks). | ||||||
| - Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider | - Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider | ||||||
|   OCR'ing your documents before feeding them into paperless. Some |   OCR'ing your documents before feeding them into paperless. Some | ||||||
|   scanners are able to do this! You might want to even specify |   scanners are able to do this! | ||||||
|   `skip_noarchive` to skip archive file generation for already ocr'ed | - Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive | ||||||
|   documents entirely. |   file generation for already OCR'ed documents, or `always` to skip it | ||||||
|  |   for all documents. | ||||||
| - If you want to perform OCR on the device, consider using | - If you want to perform OCR on the device, consider using | ||||||
|   `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use |   `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use | ||||||
|   less memory at the expense of slightly worse OCR results. |   less memory at the expense of slightly worse OCR results. | ||||||
|   | |||||||
| @@ -60,8 +60,8 @@ following operations on your documents: | |||||||
|  |  | ||||||
|     This process can be configured to fit your needs. If you don't want |     This process can be configured to fit your needs. If you don't want | ||||||
|     paperless to create archived versions for digital documents, you can |     paperless to create archived versions for digital documents, you can | ||||||
|     configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`. |     do so by setting | ||||||
|     Please read the |     `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the | ||||||
|     [relevant section in the documentation](/configuration#ocr). |     [relevant section in the documentation](/configuration#ocr). | ||||||
|  |  | ||||||
| !!! note | !!! note | ||||||
|   | |||||||
| @@ -42,6 +42,7 @@ | |||||||
|  |  | ||||||
| #PAPERLESS_OCR_LANGUAGE=eng | #PAPERLESS_OCR_LANGUAGE=eng | ||||||
| #PAPERLESS_OCR_MODE=skip | #PAPERLESS_OCR_MODE=skip | ||||||
|  | #PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never | ||||||
| #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | ||||||
| #PAPERLESS_OCR_PAGES=1 | #PAPERLESS_OCR_PAGES=1 | ||||||
| #PAPERLESS_OCR_IMAGE_DPI=300 | #PAPERLESS_OCR_IMAGE_DPI=300 | ||||||
|   | |||||||
| @@ -130,6 +130,23 @@ def settings_values_check(app_configs, **kwargs): | |||||||
|         if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: |         if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: | ||||||
|             msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) |             msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) | ||||||
|  |  | ||||||
|  |         if settings.OCR_MODE == "skip_noarchive": | ||||||
|  |             msgs.append( | ||||||
|  |                 Warning( | ||||||
|  |                     'OCR output mode "skip_noarchive" is deprecated and will be ' | ||||||
|  |                     "removed in a future version. Please use " | ||||||
|  |                     "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.", | ||||||
|  |                 ), | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |         if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}: | ||||||
|  |             msgs.append( | ||||||
|  |                 Error( | ||||||
|  |                     "OCR_SKIP_ARCHIVE_FILE setting " | ||||||
|  |                     f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid', | ||||||
|  |                 ), | ||||||
|  |             ) | ||||||
|  |  | ||||||
|         if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: |         if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: | ||||||
|             msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid')) |             msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid')) | ||||||
|         return msgs |         return msgs | ||||||
|   | |||||||
| @@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | |||||||
| # skip, redo, force | # skip, redo, force | ||||||
| OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||||
|  |  | ||||||
|  | OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never") | ||||||
|  |  | ||||||
| OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||||
|  |  | ||||||
| OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") | OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") | ||||||
|   | |||||||
| @@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         # If the original has text, and the user doesn't want an archive, |         # If the original has text, and the user doesn't want an archive, | ||||||
|         # we're done here |         # we're done here | ||||||
|         if settings.OCR_MODE == "skip_noarchive" and original_has_text: |         skip_archive_for_text = ( | ||||||
|  |             settings.OCR_MODE == "skip_noarchive" | ||||||
|  |             or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"] | ||||||
|  |         ) | ||||||
|  |         if skip_archive_for_text and original_has_text: | ||||||
|             self.log("debug", "Document has text, skipping OCRmyPDF entirely.") |             self.log("debug", "Document has text, skipping OCRmyPDF entirely.") | ||||||
|             self.text = text_original |             self.text = text_original | ||||||
|             return |             return | ||||||
| @@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|             self.log("debug", f"Calling OCRmyPDF with args: {args}") |             self.log("debug", f"Calling OCRmyPDF with args: {args}") | ||||||
|             ocrmypdf.ocr(**args) |             ocrmypdf.ocr(**args) | ||||||
|  |  | ||||||
|             self.archive_path = archive_path |             if settings.OCR_SKIP_ARCHIVE_FILE != "always": | ||||||
|  |                 self.archive_path = archive_path | ||||||
|  |  | ||||||
|             self.text = self.extract_text(sidecar_file, archive_path) |             self.text = self.extract_text(sidecar_file, archive_path) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): | |||||||
|             ["page 1", "page 2", "page 3"], |             ["page 1", "page 2", "page 3"], | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     @override_settings(OOCR_MODE="skip") |     @override_settings(OCR_MODE="skip") | ||||||
|     def test_multi_page_analog_pages_skip(self): |     def test_multi_page_analog_pages_skip(self): | ||||||
|         parser = RasterisedDocumentParser(None) |         parser = RasterisedDocumentParser(None) | ||||||
|         parser.parse( |         parser.parse( | ||||||
| @@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): | |||||||
|  |  | ||||||
|         self.assertIsNotNone(parser.archive_path) |         self.assertIsNotNone(parser.archive_path) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_SKIP_ARCHIVE_FILE="never") | ||||||
|  |     def test_skip_archive_never_withtext(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with existing text layer | ||||||
|  |             - OCR_SKIP_ARCHIVE_FILE set to never | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text from text layer is extracted | ||||||
|  |             - Archive file is created | ||||||
|  |         """ | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse( | ||||||
|  |             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), | ||||||
|  |             "application/pdf", | ||||||
|  |         ) | ||||||
|  |         self.assertIsNotNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings( | ||||||
|  |             parser.get_text().lower(), | ||||||
|  |             ["page 1", "page 2", "page 3"], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_SKIP_ARCHIVE_FILE="never") | ||||||
|  |     def test_skip_archive_never_withimages(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with text contained in images but no text layer | ||||||
|  |             - OCR_SKIP_ARCHIVE_FILE set to never | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text from images is extracted | ||||||
|  |             - Archive file is created | ||||||
|  |         """ | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse( | ||||||
|  |             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), | ||||||
|  |             "application/pdf", | ||||||
|  |         ) | ||||||
|  |         self.assertIsNotNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings( | ||||||
|  |             parser.get_text().lower(), | ||||||
|  |             ["page 1", "page 2", "page 3"], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text") | ||||||
|  |     def test_skip_archive_withtext_withtext(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with existing text layer | ||||||
|  |             - OCR_SKIP_ARCHIVE_FILE set to with_text | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text from text layer is extracted | ||||||
|  |             - No archive file is created | ||||||
|  |         """ | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse( | ||||||
|  |             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), | ||||||
|  |             "application/pdf", | ||||||
|  |         ) | ||||||
|  |         self.assertIsNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings( | ||||||
|  |             parser.get_text().lower(), | ||||||
|  |             ["page 1", "page 2", "page 3"], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_SKIP_ARCHIVE_FILE="with_text") | ||||||
|  |     def test_skip_archive_withtext_withimages(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with text contained in images but no text layer | ||||||
|  |             - OCR_SKIP_ARCHIVE_FILE set to with_text | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text from images is extracted | ||||||
|  |             - Archive file is created | ||||||
|  |         """ | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse( | ||||||
|  |             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), | ||||||
|  |             "application/pdf", | ||||||
|  |         ) | ||||||
|  |         self.assertIsNotNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings( | ||||||
|  |             parser.get_text().lower(), | ||||||
|  |             ["page 1", "page 2", "page 3"], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_SKIP_ARCHIVE_FILE="always") | ||||||
|  |     def test_skip_archive_always_withtext(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with existing text layer | ||||||
|  |             - OCR_SKIP_ARCHIVE_FILE set to always | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text from text layer is extracted | ||||||
|  |             - No archive file is created | ||||||
|  |         """ | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse( | ||||||
|  |             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), | ||||||
|  |             "application/pdf", | ||||||
|  |         ) | ||||||
|  |         self.assertIsNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings( | ||||||
|  |             parser.get_text().lower(), | ||||||
|  |             ["page 1", "page 2", "page 3"], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_SKIP_ARCHIVE_FILE="always") | ||||||
|  |     def test_skip_archive_always_withimages(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File with text contained in images but no text layer | ||||||
|  |             - OCR_SKIP_ARCHIVE_FILE set to always | ||||||
|  |         WHEN: | ||||||
|  |             - Document is parsed | ||||||
|  |         THEN: | ||||||
|  |             - Text from images is extracted | ||||||
|  |             - No archive file is created | ||||||
|  |         """ | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse( | ||||||
|  |             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), | ||||||
|  |             "application/pdf", | ||||||
|  |         ) | ||||||
|  |         self.assertIsNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings( | ||||||
|  |             parser.get_text().lower(), | ||||||
|  |             ["page 1", "page 2", "page 3"], | ||||||
|  |         ) | ||||||
|  |  | ||||||
|     @override_settings(OCR_MODE="skip") |     @override_settings(OCR_MODE="skip") | ||||||
|     def test_multi_page_mixed(self): |     def test_multi_page_mixed(self): | ||||||
|         """ |         """ | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Brandon Rothweiler
					Brandon Rothweiler