mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Add a setting to disable creating an archive file
This commit is contained in:
		| @@ -127,7 +127,13 @@ def settings_values_check(app_configs, **kwargs): | ||||
|                 Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), | ||||
|             ) | ||||
|  | ||||
|         if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: | ||||
|         if settings.OCR_MODE not in { | ||||
|             "force", | ||||
|             "skip", | ||||
|             "redo", | ||||
|             "skip_noarchive", | ||||
|             "skip_neverarchive", | ||||
|         }: | ||||
|             msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) | ||||
|  | ||||
|         if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: | ||||
|   | ||||
| @@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         if settings.OCR_MODE == "force" or safe_fallback: | ||||
|             ocrmypdf_args["force_ocr"] = True | ||||
|         elif settings.OCR_MODE in ["skip", "skip_noarchive"]: | ||||
|         elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]: | ||||
|             ocrmypdf_args["skip_text"] = True | ||||
|         elif settings.OCR_MODE == "redo": | ||||
|             ocrmypdf_args["redo_ocr"] = True | ||||
| @@ -294,7 +294,10 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         # If the original has text, and the user doesn't want an archive, | ||||
|         # we're done here | ||||
|         if settings.OCR_MODE == "skip_noarchive" and original_has_text: | ||||
|         if ( | ||||
|             settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"] | ||||
|             and original_has_text | ||||
|         ): | ||||
|             self.log("debug", "Document has text, skipping OCRmyPDF entirely.") | ||||
|             self.text = text_original | ||||
|             return | ||||
| @@ -320,7 +323,9 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|             self.log("debug", f"Calling OCRmyPDF with args: {args}") | ||||
|             ocrmypdf.ocr(**args) | ||||
|  | ||||
|             self.archive_path = archive_path | ||||
|             # Only record the archive path if archiving isn't disabled (skip_neverarchive) | ||||
|             if settings.OCR_MODE != "skip_neverarchive": | ||||
|                 self.archive_path = archive_path | ||||
|  | ||||
|             self.text = self.extract_text(sidecar_file, archive_path) | ||||
|  | ||||
|   | ||||
| @@ -438,6 +438,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): | ||||
|  | ||||
|         self.assertIsNotNone(parser.archive_path) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_neverarchive") | ||||
|     def test_skip_neverarchive_withtext(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File with existing text layer | ||||
|             - OCR mode set to skip_neverarchive | ||||
|         WHEN: | ||||
|             - Document is parsed | ||||
|         THEN: | ||||
|             - Text from the existing text layer is extracted | ||||
|             - No archive file is created | ||||
|         """ | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), | ||||
|             "application/pdf", | ||||
|         ) | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), | ||||
|             ["page 1", "page 2", "page 3"], | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_neverarchive") | ||||
|     def test_skip_neverarchive_notext(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File with text contained in images but no text layer | ||||
|             - OCR mode set to skip_neverarchive | ||||
|         WHEN: | ||||
|             - Document is parsed | ||||
|         THEN: | ||||
|             - Text from images is extracted | ||||
|             - No archive file is created | ||||
|         """ | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), | ||||
|             "application/pdf", | ||||
|         ) | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), | ||||
|             ["page 1", "page 2", "page 3"], | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip") | ||||
|     def test_multi_page_mixed(self): | ||||
|         """ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Brandon Rothweiler
					Brandon Rothweiler