mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Add a setting to disable creating an archive file
This commit is contained in:
		@@ -419,7 +419,10 @@ modes are available:
 | 
				
			|||||||
        an archived version of your documents when it finds any text in
 | 
					        an archived version of your documents when it finds any text in
 | 
				
			||||||
        them. This is useful if you don't want to have two
 | 
					        them. This is useful if you don't want to have two
 | 
				
			||||||
        almost-identical versions of your digital documents in the media
 | 
					        almost-identical versions of your digital documents in the media
 | 
				
			||||||
        folder. This is the fastest option.
 | 
					        folder.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    -   `skip_neverarchive`: In addition to the `skip` behavior, paperless will never
 | 
				
			||||||
 | 
					        create an archive version of your documents. This is the fastest option.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    -   `redo`: Paperless will OCR all pages of your documents and
 | 
					    -   `redo`: Paperless will OCR all pages of your documents and
 | 
				
			||||||
        attempt to replace any existing text layers with new text. This
 | 
					        attempt to replace any existing text layers with new text. This
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -127,7 +127,13 @@ def settings_values_check(app_configs, **kwargs):
 | 
				
			|||||||
                Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
 | 
					                Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
 | 
					        if settings.OCR_MODE not in {
 | 
				
			||||||
 | 
					            "force",
 | 
				
			||||||
 | 
					            "skip",
 | 
				
			||||||
 | 
					            "redo",
 | 
				
			||||||
 | 
					            "skip_noarchive",
 | 
				
			||||||
 | 
					            "skip_neverarchive",
 | 
				
			||||||
 | 
					        }:
 | 
				
			||||||
            msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
 | 
					            msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
 | 
					        if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        if settings.OCR_MODE == "force" or safe_fallback:
 | 
					        if settings.OCR_MODE == "force" or safe_fallback:
 | 
				
			||||||
            ocrmypdf_args["force_ocr"] = True
 | 
					            ocrmypdf_args["force_ocr"] = True
 | 
				
			||||||
        elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
 | 
					        elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
 | 
				
			||||||
            ocrmypdf_args["skip_text"] = True
 | 
					            ocrmypdf_args["skip_text"] = True
 | 
				
			||||||
        elif settings.OCR_MODE == "redo":
 | 
					        elif settings.OCR_MODE == "redo":
 | 
				
			||||||
            ocrmypdf_args["redo_ocr"] = True
 | 
					            ocrmypdf_args["redo_ocr"] = True
 | 
				
			||||||
@@ -294,7 +294,10 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        # If the original has text, and the user doesn't want an archive,
 | 
					        # If the original has text, and the user doesn't want an archive,
 | 
				
			||||||
        # we're done here
 | 
					        # we're done here
 | 
				
			||||||
        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
 | 
					        if (
 | 
				
			||||||
 | 
					            settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
 | 
				
			||||||
 | 
					            and original_has_text
 | 
				
			||||||
 | 
					        ):
 | 
				
			||||||
            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
 | 
					            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
 | 
				
			||||||
            self.text = text_original
 | 
					            self.text = text_original
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
@@ -320,6 +323,8 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
 | 
					            self.log("debug", f"Calling OCRmyPDF with args: {args}")
 | 
				
			||||||
            ocrmypdf.ocr(**args)
 | 
					            ocrmypdf.ocr(**args)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Only create archive file if archiving isn't being skipped
 | 
				
			||||||
 | 
					            if settings.OCR_MODE != "skip_neverarchive":
 | 
				
			||||||
                self.archive_path = archive_path
 | 
					                self.archive_path = archive_path
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.text = self.extract_text(sidecar_file, archive_path)
 | 
					            self.text = self.extract_text(sidecar_file, archive_path)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -438,6 +438,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        self.assertIsNotNone(parser.archive_path)
 | 
					        self.assertIsNotNone(parser.archive_path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @override_settings(OCR_MODE="skip_neverarchive")
 | 
				
			||||||
 | 
					    def test_skip_neverarchive_withtext(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - File with existing text layer
 | 
				
			||||||
 | 
					            - OCR mode set to skip_neverarchive
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Document is parsed
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Text from the existing text layer is extracted
 | 
				
			||||||
 | 
					            - No archive file is created
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
 | 
					        parser.parse(
 | 
				
			||||||
 | 
					            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
 | 
				
			||||||
 | 
					            "application/pdf",
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        self.assertIsNone(parser.archive_path)
 | 
				
			||||||
 | 
					        self.assertContainsStrings(
 | 
				
			||||||
 | 
					            parser.get_text().lower(),
 | 
				
			||||||
 | 
					            ["page 1", "page 2", "page 3"],
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @override_settings(OCR_MODE="skip_neverarchive")
 | 
				
			||||||
 | 
					    def test_skip_neverarchive_notext(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - File with text contained in images but no text layer
 | 
				
			||||||
 | 
					            - OCR mode set to skip_neverarchive
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Document is parsed
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Text from images is extracted
 | 
				
			||||||
 | 
					            - No archive file is created
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
 | 
					        parser.parse(
 | 
				
			||||||
 | 
					            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
 | 
				
			||||||
 | 
					            "application/pdf",
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					        self.assertIsNone(parser.archive_path)
 | 
				
			||||||
 | 
					        self.assertContainsStrings(
 | 
				
			||||||
 | 
					            parser.get_text().lower(),
 | 
				
			||||||
 | 
					            ["page 1", "page 2", "page 3"],
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @override_settings(OCR_MODE="skip")
 | 
					    @override_settings(OCR_MODE="skip")
 | 
				
			||||||
    def test_multi_page_mixed(self):
 | 
					    def test_multi_page_mixed(self):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user