Add a setting to disable creating an archive file

This commit is contained in:
Brandon Rothweiler 2023-02-22 15:27:17 -05:00
parent 782db3f324
commit 93a6391f96
4 changed files with 65 additions and 5 deletions

View File

@ -419,7 +419,10 @@ modes are available:
an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media
folder. This is the fastest option.
folder.
- `skip_neverarchive`: Like `skip`, but paperless will never keep an archive
version of your documents, even for documents it has to OCR. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This

View File

@ -127,7 +127,13 @@ def settings_values_check(app_configs, **kwargs):
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
)
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
if settings.OCR_MODE not in {
"force",
"skip",
"redo",
"skip_noarchive",
"skip_neverarchive",
}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:

View File

@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo":
ocrmypdf_args["redo_ocr"] = True
@ -294,7 +294,10 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
if (
settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
and original_has_text
):
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
@ -320,7 +323,9 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
self.archive_path = archive_path
# Only record the archive path when archive creation isn't disabled
if settings.OCR_MODE != "skip_neverarchive":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -438,6 +438,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_withtext(self):
    """
    GIVEN:
        - File with existing text layer
        - OCR mode set to skip_neverarchive
    WHEN:
        - Document is parsed
    THEN:
        - Text is taken from the existing text layer
        - No archive file is created
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    expected_markers = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # No archive path may be recorded in this mode, text layer or not.
    self.assertIsNone(parser.archive_path)

    extracted = parser.get_text().lower()
    self.assertContainsStrings(extracted, expected_markers)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_notext(self):
    """
    GIVEN:
        - File whose text exists only inside images (no text layer)
        - OCR mode set to skip_neverarchive
    WHEN:
        - Document is parsed
    THEN:
        - Text from images is extracted
        - No archive file is created
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    expected_markers = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # Even though OCR had to run, no archive path may be recorded.
    self.assertIsNone(parser.archive_path)

    extracted = parser.get_text().lower()
    self.assertContainsStrings(extracted, expected_markers)
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""