Merge pull request #2732 from bdr99/skip_neverarchive

Feature: Add a setting to disable creating archive files
This commit is contained in:
shamoon 2023-02-23 15:22:49 -08:00 committed by GitHub
commit 77b23d3acb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 65 additions and 5 deletions

View File

@ -419,7 +419,10 @@ modes are available:
an archived version of your documents when it finds any text in an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media almost-identical versions of your digital documents in the media
folder. This is the fastest option. folder.
- `skip_neverarchive`: In addition to the behavior of `skip`, Paperless will
never create an archive version of your documents. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and - `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This attempt to replace any existing text layers with new text. This

View File

@ -127,7 +127,13 @@ def settings_values_check(app_configs, **kwargs):
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
) )
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: if settings.OCR_MODE not in {
"force",
"skip",
"redo",
"skip_noarchive",
"skip_neverarchive",
}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:

View File

@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
if settings.OCR_MODE == "force" or safe_fallback: if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive"]: elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
ocrmypdf_args["skip_text"] = True ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo": elif settings.OCR_MODE == "redo":
ocrmypdf_args["redo_ocr"] = True ocrmypdf_args["redo_ocr"] = True
@ -294,7 +294,10 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive, # If the original has text, and the user doesn't want an archive,
# we're done here # we're done here
if settings.OCR_MODE == "skip_noarchive" and original_has_text: if (
settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
and original_has_text
):
self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original self.text = text_original
return return
@ -320,7 +323,9 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}") self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args) ocrmypdf.ocr(**args)
self.archive_path = archive_path # Only create archive file if archiving isn't being skipped
if settings.OCR_MODE != "skip_neverarchive":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path) self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -438,6 +438,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR mode set to skip_neverarchive
WHEN:
- Document is parsed
THEN:
- Text from the existing text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_notext(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR mode set to skip_neverarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip") @override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self): def test_multi_page_mixed(self):
""" """