Revert "Merge pull request #2732 from bdr99/skip_neverarchive"

This reverts commit 77b23d3acb573232e4e307b63a83f8ff557c0e7e, reversing
changes made to 5d8aa278315dcf92bfa1abe9e1fbd4911f8ed258.
This commit is contained in:
Brandon Rothweiler 2023-02-23 21:26:53 -05:00
parent 39b2375f8a
commit d49e7d6693
4 changed files with 5 additions and 65 deletions

View File

@@ -419,10 +419,7 @@ modes are available:
an archived version of your documents when it finds any text in an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media almost-identical versions of your digital documents in the media
folder. folder. This is the fastest option.
- `skip_neverarchive`: In addition to skip, paperless will never
create an archive version of your documents. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and - `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This attempt to replace any existing text layers with new text. This

View File

@@ -127,13 +127,7 @@ def settings_values_check(app_configs, **kwargs):
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'), Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
) )
if settings.OCR_MODE not in { if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
"force",
"skip",
"redo",
"skip_noarchive",
"skip_neverarchive",
}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:

View File

@@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
if settings.OCR_MODE == "force" or safe_fallback: if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]: elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
ocrmypdf_args["skip_text"] = True ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo": elif settings.OCR_MODE == "redo":
ocrmypdf_args["redo_ocr"] = True ocrmypdf_args["redo_ocr"] = True
@@ -294,10 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive, # If the original has text, and the user doesn't want an archive,
# we're done here # we're done here
if ( if settings.OCR_MODE == "skip_noarchive" and original_has_text:
settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
and original_has_text
):
self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original self.text = text_original
return return
@@ -323,9 +320,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}") self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args) ocrmypdf.ocr(**args)
# Only create archive file if archiving isn't being skipped self.archive_path = archive_path
if settings.OCR_MODE != "skip_neverarchive":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path) self.text = self.extract_text(sidecar_file, archive_path)

View File

@@ -438,52 +438,6 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR mode set to skip_neverarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_notext(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR mode set to skip_neverarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip") @override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self): def test_multi_page_mixed(self):
""" """