mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-01 11:19:32 -05:00
Revert "Merge pull request #2732 from bdr99/skip_neverarchive"
This reverts commit 77b23d3acb573232e4e307b63a83f8ff557c0e7e, reversing changes made to 5d8aa278315dcf92bfa1abe9e1fbd4911f8ed258.
This commit is contained in:
parent
39b2375f8a
commit
d49e7d6693
@ -419,10 +419,7 @@ modes are available:
|
|||||||
an archived version of your documents when it finds any text in
|
an archived version of your documents when it finds any text in
|
||||||
them. This is useful if you don't want to have two
|
them. This is useful if you don't want to have two
|
||||||
almost-identical versions of your digital documents in the media
|
almost-identical versions of your digital documents in the media
|
||||||
folder.
|
folder. This is the fastest option.
|
||||||
|
|
||||||
- `skip_neverarchive`: In addition to skip, paperless will never
|
|
||||||
create an archive version of your documents. This is the fastest option.
|
|
||||||
|
|
||||||
- `redo`: Paperless will OCR all pages of your documents and
|
- `redo`: Paperless will OCR all pages of your documents and
|
||||||
attempt to replace any existing text layers with new text. This
|
attempt to replace any existing text layers with new text. This
|
||||||
|
@ -127,13 +127,7 @@ def settings_values_check(app_configs, **kwargs):
|
|||||||
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
|
||||||
)
|
)
|
||||||
|
|
||||||
if settings.OCR_MODE not in {
|
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||||
"force",
|
|
||||||
"skip",
|
|
||||||
"redo",
|
|
||||||
"skip_noarchive",
|
|
||||||
"skip_neverarchive",
|
|
||||||
}:
|
|
||||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||||
|
|
||||||
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
|
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
|
||||||
|
@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
if settings.OCR_MODE == "force" or safe_fallback:
|
if settings.OCR_MODE == "force" or safe_fallback:
|
||||||
ocrmypdf_args["force_ocr"] = True
|
ocrmypdf_args["force_ocr"] = True
|
||||||
elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
|
elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
|
||||||
ocrmypdf_args["skip_text"] = True
|
ocrmypdf_args["skip_text"] = True
|
||||||
elif settings.OCR_MODE == "redo":
|
elif settings.OCR_MODE == "redo":
|
||||||
ocrmypdf_args["redo_ocr"] = True
|
ocrmypdf_args["redo_ocr"] = True
|
||||||
@ -294,10 +294,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
# If the original has text, and the user doesn't want an archive,
|
# If the original has text, and the user doesn't want an archive,
|
||||||
# we're done here
|
# we're done here
|
||||||
if (
|
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
|
||||||
settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
|
|
||||||
and original_has_text
|
|
||||||
):
|
|
||||||
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
|
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
|
||||||
self.text = text_original
|
self.text = text_original
|
||||||
return
|
return
|
||||||
@ -323,9 +320,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
||||||
ocrmypdf.ocr(**args)
|
ocrmypdf.ocr(**args)
|
||||||
|
|
||||||
# Only create archive file if archiving isn't being skipped
|
self.archive_path = archive_path
|
||||||
if settings.OCR_MODE != "skip_neverarchive":
|
|
||||||
self.archive_path = archive_path
|
|
||||||
|
|
||||||
self.text = self.extract_text(sidecar_file, archive_path)
|
self.text = self.extract_text(sidecar_file, archive_path)
|
||||||
|
|
||||||
|
@ -438,52 +438,6 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
|
|
||||||
self.assertIsNotNone(parser.archive_path)
|
self.assertIsNotNone(parser.archive_path)
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip_neverarchive")
|
|
||||||
def test_skip_neverarchive_withtext(self):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- File with existing text layer
|
|
||||||
- OCR mode set to skip_neverarchive
|
|
||||||
WHEN:
|
|
||||||
- Document is parsed
|
|
||||||
THEN:
|
|
||||||
- Text from images is extracted
|
|
||||||
- No archive file is created
|
|
||||||
"""
|
|
||||||
parser = RasterisedDocumentParser(None)
|
|
||||||
parser.parse(
|
|
||||||
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
|
||||||
"application/pdf",
|
|
||||||
)
|
|
||||||
self.assertIsNone(parser.archive_path)
|
|
||||||
self.assertContainsStrings(
|
|
||||||
parser.get_text().lower(),
|
|
||||||
["page 1", "page 2", "page 3"],
|
|
||||||
)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip_neverarchive")
|
|
||||||
def test_skip_neverarchive_notext(self):
|
|
||||||
"""
|
|
||||||
GIVEN:
|
|
||||||
- File with text contained in images but no text layer
|
|
||||||
- OCR mode set to skip_neverarchive
|
|
||||||
WHEN:
|
|
||||||
- Document is parsed
|
|
||||||
THEN:
|
|
||||||
- Text from images is extracted
|
|
||||||
- No archive file is created
|
|
||||||
"""
|
|
||||||
parser = RasterisedDocumentParser(None)
|
|
||||||
parser.parse(
|
|
||||||
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
|
||||||
"application/pdf",
|
|
||||||
)
|
|
||||||
self.assertIsNone(parser.archive_path)
|
|
||||||
self.assertContainsStrings(
|
|
||||||
parser.get_text().lower(),
|
|
||||||
["page 1", "page 2", "page 3"],
|
|
||||||
)
|
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
|
@override_settings(OCR_MODE="skip")
|
||||||
def test_multi_page_mixed(self):
|
def test_multi_page_mixed(self):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user