Merge pull request #2743 from bdr99/ocr_skip_archive_file

Feature: Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting
This commit is contained in:
Jonas Winkler 2023-02-27 11:01:54 +01:00 committed by GitHub
commit 4133001c73
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 182 additions and 35 deletions

View File

@ -415,15 +415,6 @@ modes are available:
- `skip`: Paperless skips pages that already contain text and performs OCR
only on pages where no text is present. This is the safest option.
- `skip_noarchive`: In addition to skip, paperless won't create
an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media
folder.
- `skip_neverarchive`: In addition to skip, paperless will never
create an archive version of your documents. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This
will be useful for documents from scanners that already
@ -446,6 +437,19 @@ modes are available:
Read more about this in the [OCRmyPDF
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`
: Specify when you would like paperless to skip creating an archived
version of your documents. This is useful if you don't want to have two
almost-identical versions of your documents in the media folder.
- `never`: Never skip creating an archived version.
- `with_text`: Skip creating an archived version for documents
that already have embedded text.
- `always`: Always skip creating an archived version.
The default is `never`.
`PAPERLESS_OCR_CLEAN=<mode>`
: Tells paperless to use `unpaper` to clean any input document before

View File

@ -818,9 +818,10 @@ performance immensely:
other tasks).
- Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider
OCR'ing your documents before feeding them into paperless. Some
scanners are able to do this! You might want to even specify
`skip_noarchive` to skip archive file generation for already ocr'ed
documents entirely.
scanners are able to do this!
- Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive
file generation for already ocr'ed documents, or `always` to skip it
for all documents.
- If you want to perform OCR on the device, consider using
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
less memory at the expense of slightly worse OCR results.

View File

@ -60,8 +60,8 @@ following operations on your documents:
This process can be configured to fit your needs. If you don't want
paperless to create archived versions for digital documents, you can
configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`.
Please read the
configure that by setting
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
[relevant section in the documentation](/configuration#ocr).
!!! note

View File

@ -42,6 +42,7 @@
#PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_MODE=skip
#PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
#PAPERLESS_OCR_PAGES=1
#PAPERLESS_OCR_IMAGE_DPI=300

View File

@ -127,15 +127,26 @@ def settings_values_check(app_configs, **kwargs):
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
)
if settings.OCR_MODE not in {
"force",
"skip",
"redo",
"skip_noarchive",
"skip_neverarchive",
}:
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_MODE == "skip_noarchive":
msgs.append(
Warning(
'OCR output mode "skip_noarchive" is deprecated and will be '
"removed in a future version. Please use "
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
),
)
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
msgs.append(
Error(
"OCR_SKIP_ARCHIVE_FILE setting "
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
),
)
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
return msgs

View File

@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# skip, redo, force
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")

View File

@ -105,6 +105,42 @@ class TestSettingsChecks(DirectoriesMixin, TestCase):
self.assertIn('OCR output mode "makeitso"', msg.msg)
@override_settings(OCR_MODE="skip_noarchive")
def test_deprecated_ocr_type(self):
    """
    GIVEN:
        - Default settings
        - OCR_MODE overridden with the deprecated "skip_noarchive" value
    WHEN:
        - The settings value checks are run
    THEN:
        - Exactly one message is returned
        - That message mentions the deprecation
    """
    reported = settings_values_check(None)

    self.assertEqual(len(reported), 1)
    self.assertIn("deprecated", reported[0].msg)
@override_settings(OCR_SKIP_ARCHIVE_FILE="invalid")
def test_invalid_ocr_skip_archive_file(self):
    """
    GIVEN:
        - Default settings
        - OCR_SKIP_ARCHIVE_FILE overridden with an unsupported value
    WHEN:
        - The settings value checks are run
    THEN:
        - Exactly one message is returned
        - That message names the setting and quotes the offending value
    """
    reported = settings_values_check(None)

    self.assertEqual(len(reported), 1)
    self.assertIn(
        'OCR_SKIP_ARCHIVE_FILE setting "invalid"',
        reported[0].msg,
    )
@override_settings(OCR_CLEAN="cleanme")
def test_invalid_ocr_clean(self):
"""

View File

@ -192,7 +192,7 @@ class RasterisedDocumentParser(DocumentParser):
if settings.OCR_MODE == "force" or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive", "skip_neverarchive"]:
elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo":
ocrmypdf_args["redo_ocr"] = True
@ -294,10 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
if (
settings.OCR_MODE in ["skip_noarchive", "skip_neverarchive"]
and original_has_text
):
skip_archive_for_text = (
settings.OCR_MODE == "skip_noarchive"
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
)
if skip_archive_for_text and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
@ -323,8 +324,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
# Only create archive file if archiving isn't being skipped
if settings.OCR_MODE != "skip_neverarchive":
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
["page 1", "page 2", "page 3"],
)
@override_settings(OOCR_MODE="skip")
@override_settings(OCR_MODE="skip")
def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(
@ -438,16 +438,62 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_withtext(self):
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withtext(self):
    """
    GIVEN:
        - File with existing text layer
        - OCR_SKIP_ARCHIVE_FILE set to never
    WHEN:
        - Document is parsed
    THEN:
        - Text from text layer is extracted
        - Archive file is created
    """
    document_parser = RasterisedDocumentParser(None)
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    document_parser.parse(sample_pdf, "application/pdf")

    # "never" means archive creation is never skipped, so a path must be set.
    self.assertIsNotNone(document_parser.archive_path)

    extracted_text = document_parser.get_text().lower()
    self.assertContainsStrings(extracted_text, ["page 1", "page 2", "page 3"])
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withimages(self):
    """
    GIVEN:
        - File whose text lives only in images (no text layer)
        - OCR_SKIP_ARCHIVE_FILE set to never
    WHEN:
        - Document is parsed
    THEN:
        - Text from images is extracted
        - Archive file is created
    """
    document_parser = RasterisedDocumentParser(None)
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    document_parser.parse(sample_pdf, "application/pdf")

    # "never" means archive creation is never skipped, so a path must be set.
    self.assertIsNotNone(document_parser.archive_path)

    extracted_text = document_parser.get_text().lower()
    self.assertContainsStrings(extracted_text, ["page 1", "page 2", "page 3"])
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR_SKIP_ARCHIVE_FILE set to with_text
WHEN:
- Document is parsed
THEN:
- Text from text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
@ -461,12 +507,58 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
["page 1", "page 2", "page 3"],
)
@override_settings(OCR_MODE="skip_neverarchive")
def test_skip_neverarchive_notext(self):
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withimages(self):
    """
    GIVEN:
        - File with text contained in images but no text layer
        - OCR_SKIP_ARCHIVE_FILE set to with_text
    WHEN:
        - Document is parsed
    THEN:
        - Text from images is extracted
        - Archive file is created
    """
    document_parser = RasterisedDocumentParser(None)
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    document_parser.parse(sample_pdf, "application/pdf")

    # The sample has no text layer, so "with_text" must not suppress the archive.
    self.assertIsNotNone(document_parser.archive_path)

    extracted_text = document_parser.get_text().lower()
    self.assertContainsStrings(extracted_text, ["page 1", "page 2", "page 3"])
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withtext(self):
    """
    GIVEN:
        - File with existing text layer
        - OCR_SKIP_ARCHIVE_FILE set to always
    WHEN:
        - Document is parsed
    THEN:
        - Text from text layer is extracted
        - No archive file is created
    """
    document_parser = RasterisedDocumentParser(None)
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    document_parser.parse(sample_pdf, "application/pdf")

    # "always" skips archive creation, so no archive path may be recorded.
    self.assertIsNone(document_parser.archive_path)

    extracted_text = document_parser.get_text().lower()
    self.assertContainsStrings(extracted_text, ["page 1", "page 2", "page 3"])
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withimages(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR_SKIP_ARCHIVE_FILE set to always
WHEN:
- Document is parsed
THEN: