Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

This commit is contained in:
Brandon Rothweiler 2023-02-23 22:42:57 -05:00
parent 8a89f5ae27
commit ca412e0184
8 changed files with 185 additions and 14 deletions

View File

@ -415,12 +415,6 @@ modes are available:
- `skip`: Paperless skips pages that already contain text and will perform OCR only on
pages where no text is present. This is the safest option.
- `skip_noarchive`: In addition to skip, paperless won't create
an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media
folder. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This
will be useful for documents from scanners that already
@ -443,6 +437,19 @@ modes are available:
Read more about this in the [OCRmyPDF
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`
: Specify when you would like paperless to skip creating an archived
version of your documents. This is useful if you don't want to have two
almost-identical versions of your documents in the media folder.
- `never`: Never skip creating an archived version.
- `with_text`: Skip creating an archived version for documents
that already have embedded text.
- `always`: Always skip creating an archived version.
The default is `never`.
`PAPERLESS_OCR_CLEAN=<mode>`
: Tells paperless to use `unpaper` to clean any input document before

View File

@ -818,9 +818,10 @@ performance immensely:
other tasks).
- Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider
OCR'ing your documents before feeding them into paperless. Some
scanners are able to do this! You might want to even specify
`skip_noarchive` to skip archive file generation for already ocr'ed
documents entirely.
scanners are able to do this!
- Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive
file generation for already ocr'ed documents, or `always` to skip it
for all documents.
- If you want to perform OCR on the device, consider using
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
less memory at the expense of slightly worse OCR results.

View File

@ -60,8 +60,8 @@ following operations on your documents:
This process can be configured to fit your needs. If you don't want
paperless to create archived versions for digital documents, you can
configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`.
Please read the
configure that by setting
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
[relevant section in the documentation](/configuration#ocr).
!!! note

View File

@ -42,6 +42,7 @@
#PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_MODE=skip
#PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
#PAPERLESS_OCR_PAGES=1
#PAPERLESS_OCR_IMAGE_DPI=300

View File

@ -130,6 +130,23 @@ def settings_values_check(app_configs, **kwargs):
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_MODE == "skip_noarchive":
    # "skip_noarchive" is still accepted, but users are pointed at its
    # replacement setting. Adjacent string literals are joined verbatim, so
    # each fragment carries its own trailing space; without them the message
    # reads "will beremoved" and "Please usePAPERLESS_OCR_SKIP_ARCHIVE_FILE".
    msgs.append(
        Warning(
            'OCR output mode "skip_noarchive" is deprecated and will be '
            "removed in a future version. Please use "
            "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
        ),
    )
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
msgs.append(
Error(
"OCR_SKIP_ARCHIVE_FILE setting "
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
),
)
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
return msgs

View File

@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# skip, redo, force, skip_noarchive (deprecated)
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")

View File

@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
skip_archive_for_text = (
settings.OCR_MODE == "skip_noarchive"
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
)
if skip_archive_for_text and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
@ -320,6 +324,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
["page 1", "page 2", "page 3"],
)
@override_settings(OOCR_MODE="skip")
@override_settings(OCR_MODE="skip")
def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(
@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withtext(self):
    """
    GIVEN:
        - A PDF that already carries a text layer
        - OCR_SKIP_ARCHIVE_FILE is "never"
    WHEN:
        - The parser processes the document
    THEN:
        - The content of the text layer is returned
        - An archive file is produced
    """
    sample_path = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    doc_parser = RasterisedDocumentParser(None)
    doc_parser.parse(sample_path, "application/pdf")

    self.assertIsNotNone(doc_parser.archive_path)

    extracted = doc_parser.get_text().lower()
    expected_fragments = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(extracted, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withimages(self):
    """
    GIVEN:
        - A PDF whose text exists only inside images (no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "never"
    WHEN:
        - The parser processes the document
    THEN:
        - The text found in the images is returned
        - An archive file is produced
    """
    sample_path = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    doc_parser = RasterisedDocumentParser(None)
    doc_parser.parse(sample_path, "application/pdf")

    self.assertIsNotNone(doc_parser.archive_path)

    extracted = doc_parser.get_text().lower()
    expected_fragments = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(extracted, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withtext(self):
    """
    GIVEN:
        - A PDF that already carries a text layer
        - OCR_SKIP_ARCHIVE_FILE is "with_text"
    WHEN:
        - The parser processes the document
    THEN:
        - The content of the text layer is returned
        - No archive file is produced
    """
    sample_path = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    doc_parser = RasterisedDocumentParser(None)
    doc_parser.parse(sample_path, "application/pdf")

    self.assertIsNone(doc_parser.archive_path)

    extracted = doc_parser.get_text().lower()
    expected_fragments = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(extracted, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withimages(self):
    """
    GIVEN:
        - A PDF whose text exists only inside images (no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "with_text"
    WHEN:
        - The parser processes the document
    THEN:
        - The text found in the images is returned
        - An archive file is produced
    """
    sample_path = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    doc_parser = RasterisedDocumentParser(None)
    doc_parser.parse(sample_path, "application/pdf")

    self.assertIsNotNone(doc_parser.archive_path)

    extracted = doc_parser.get_text().lower()
    expected_fragments = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(extracted, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withtext(self):
    """
    GIVEN:
        - A PDF that already carries a text layer
        - OCR_SKIP_ARCHIVE_FILE is "always"
    WHEN:
        - The parser processes the document
    THEN:
        - The content of the text layer is returned
        - No archive file is produced
    """
    sample_path = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    doc_parser = RasterisedDocumentParser(None)
    doc_parser.parse(sample_path, "application/pdf")

    self.assertIsNone(doc_parser.archive_path)

    extracted = doc_parser.get_text().lower()
    expected_fragments = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(extracted, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withimages(self):
    """
    GIVEN:
        - A PDF whose text exists only inside images (no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "always"
    WHEN:
        - The parser processes the document
    THEN:
        - The text found in the images is returned
        - No archive file is produced
    """
    sample_path = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    doc_parser = RasterisedDocumentParser(None)
    doc_parser.parse(sample_path, "application/pdf")

    self.assertIsNone(doc_parser.archive_path)

    extracted = doc_parser.get_text().lower()
    expected_fragments = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(extracted, expected_fragments)
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""