Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

This commit is contained in:
Brandon Rothweiler 2023-02-23 22:42:57 -05:00
parent 8a89f5ae27
commit ca412e0184
8 changed files with 185 additions and 14 deletions

View File

@ -415,12 +415,6 @@ modes are available:
- `skip`: Paperless skips all pages and will perform ocr only on - `skip`: Paperless skips all pages and will perform ocr only on
pages where no text is present. This is the safest option. pages where no text is present. This is the safest option.
- `skip_noarchive`: In addition to skip, paperless won't create
an archived version of your documents when it finds any text in
them. This is useful if you don't want to have two
almost-identical versions of your digital documents in the media
folder. This is the fastest option.
- `redo`: Paperless will OCR all pages of your documents and - `redo`: Paperless will OCR all pages of your documents and
attempt to replace any existing text layers with new text. This attempt to replace any existing text layers with new text. This
will be useful for documents from scanners that already will be useful for documents from scanners that already
@ -443,6 +437,19 @@ modes are available:
Read more about this in the [OCRmyPDF Read more about this in the [OCRmyPDF
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped). documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`
: Specify when you would like paperless to skip creating an archived
version of your documents. This is useful if you don't want to have two
almost-identical versions of your documents in the media folder.
- `never`: Never skip creating an archived version.
- `with_text`: Skip creating an archived version for documents
that already have embedded text.
- `always`: Always skip creating an archived version.
The default is `never`.
`PAPERLESS_OCR_CLEAN=<mode>` `PAPERLESS_OCR_CLEAN=<mode>`
: Tells paperless to use `unpaper` to clean any input document before : Tells paperless to use `unpaper` to clean any input document before

View File

@ -818,9 +818,10 @@ performance immensely:
other tasks). other tasks).
- Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider - Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider
OCR'ing your documents before feeding them into paperless. Some OCR'ing your documents before feeding them into paperless. Some
scanners are able to do this! You might want to even specify scanners are able to do this!
`skip_noarchive` to skip archive file generation for already ocr'ed - Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive
documents entirely. file generation for already ocr'ed documents, or `always` to skip it
for all documents.
- If you want to perform OCR on the device, consider using - If you want to perform OCR on the device, consider using
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use `PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
less memory at the expense of slightly worse OCR results. less memory at the expense of slightly worse OCR results.

View File

@ -60,8 +60,8 @@ following operations on your documents:
This process can be configured to fit your needs. If you don't want This process can be configured to fit your needs. If you don't want
paperless to create archived versions for digital documents, you can paperless to create archived versions for digital documents, you can
configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`. configure that by configuring
Please read the `PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
[relevant section in the documentation](/configuration#ocr). [relevant section in the documentation](/configuration#ocr).
!!! note !!! note

View File

@ -42,6 +42,7 @@
#PAPERLESS_OCR_LANGUAGE=eng #PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_MODE=skip #PAPERLESS_OCR_MODE=skip
#PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa #PAPERLESS_OCR_OUTPUT_TYPE=pdfa
#PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_PAGES=1
#PAPERLESS_OCR_IMAGE_DPI=300 #PAPERLESS_OCR_IMAGE_DPI=300

View File

@ -130,6 +130,23 @@ def settings_values_check(app_configs, **kwargs):
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}: if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid')) msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
# "skip_noarchive" is still accepted as an OCR mode, but flag it as
# deprecated and point the user at the replacement setting.
# NOTE: adjacent string literals are joined with no separator, so each
# fragment must carry its own trailing space.
if settings.OCR_MODE == "skip_noarchive":
    msgs.append(
        Warning(
            'OCR output mode "skip_noarchive" is deprecated and will be '
            "removed in a future version. Please use "
            "PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
        ),
    )
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
msgs.append(
Error(
"OCR_SKIP_ARCHIVE_FILE setting "
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
),
)
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}: if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid')) msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
return msgs return msgs

View File

@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# skip, redo, force # skip, redo, force
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")

View File

@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive, # If the original has text, and the user doesn't want an archive,
# we're done here # we're done here
if settings.OCR_MODE == "skip_noarchive" and original_has_text: skip_archive_for_text = (
settings.OCR_MODE == "skip_noarchive"
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
)
if skip_archive_for_text and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original self.text = text_original
return return
@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}") self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args) ocrmypdf.ocr(**args)
self.archive_path = archive_path if settings.OCR_SKIP_ARCHIVE_FILE != "always":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path) self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
["page 1", "page 2", "page 3"], ["page 1", "page 2", "page 3"],
) )
@override_settings(OOCR_MODE="skip") @override_settings(OCR_MODE="skip")
def test_multi_page_analog_pages_skip(self): def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withtext(self):
    """
    GIVEN:
        - The sample "multi-page-digital.pdf" (has an existing text layer)
        - OCR_SKIP_ARCHIVE_FILE is "never"
    WHEN:
        - The parser processes the document
    THEN:
        - An archive path is set on the parser
        - The parsed text contains "page 1", "page 2" and "page 3"
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    expected_fragments = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # "never" means the archive version must always be produced.
    self.assertIsNotNone(parser.archive_path)
    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
def test_skip_archive_never_withimages(self):
    """
    GIVEN:
        - The sample "multi-page-images.pdf" (text only inside images,
          no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "never"
    WHEN:
        - The parser processes the document
    THEN:
        - An archive path is set on the parser
        - The parsed text contains "page 1", "page 2" and "page 3"
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    expected_fragments = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # "never" means the archive version must always be produced.
    self.assertIsNotNone(parser.archive_path)
    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withtext(self):
    """
    GIVEN:
        - The sample "multi-page-digital.pdf" (has an existing text layer)
        - OCR_SKIP_ARCHIVE_FILE is "with_text"
    WHEN:
        - The parser processes the document
    THEN:
        - No archive path is set on the parser
        - The parsed text contains "page 1", "page 2" and "page 3"
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    expected_fragments = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # The document already has text, so the archive must be skipped.
    self.assertIsNone(parser.archive_path)
    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
def test_skip_archive_withtext_withimages(self):
    """
    GIVEN:
        - The sample "multi-page-images.pdf" (text only inside images,
          no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "with_text"
    WHEN:
        - The parser processes the document
    THEN:
        - An archive path is set on the parser
        - The parsed text contains "page 1", "page 2" and "page 3"
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    expected_fragments = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # No embedded text, so "with_text" must not suppress the archive.
    self.assertIsNotNone(parser.archive_path)
    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withtext(self):
    """
    GIVEN:
        - The sample "multi-page-digital.pdf" (has an existing text layer)
        - OCR_SKIP_ARCHIVE_FILE is "always"
    WHEN:
        - The parser processes the document
    THEN:
        - No archive path is set on the parser
        - The parsed text contains "page 1", "page 2" and "page 3"
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    expected_fragments = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # "always" suppresses the archive regardless of document content.
    self.assertIsNone(parser.archive_path)
    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, expected_fragments)
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
def test_skip_archive_always_withimages(self):
    """
    GIVEN:
        - The sample "multi-page-images.pdf" (text only inside images,
          no text layer)
        - OCR_SKIP_ARCHIVE_FILE is "always"
    WHEN:
        - The parser processes the document
    THEN:
        - No archive path is set on the parser
        - The parsed text contains "page 1", "page 2" and "page 3"
    """
    sample_pdf = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
    expected_fragments = ["page 1", "page 2", "page 3"]

    parser = RasterisedDocumentParser(None)
    parser.parse(sample_pdf, "application/pdf")

    # "always" suppresses the archive regardless of document content.
    self.assertIsNone(parser.archive_path)
    lowered_text = parser.get_text().lower()
    self.assertContainsStrings(lowered_text, expected_fragments)
@override_settings(OCR_MODE="skip") @override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self): def test_multi_page_mixed(self):
""" """