mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting
This commit is contained in:
parent
8a89f5ae27
commit
ca412e0184
@ -415,12 +415,6 @@ modes are available:
|
|||||||
- `skip`: Paperless skips all pages and will perform ocr only on
|
- `skip`: Paperless skips all pages and will perform ocr only on
|
||||||
pages where no text is present. This is the safest option.
|
pages where no text is present. This is the safest option.
|
||||||
|
|
||||||
- `skip_noarchive`: In addition to skip, paperless won't create
|
|
||||||
an archived version of your documents when it finds any text in
|
|
||||||
them. This is useful if you don't want to have two
|
|
||||||
almost-identical versions of your digital documents in the media
|
|
||||||
folder. This is the fastest option.
|
|
||||||
|
|
||||||
- `redo`: Paperless will OCR all pages of your documents and
|
- `redo`: Paperless will OCR all pages of your documents and
|
||||||
attempt to replace any existing text layers with new text. This
|
attempt to replace any existing text layers with new text. This
|
||||||
will be useful for documents from scanners that already
|
will be useful for documents from scanners that already
|
||||||
@ -443,6 +437,19 @@ modes are available:
|
|||||||
Read more about this in the [OCRmyPDF
|
Read more about this in the [OCRmyPDF
|
||||||
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
documentation](https://ocrmypdf.readthedocs.io/en/latest/advanced.html#when-ocr-is-skipped).
|
||||||
|
|
||||||
|
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=<mode>`
|
||||||
|
|
||||||
|
: Specify when you would like paperless to skip creating an archived
|
||||||
|
version of your documents. This is useful if you don't want to have two
|
||||||
|
almost-identical versions of your documents in the media folder.
|
||||||
|
|
||||||
|
- `never`: Never skip creating an archived version.
|
||||||
|
- `with_text`: Skip creating an archived version for documents
|
||||||
|
that already have embedded text.
|
||||||
|
- `always`: Always skip creating an archived version.
|
||||||
|
|
||||||
|
The default is `never`.
|
||||||
|
|
||||||
`PAPERLESS_OCR_CLEAN=<mode>`
|
`PAPERLESS_OCR_CLEAN=<mode>`
|
||||||
|
|
||||||
: Tells paperless to use `unpaper` to clean any input document before
|
: Tells paperless to use `unpaper` to clean any input document before
|
||||||
|
@ -818,9 +818,10 @@ performance immensely:
|
|||||||
other tasks).
|
other tasks).
|
||||||
- Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider
|
- Keep `PAPERLESS_OCR_MODE` at its default value `skip` and consider
|
||||||
OCR'ing your documents before feeding them into paperless. Some
|
OCR'ing your documents before feeding them into paperless. Some
|
||||||
scanners are able to do this! You might want to even specify
|
scanners are able to do this!
|
||||||
`skip_noarchive` to skip archive file generation for already ocr'ed
|
- Set `PAPERLESS_OCR_SKIP_ARCHIVE_FILE` to `with_text` to skip archive
|
||||||
documents entirely.
|
file generation for already ocr'ed documents, or `always` to skip it
|
||||||
|
for all documents.
|
||||||
- If you want to perform OCR on the device, consider using
|
- If you want to perform OCR on the device, consider using
|
||||||
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
|
`PAPERLESS_OCR_CLEAN=none`. This will speed up OCR times and use
|
||||||
less memory at the expense of slightly worse OCR results.
|
less memory at the expense of slightly worse OCR results.
|
||||||
|
@ -60,8 +60,8 @@ following operations on your documents:
|
|||||||
|
|
||||||
This process can be configured to fit your needs. If you don't want
|
This process can be configured to fit your needs. If you don't want
|
||||||
paperless to create archived versions for digital documents, you can
|
paperless to create archived versions for digital documents, you can
|
||||||
configure that by configuring `PAPERLESS_OCR_MODE=skip_noarchive`.
|
disable that by setting
|
||||||
Please read the
|
`PAPERLESS_OCR_SKIP_ARCHIVE_FILE=with_text`. Please read the
|
||||||
[relevant section in the documentation](/configuration#ocr).
|
[relevant section in the documentation](/configuration#ocr).
|
||||||
|
|
||||||
!!! note
|
!!! note
|
||||||
|
@ -42,6 +42,7 @@
|
|||||||
|
|
||||||
#PAPERLESS_OCR_LANGUAGE=eng
|
#PAPERLESS_OCR_LANGUAGE=eng
|
||||||
#PAPERLESS_OCR_MODE=skip
|
#PAPERLESS_OCR_MODE=skip
|
||||||
|
#PAPERLESS_OCR_SKIP_ARCHIVE_FILE=never
|
||||||
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
|
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
|
||||||
#PAPERLESS_OCR_PAGES=1
|
#PAPERLESS_OCR_PAGES=1
|
||||||
#PAPERLESS_OCR_IMAGE_DPI=300
|
#PAPERLESS_OCR_IMAGE_DPI=300
|
||||||
|
@ -130,6 +130,23 @@ def settings_values_check(app_configs, **kwargs):
|
|||||||
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
|
||||||
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
|
||||||
|
|
||||||
|
if settings.OCR_MODE == "skip_noarchive":
|
||||||
|
msgs.append(
|
||||||
|
Warning(
|
||||||
|
'OCR output mode "skip_noarchive" is deprecated and will be '
|
||||||
|
"removed in a future version. Please use "
|
||||||
|
"PAPERLESS_OCR_SKIP_ARCHIVE_FILE instead.",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
if settings.OCR_SKIP_ARCHIVE_FILE not in {"never", "with_text", "always"}:
|
||||||
|
msgs.append(
|
||||||
|
Error(
|
||||||
|
"OCR_SKIP_ARCHIVE_FILE setting "
|
||||||
|
f'"{settings.OCR_SKIP_ARCHIVE_FILE}" is not valid',
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
|
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
|
||||||
msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
|
msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
|
||||||
return msgs
|
return msgs
|
||||||
|
@ -725,6 +725,8 @@ OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
|||||||
# skip, redo, force
|
# skip, redo, force
|
||||||
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||||
|
|
||||||
|
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
|
||||||
|
|
||||||
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
|
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
|
||||||
|
|
||||||
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
|
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
|
||||||
|
@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
# If the original has text, and the user doesn't want an archive,
|
# If the original has text, and the user doesn't want an archive,
|
||||||
# we're done here
|
# we're done here
|
||||||
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
|
skip_archive_for_text = (
|
||||||
|
settings.OCR_MODE == "skip_noarchive"
|
||||||
|
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
|
||||||
|
)
|
||||||
|
if skip_archive_for_text and original_has_text:
|
||||||
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
|
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
|
||||||
self.text = text_original
|
self.text = text_original
|
||||||
return
|
return
|
||||||
@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
||||||
ocrmypdf.ocr(**args)
|
ocrmypdf.ocr(**args)
|
||||||
|
|
||||||
self.archive_path = archive_path
|
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
|
||||||
|
self.archive_path = archive_path
|
||||||
|
|
||||||
self.text = self.extract_text(sidecar_file, archive_path)
|
self.text = self.extract_text(sidecar_file, archive_path)
|
||||||
|
|
||||||
|
@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
["page 1", "page 2", "page 3"],
|
["page 1", "page 2", "page 3"],
|
||||||
)
|
)
|
||||||
|
|
||||||
@override_settings(OOCR_MODE="skip")
|
@override_settings(OCR_MODE="skip")
|
||||||
def test_multi_page_analog_pages_skip(self):
|
def test_multi_page_analog_pages_skip(self):
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(
|
parser.parse(
|
||||||
@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
|
|
||||||
self.assertIsNotNone(parser.archive_path)
|
self.assertIsNotNone(parser.archive_path)
|
||||||
|
|
||||||
|
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
||||||
|
def test_skip_archive_never_withtext(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with existing text layer
|
||||||
|
- OCR_SKIP_ARCHIVE_FILE set to never
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from text layer is extracted
|
||||||
|
- Archive file is created
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
self.assertIsNotNone(parser.archive_path)
|
||||||
|
self.assertContainsStrings(
|
||||||
|
parser.get_text().lower(),
|
||||||
|
["page 1", "page 2", "page 3"],
|
||||||
|
)
|
||||||
|
|
||||||
|
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
||||||
|
def test_skip_archive_never_withimages(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with text contained in images but no text layer
|
||||||
|
- OCR_SKIP_ARCHIVE_FILE set to never
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from images is extracted
|
||||||
|
- Archive file is created
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
self.assertIsNotNone(parser.archive_path)
|
||||||
|
self.assertContainsStrings(
|
||||||
|
parser.get_text().lower(),
|
||||||
|
["page 1", "page 2", "page 3"],
|
||||||
|
)
|
||||||
|
|
||||||
|
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
|
||||||
|
def test_skip_archive_withtext_withtext(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with existing text layer
|
||||||
|
- OCR_SKIP_ARCHIVE_FILE set to with_text
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from text layer is extracted
|
||||||
|
- No archive file is created
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
self.assertIsNone(parser.archive_path)
|
||||||
|
self.assertContainsStrings(
|
||||||
|
parser.get_text().lower(),
|
||||||
|
["page 1", "page 2", "page 3"],
|
||||||
|
)
|
||||||
|
|
||||||
|
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
|
||||||
|
def test_skip_archive_withtext_withimages(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with text contained in images but no text layer
|
||||||
|
- OCR_SKIP_ARCHIVE_FILE set to with_text
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from images is extracted
|
||||||
|
- Archive file is created
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
self.assertIsNotNone(parser.archive_path)
|
||||||
|
self.assertContainsStrings(
|
||||||
|
parser.get_text().lower(),
|
||||||
|
["page 1", "page 2", "page 3"],
|
||||||
|
)
|
||||||
|
|
||||||
|
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
|
||||||
|
def test_skip_archive_always_withtext(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with existing text layer
|
||||||
|
- OCR_SKIP_ARCHIVE_FILE set to always
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from text layer is extracted
|
||||||
|
- No archive file is created
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
self.assertIsNone(parser.archive_path)
|
||||||
|
self.assertContainsStrings(
|
||||||
|
parser.get_text().lower(),
|
||||||
|
["page 1", "page 2", "page 3"],
|
||||||
|
)
|
||||||
|
|
||||||
|
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
|
||||||
|
def test_skip_archive_always_withimages(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with text contained in images but no text layer
|
||||||
|
- OCR_SKIP_ARCHIVE_FILE set to always
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from images is extracted
|
||||||
|
- No archive file is created
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
self.assertIsNone(parser.archive_path)
|
||||||
|
self.assertContainsStrings(
|
||||||
|
parser.get_text().lower(),
|
||||||
|
["page 1", "page 2", "page 3"],
|
||||||
|
)
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
|
@override_settings(OCR_MODE="skip")
|
||||||
def test_multi_page_mixed(self):
|
def test_multi_page_mixed(self):
|
||||||
"""
|
"""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user