mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting
This commit is contained in:
@@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
|
||||
skip_archive_for_text = (
|
||||
settings.OCR_MODE == "skip_noarchive"
|
||||
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
|
||||
)
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
@@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
self.archive_path = archive_path
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
|
||||
|
@@ -332,7 +332,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OOCR_MODE="skip")
|
||||
@override_settings(OCR_MODE="skip")
|
||||
def test_multi_page_analog_pages_skip(self):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
@@ -438,6 +438,144 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
||||
def test_skip_archive_never_withtext(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to never
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from text layer is extracted
|
||||
- Archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="never")
|
||||
def test_skip_archive_never_withimages(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to never
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- Archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
|
||||
def test_skip_archive_withtext_withtext(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to with_text
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from text layer is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="with_text")
|
||||
def test_skip_archive_withtext_withimages(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to with_text
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- Archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNotNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
|
||||
def test_skip_archive_always_withtext(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to always
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from text layer is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_SKIP_ARCHIVE_FILE="always")
|
||||
def test_skip_archive_always_withimages(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR_SKIP_ARCHIVE_FILE set to always
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertIsNone(parser.archive_path)
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
@override_settings(OCR_MODE="skip")
|
||||
def test_multi_page_mixed(self):
|
||||
"""
|
||||
|
Reference in New Issue
Block a user