Merge pull request #1442 from paperless-ngx/fix/skip-archive-still-archiving

Bugfix: Prevents an archive file from being created when the skip_noarchive OCR mode is specified
This commit is contained in:
Quinn Casey 2022-08-25 06:23:10 -07:00 committed by GitHub
commit 1692bac3fe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 2 deletions

View File

@ -276,7 +276,10 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
self.archive_path = archive_path
# Only set the archive path if archiving isn't being skipped
if settings.OCR_MODE != "skip_noarchive":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
if not self.text:

View File

@ -364,6 +364,16 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from the existing text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
@ -377,17 +387,29 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_notext(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
self.assertIsNone(parser.archive_path)
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
parser = RasterisedDocumentParser(None)
@ -408,6 +430,16 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),