Fixes an archive file being created even when the skip_noarchive OCR mode was specified

This commit is contained in:
Trenton Holmes 2022-08-20 13:47:56 -07:00
parent 0878a199f4
commit b3b2519bf0
2 changed files with 37 additions and 2 deletions

View File

@ -276,7 +276,10 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
self.archive_path = archive_path
# Only record the archive path if archiving isn't being skipped
if settings.OCR_MODE != "skip_noarchive":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
if not self.text:

View File

@ -364,6 +364,16 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from the existing text layer is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
@ -377,17 +387,29 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_notext(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
self.assertIsNone(parser.archive_path)
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
parser = RasterisedDocumentParser(None)
@ -408,6 +430,16 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),