Adds a test covering a single-page PDF that mixes text-layer text with text inside an image (OCR mode "redo")

This commit is contained in:
Trenton H 2022-11-21 14:56:14 -08:00
parent b897d6de2e
commit f015556562
2 changed files with 53 additions and 9 deletions

View File

@ -37,6 +37,9 @@ class FakeImageFile(ContextManager):
class TestParser(DirectoriesMixin, TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
indices = []
@ -47,14 +50,18 @@ class TestParser(DirectoriesMixin, TestCase):
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
text_cases = [
("simple string", "simple string"),
("simple newline\n testing string", "simple newline\ntesting string"),
("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце"),
]
def test_post_process_text(self):
for source, result in self.text_cases:
text_cases = [
("simple string", "simple string"),
("simple newline\n testing string", "simple newline\ntesting string"),
(
"utf-8 строка с пробелами в конце ",
"utf-8 строка с пробелами в конце",
),
]
for source, result in text_cases:
actual_result = post_process_text(source)
self.assertEqual(
result,
@ -66,8 +73,6 @@ class TestParser(DirectoriesMixin, TestCase):
),
)
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def test_get_text_from_pdf(self):
parser = RasterisedDocumentParser(uuid.uuid4())
text = parser.extract_text(
@ -461,6 +466,45 @@ class TestParser(DirectoriesMixin, TestCase):
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
@override_settings(OCR_MODE="redo")
def test_single_page_mixed(self):
    """
    GIVEN:
        - A PDF whose text is partly in a text layer and partly inside images
        - Both kinds of text share the same page
        - OCR mode is "redo"
    WHEN:
        - The parser processes the document
    THEN:
        - An archive file exists on disk
        - The parsed content holds the text-layer text and the image text,
          in the order they are listed below
        - The sidecar file holds the image text but not the text-layer text
    """
    # Expected snippets, compared against lower-cased output.
    text_layer_line = "this is some normal text, present on page 1 of the document."
    image_line = "this is some text, but in an image, also on page 1."
    further_line = "this is further text on page 1."

    parser = RasterisedDocumentParser(None)
    sample_pdf = os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf")
    parser.parse(sample_pdf, "application/pdf")

    # An archive version must have been produced.
    archive_path = parser.archive_path
    self.assertIsNotNone(archive_path)
    self.assertTrue(os.path.isfile(archive_path))

    # The full content must contain every snippet, in this order.
    parsed_text = parser.get_text().lower()
    self.assertContainsStrings(
        parsed_text,
        [text_layer_line, image_line, further_line],
    )

    # The sidecar is checked separately: image text in, text-layer text out.
    sidecar_path = os.path.join(parser.tempdir, "sidecar.txt")
    with open(sidecar_path) as sidecar_file:
        sidecar = sidecar_file.read().lower()

    self.assertIn(image_line, sidecar)
    self.assertNotIn(text_layer_line, sidecar)
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self):
"""