diff --git a/src/paperless_tesseract/tests/samples/single-page-mixed.pdf b/src/paperless_tesseract/tests/samples/single-page-mixed.pdf new file mode 100644 index 000000000..2281fd389 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/single-page-mixed.pdf differ diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 858cc7701..67c1ad859 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -37,6 +37,9 @@ class FakeImageFile(ContextManager): class TestParser(DirectoriesMixin, TestCase): + + SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") + def assertContainsStrings(self, content, strings): # Asserts that all strings appear in content, in the given order. indices = [] @@ -47,14 +50,18 @@ class TestParser(DirectoriesMixin, TestCase): self.fail(f"'{s}' is not in '{content}'") self.assertListEqual(indices, sorted(indices)) - text_cases = [ - ("simple string", "simple string"), - ("simple newline\n testing string", "simple newline\ntesting string"), - ("utf-8 строка с пробелами в конце ", "utf-8 строка с пробелами в конце"), - ] - def test_post_process_text(self): - for source, result in self.text_cases: + + text_cases = [ + ("simple string", "simple string"), + ("simple newline\n testing string", "simple newline\ntesting string"), + ( + "utf-8 строка с пробелами в конце ", + "utf-8 строка с пробелами в конце", + ), + ] + + for source, result in text_cases: actual_result = post_process_text(source) self.assertEqual( result, @@ -66,8 +73,6 @@ class TestParser(DirectoriesMixin, TestCase): ), ) - SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") - def test_get_text_from_pdf(self): parser = RasterisedDocumentParser(uuid.uuid4()) text = parser.extract_text( @@ -461,6 +466,45 @@ class TestParser(DirectoriesMixin, TestCase): self.assertIn("[OCR skipped on page(s) 4-6]", sidecar) + @override_settings(OCR_MODE="redo") + def test_single_page_mixed(self): + """ + GIVEN: + - File with some text contained in images and some in text layer + - Text and images are mixed on the same page + - OCR mode set to redo + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - Full content of the file is parsed (not just the image text) + - An archive file is created with the OCRd text and the original text + """ + parser = RasterisedDocumentParser(None) + parser.parse( + os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"), + "application/pdf", + ) + self.assertIsNotNone(parser.archive_path) + self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertContainsStrings( + parser.get_text().lower(), + [ + "this is some normal text, present on page 1 of the document.", + "this is some text, but in an image, also on page 1.", + "this is further text on page 1.", + ], + ) + + with open(os.path.join(parser.tempdir, "sidecar.txt")) as f: + sidecar = f.read().lower() + + self.assertIn("this is some text, but in an image, also on page 1.", sidecar) + self.assertNotIn( + "this is some normal text, present on page 1 of the document.", + sidecar, + ) + @override_settings(OCR_MODE="skip_noarchive") def test_multi_page_mixed_no_archive(self): """