Adds a test covering a single page that mixes text-layer text with text inside images (OCR mode "redo")

2026-02-18 00:29:35 -06:00 · 2022-11-21 14:56:14 -08:00
parent b897d6de2e
commit f015556562
2 changed files with 53 additions and 9 deletions
--- a/src/paperless_tesseract/tests/samples/single-page-mixed.pdf
+++ b/src/paperless_tesseract/tests/samples/single-page-mixed.pdf
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -37,6 +37,9 @@ class FakeImageFile(ContextManager):
 class TestParser(DirectoriesMixin, TestCase):
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
        indices = []
@@ -47,14 +50,18 @@ class TestParser(DirectoriesMixin, TestCase):
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))
    text_cases = [
        ("simple     string", "simple string"),
        ("simple    newline\n   testing string", "simple newline\ntesting string"),
        ("utf-8   строка с пробелами в конце  ", "utf-8 строка с пробелами в конце"),
    ]
    def test_post_process_text(self):
-        for source, result in self.text_cases:
+
        text_cases = [
            ("simple     string", "simple string"),
            ("simple    newline\n   testing string", "simple newline\ntesting string"),
            (
                "utf-8   строка с пробелами в конце  ",
                "utf-8 строка с пробелами в конце",
            ),
        ]
        for source, result in text_cases:
            actual_result = post_process_text(source)
            self.assertEqual(
                result,
@@ -66,8 +73,6 @@ class TestParser(DirectoriesMixin, TestCase):
                ),
            )
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    def test_get_text_from_pdf(self):
        parser = RasterisedDocumentParser(uuid.uuid4())
        text = parser.extract_text(
@@ -461,6 +466,45 @@ class TestParser(DirectoriesMixin, TestCase):
        self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
    @override_settings(OCR_MODE="redo")
    def test_single_page_mixed(self):
        """
        GIVEN:
            - A PDF whose single page mixes text stored in the text layer
              with text that only exists inside images
            - OCR mode set to redo
        WHEN:
            - The document is parsed
        THEN:
            - An archive file is created
            - The parsed content contains the text-layer text as well as the
              image text, in the listed order
            - The sidecar file contains the image text but not the
              text-layer text
        """
        sample_path = os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf")
        layer_text = "this is some normal text, present on page 1 of the document."
        image_text = "this is some text, but in an image, also on page 1."
        trailing_text = "this is further text on page 1."

        parser = RasterisedDocumentParser(None)
        parser.parse(sample_path, "application/pdf")

        # An archive version must have been produced and written to disk.
        self.assertIsNotNone(parser.archive_path)
        self.assertTrue(os.path.isfile(parser.archive_path))

        # Compare case-insensitively against the full parsed content.
        extracted = parser.get_text().lower()
        self.assertContainsStrings(
            extracted,
            [layer_text, image_text, trailing_text],
        )

        sidecar_path = os.path.join(parser.tempdir, "sidecar.txt")
        with open(sidecar_path) as sidecar_file:
            sidecar = sidecar_file.read().lower()

        # The sidecar is expected to hold the image text only.
        self.assertIn(image_text, sidecar)
        self.assertNotIn(layer_text, sidecar)
    @override_settings(OCR_MODE="skip_noarchive")
    def test_multi_page_mixed_no_archive(self):
        """