In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better

2026-01-20 22:24:24 -06:00 · 2022-11-29 13:19:16 -08:00
parent 15cba8e14d
commit a2b7687c3b
3 changed files with 57 additions and 1 deletions
--- a/src/paperless_tesseract/tests/samples/rtl-test.pdf
+++ b/src/paperless_tesseract/tests/samples/rtl-test.pdf
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -588,6 +588,39 @@ class TestParser(DirectoriesMixin, TestCase):
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
            self.assertNotIn("deskew", params)

+    def test_rtl_language_detection(self):
+        """
+        GIVEN:
+            - File with text in an RTL language
+        WHEN:
+            - Document is parsed
+        THEN:
+            - Text from the document is extracted
+        """
+        parser = RasterisedDocumentParser(None)
+        with mock.patch.object(
+            parser,
+            "construct_ocrmypdf_parameters",
+            wraps=parser.construct_ocrmypdf_parameters,
+        ) as wrapped:
+
+            parser.parse(
+                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
+                "application/pdf",
+            )
+
+            # There isn't a good way to check that the RTL text itself is returned
+            #  correctly, as that would require tesseract-ocr-ara to be installed for
+            #  everyone running the test suite.  This test still provides coverage and
+            #  attempts to ensure the forced OCR happens
+            self.assertIsNotNone(parser.get_text())
+
+            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
+            # The last (second) call must have passed a truthy safe_fallback kwarg
+            self.assertTrue(
+                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
+            )
+

 class TestParserFileTypes(DirectoriesMixin, TestCase):