Try a new way of extracting text from a given PDF file

2026-01-24 22:39:02 -06:00 · 2023-01-01 15:57:22 -08:00
parent da38efebdf
commit 7be9ae9c02
2 changed files with 26 additions and 48 deletions
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -661,28 +661,14 @@ class TestParser(DirectoriesMixin, TestCase):
            - Text from the document is extracted
        """
        parser = RasterisedDocumentParser(None)
-        with mock.patch.object(
-            parser,
-            "construct_ocrmypdf_parameters",
-            wraps=parser.construct_ocrmypdf_parameters,
-        ) as wrapped:

-            parser.parse(
-                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
-                "application/pdf",
-            )
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
+            "application/pdf",
+        )

-            # There isn't a good way to actually check this working, with RTL correctly return
-            #  as it would require tesseract-ocr-ara installed for everyone running the
-            #  test suite.  This test does provide the coverage though and attempts to ensure
-            # the force OCR happens
-            self.assertIsNotNone(parser.get_text())
-
-            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
-            # Check the last call kwargs
-            self.assertTrue(
-                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
-            )
+        # Expected RTL text, copied verbatim from rtl-test.pdf; leave the literal as-is
+        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())


 class TestParserFileTypes(DirectoriesMixin, TestCase):