Try a new way of extracting text from a given PDF file

This commit is contained in:
Trenton Holmes
2023-01-01 15:57:22 -08:00
committed by Trenton H
parent da38efebdf
commit 7be9ae9c02
2 changed files with 26 additions and 48 deletions

View File

@@ -661,28 +661,14 @@ class TestParser(DirectoriesMixin, TestCase):
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
with mock.patch.object(
parser,
"construct_ocrmypdf_parameters",
wraps=parser.construct_ocrmypdf_parameters,
) as wrapped:
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
# There isn't a good way to actually check this working, with RTL correctly return
# as it would require tesseract-ocr-ara installed for everyone running the
# test suite. This test does provide the coverage though and attempts to ensure
# the force OCR happens
self.assertIsNotNone(parser.get_text())
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
# Check the last call kwargs
self.assertTrue(
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
)
# Copied from the PDF to here. Don't even look at it
self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
class TestParserFileTypes(DirectoriesMixin, TestCase):