In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better

This commit is contained in:
Trenton H 2022-11-29 13:19:16 -08:00
parent 15cba8e14d
commit a2b7687c3b
3 changed files with 57 additions and 1 deletions

View File

@ -13,6 +13,10 @@ class NoTextFoundException(Exception):
pass
class RtlLanguageException(Exception):
pass
class RasterisedDocumentParser(DocumentParser):
"""
This parser uses Tesseract to try and get some text out of a rasterised
@ -125,7 +129,26 @@ class RasterisedDocumentParser(DocumentParser):
stripped = post_process_text(pdfminer_extract_text(pdf_file))
self.log("debug", f"Extracted text from PDF file {pdf_file}")
# pdfminer.six does not handle RTL text
# as a hack, for some languages, return no text, to force
# OCRMyPdf/Tesseract do handle this correctly
from langdetect import detect
lang = detect(stripped)
self.log("debug", f"Detected language {lang}")
if lang in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}:
raise RtlLanguageException()
return stripped
except RtlLanguageException:
self.log("warning", f"Detected RTL language {lang}")
return None
except Exception:
# TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR.
@ -305,7 +328,7 @@ class RasterisedDocumentParser(DocumentParser):
)
if original_has_text:
self.text = text_original
except (NoTextFoundException, InputFileError) as e:
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
self.log(
"warning",
f"Encountered an error while running OCR: {str(e)}. "

Binary file not shown.

View File

@ -588,6 +588,39 @@ class TestParser(DirectoriesMixin, TestCase):
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
def test_rtl_language_detection(self):
"""
GIVEN:
- File with text in an RTL language
WHEN:
- Document is parsed
THEN:
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
with mock.patch.object(
parser,
"construct_ocrmypdf_parameters",
wraps=parser.construct_ocrmypdf_parameters,
) as wrapped:
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
# There isn't a good way to actually check this working, with RTL correctly return
# as it would require tesseract-ocr-ara installed for everyone running the
# test suite. This test does provide the coverage though and attempts to ensure
# the force OCR happens
self.assertIsNotNone(parser.get_text())
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
# Check the last call kwargs
self.assertTrue(
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
)
class TestParserFileTypes(DirectoriesMixin, TestCase):