diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index bde2ad25e..4cc9b8e5f 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -13,6 +13,10 @@ class NoTextFoundException(Exception): pass +class RtlLanguageException(Exception): + pass + + class RasterisedDocumentParser(DocumentParser): """ This parser uses Tesseract to try and get some text out of a rasterised @@ -125,7 +129,26 @@ class RasterisedDocumentParser(DocumentParser): stripped = post_process_text(pdfminer_extract_text(pdf_file)) self.log("debug", f"Extracted text from PDF file {pdf_file}") + + # pdfminer.six does not handle RTL text + # as a hack, for some languages, return no text, to force + # OCRmyPDF/Tesseract to handle this correctly + from langdetect import detect + + lang = detect(stripped) + + self.log("debug", f"Detected language {lang}") + + if lang in { + "ar", # Arabic + "he", # Hebrew + "fa", # Persian + }: + raise RtlLanguageException() return stripped + except RtlLanguageException: + self.log("warning", f"Detected RTL language {lang}") + return None except Exception: # TODO catch all for various issues with PDFminer.six. # If PDFminer fails, fall back to OCR. @@ -305,7 +328,7 @@ class RasterisedDocumentParser(DocumentParser): ) if original_has_text: self.text = text_original - except (NoTextFoundException, InputFileError) as e: + except (NoTextFoundException, RtlLanguageException, InputFileError) as e: self.log( "warning", f"Encountered an error while running OCR: {str(e)}. 
" diff --git a/src/paperless_tesseract/tests/samples/rtl-test.pdf b/src/paperless_tesseract/tests/samples/rtl-test.pdf new file mode 100755 index 000000000..daa666f8b Binary files /dev/null and b/src/paperless_tesseract/tests/samples/rtl-test.pdf differ diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index a0550bde9..4d6890653 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -588,6 +588,39 @@ class TestParser(DirectoriesMixin, TestCase): params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertNotIn("deskew", params) + def test_rtl_language_detection(self): + """ + GIVEN: + - File with text in an RTL language + WHEN: + - Document is parsed + THEN: + - Text from the document is extracted + """ + parser = RasterisedDocumentParser(None) + with mock.patch.object( + parser, + "construct_ocrmypdf_parameters", + wraps=parser.construct_ocrmypdf_parameters, + ) as wrapped: + + parser.parse( + os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), + "application/pdf", + ) + + # There isn't a good way to actually check that RTL text is returned correctly, + # as it would require tesseract-ocr-ara to be installed for everyone running the + # test suite. This test does provide the coverage though and attempts to ensure + # the forced OCR happens + self.assertIsNotNone(parser.get_text()) + + self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2) + # Check the last call kwargs + self.assertTrue( + parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"], + ) + class TestParserFileTypes(DirectoriesMixin, TestCase):