mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better
This commit is contained in:
parent
15cba8e14d
commit
a2b7687c3b
@ -13,6 +13,10 @@ class NoTextFoundException(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class RtlLanguageException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class RasterisedDocumentParser(DocumentParser):
|
class RasterisedDocumentParser(DocumentParser):
|
||||||
"""
|
"""
|
||||||
This parser uses Tesseract to try and get some text out of a rasterised
|
This parser uses Tesseract to try and get some text out of a rasterised
|
||||||
@ -125,7 +129,26 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
||||||
|
|
||||||
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
||||||
|
|
||||||
|
# pdfminer.six does not handle RTL text
|
||||||
|
# as a hack, for some languages, return no text, to force
|
||||||
|
# OCR. OCRmyPDF/Tesseract do handle this correctly
|
||||||
|
from langdetect import detect
|
||||||
|
|
||||||
|
lang = detect(stripped)
|
||||||
|
|
||||||
|
self.log("debug", f"Detected language {lang}")
|
||||||
|
|
||||||
|
if lang in {
|
||||||
|
"ar", # Arabic
|
||||||
|
"he", # Hebrew,
|
||||||
|
"fa", # Persian
|
||||||
|
}:
|
||||||
|
raise RtlLanguageException()
|
||||||
return stripped
|
return stripped
|
||||||
|
except RtlLanguageException:
|
||||||
|
self.log("warning", f"Detected RTL language {lang}")
|
||||||
|
return None
|
||||||
except Exception:
|
except Exception:
|
||||||
# TODO catch all for various issues with PDFminer.six.
|
# TODO catch all for various issues with PDFminer.six.
|
||||||
# If PDFminer fails, fall back to OCR.
|
# If PDFminer fails, fall back to OCR.
|
||||||
@ -305,7 +328,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
)
|
)
|
||||||
if original_has_text:
|
if original_has_text:
|
||||||
self.text = text_original
|
self.text = text_original
|
||||||
except (NoTextFoundException, InputFileError) as e:
|
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
|
||||||
self.log(
|
self.log(
|
||||||
"warning",
|
"warning",
|
||||||
f"Encountered an error while running OCR: {str(e)}. "
|
f"Encountered an error while running OCR: {str(e)}. "
|
||||||
|
BIN
src/paperless_tesseract/tests/samples/rtl-test.pdf
Executable file
BIN
src/paperless_tesseract/tests/samples/rtl-test.pdf
Executable file
Binary file not shown.
@ -588,6 +588,39 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||||
self.assertNotIn("deskew", params)
|
self.assertNotIn("deskew", params)
|
||||||
|
|
||||||
|
def test_rtl_language_detection(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with text in an RTL language
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from the document is extracted
|
||||||
|
"""
|
||||||
|
parser = RasterisedDocumentParser(None)
|
||||||
|
with mock.patch.object(
|
||||||
|
parser,
|
||||||
|
"construct_ocrmypdf_parameters",
|
||||||
|
wraps=parser.construct_ocrmypdf_parameters,
|
||||||
|
) as wrapped:
|
||||||
|
|
||||||
|
parser.parse(
|
||||||
|
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
|
||||||
|
"application/pdf",
|
||||||
|
)
|
||||||
|
|
||||||
|
# There isn't a good way to actually check that this works, with RTL text correctly returned,
|
||||||
|
# as it would require tesseract-ocr-ara installed for everyone running the
|
||||||
|
# test suite. This test does provide the coverage though and attempts to ensure
|
||||||
|
# the forced OCR happens
|
||||||
|
self.assertIsNotNone(parser.get_text())
|
||||||
|
|
||||||
|
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
|
||||||
|
# Check the last call kwargs
|
||||||
|
self.assertTrue(
|
||||||
|
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestParserFileTypes(DirectoriesMixin, TestCase):
|
class TestParserFileTypes(DirectoriesMixin, TestCase):
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user