mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better
This commit is contained in:
@@ -13,6 +13,10 @@ class NoTextFoundException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RtlLanguageException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
"""
|
||||
This parser uses Tesseract to try and get some text out of a rasterised
|
||||
@@ -125,7 +129,26 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
stripped = post_process_text(pdfminer_extract_text(pdf_file))
|
||||
|
||||
self.log("debug", f"Extracted text from PDF file {pdf_file}")
|
||||
|
||||
# pdfminer.six does not handle RTL text
|
||||
# as a hack, for some languages, return no text, to force
|
||||
# OCRMyPdf/Tesseract do handle this correctly
|
||||
from langdetect import detect
|
||||
|
||||
lang = detect(stripped)
|
||||
|
||||
self.log("debug", f"Detected language {lang}")
|
||||
|
||||
if lang in {
|
||||
"ar", # Arabic
|
||||
"he", # Hebrew,
|
||||
"fa", # Persian
|
||||
}:
|
||||
raise RtlLanguageException()
|
||||
return stripped
|
||||
except RtlLanguageException:
|
||||
self.log("warning", f"Detected RTL language {lang}")
|
||||
return None
|
||||
except Exception:
|
||||
# TODO catch all for various issues with PDFminer.six.
|
||||
# If PDFminer fails, fall back to OCR.
|
||||
@@ -305,7 +328,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
)
|
||||
if original_has_text:
|
||||
self.text = text_original
|
||||
except (NoTextFoundException, InputFileError) as e:
|
||||
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
|
||||
self.log(
|
||||
"warning",
|
||||
f"Encountered an error while running OCR: {str(e)}. "
|
||||
|
Reference in New Issue
Block a user