mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
fixes for the parser.
This commit is contained in:
parent
34bc4020c9
commit
dab4b1253a
@@ -83,13 +83,27 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
return None
|
||||
|
||||
def parse(self, document_path, mime_type):
|
||||
mode = settings.OCR_MODE
|
||||
|
||||
text_original = get_text_from_pdf(document_path)
|
||||
has_text = text_original and len(text_original) > 50
|
||||
|
||||
if settings.OCR_MODE == "skip_noarchive" and has_text:
|
||||
if mode == "skip_noarchive" and has_text:
|
||||
self.log("debug",
|
||||
"Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
if mode in ['skip', 'skip_noarchive'] and not has_text:
|
||||
# upgrade to redo, since there appears to be no text in the
|
||||
# document. This happens with some weird encrypted documents or
|
||||
# documents with failed OCR attempts for which OCRmyPDF will
|
||||
# still report that there actually is text in them.
|
||||
self.log("debug",
|
||||
"No text was found in the document and skip is "
|
||||
"specified. Upgrading OCR mode to redo.")
|
||||
mode = "redo"
|
||||
|
||||
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
||||
|
||||
ocr_args = {
|
||||
@@ -108,12 +122,15 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
# Mode selection.
|
||||
|
||||
if settings.OCR_MODE in ['skip', 'skip_noarchive']:
|
||||
if mode in ['skip', 'skip_noarchive']:
|
||||
ocr_args['skip_text'] = True
|
||||
elif settings.OCR_MODE == 'redo':
|
||||
elif mode == 'redo':
|
||||
ocr_args['redo_ocr'] = True
|
||||
elif settings.OCR_MODE == 'force':
|
||||
elif mode == 'force':
|
||||
ocr_args['force_ocr'] = True
|
||||
else:
|
||||
raise ParseError(
|
||||
f"Invalid ocr mode: {mode}")
|
||||
|
||||
if self.is_image(mime_type):
|
||||
dpi = self.get_dpi(document_path)
|
||||
@@ -153,6 +170,10 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.text = get_text_from_pdf(archive_path)
|
||||
|
||||
except (InputFileError, EncryptedPdfError) as e:
|
||||
|
||||
self.log("debug",
|
||||
f"Encountered an error: {e}. Trying to use text from "
|
||||
f"original.")
|
||||
# This happens with some PDFs when used with the redo_ocr option.
|
||||
# This is not the end of the world, we'll just use what we already
|
||||
# have in the document.
|
||||
|
Loading…
x
Reference in New Issue
Block a user