Catch encrypted PDF documents

This commit is contained in:
jonaswinkler 2020-12-03 01:02:37 +01:00
parent 9e9b9ae631
commit 6a04e95f69

View File

@ -7,7 +7,7 @@ import ocrmypdf
import pdftotext import pdftotext
from PIL import Image from PIL import Image
from django.conf import settings from django.conf import settings
from ocrmypdf import InputFileError from ocrmypdf import InputFileError, EncryptedPdfError
from documents.parsers import DocumentParser, ParseError, run_convert from documents.parsers import DocumentParser, ParseError, run_convert
@ -83,11 +83,12 @@ class RasterisedDocumentParser(DocumentParser):
return None return None
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type):
if settings.OCR_MODE == "skip_noarchive": text_original = get_text_from_pdf(document_path)
text = get_text_from_pdf(document_path) has_text = text_original and len(text_original) > 50
if text and len(text) > 50:
self.text = text if settings.OCR_MODE == "skip_noarchive" and has_text:
return self.text = text_original
return
archive_path = os.path.join(self.tempdir, "archive.pdf") archive_path = os.path.join(self.tempdir, "archive.pdf")
@ -105,6 +106,8 @@ class RasterisedDocumentParser(DocumentParser):
if settings.OCR_PAGES > 0: if settings.OCR_PAGES > 0:
ocr_args['pages'] = f"1-{settings.OCR_PAGES}" ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
# Mode selection.
if settings.OCR_MODE in ['skip', 'skip_noarchive']: if settings.OCR_MODE in ['skip', 'skip_noarchive']:
ocr_args['skip_text'] = True ocr_args['skip_text'] = True
elif settings.OCR_MODE == 'redo': elif settings.OCR_MODE == 'redo':
@ -149,11 +152,11 @@ class RasterisedDocumentParser(DocumentParser):
self.archive_path = archive_path self.archive_path = archive_path
self.text = get_text_from_pdf(archive_path) self.text = get_text_from_pdf(archive_path)
except InputFileError as e: except (InputFileError, EncryptedPdfError) as e:
# This happens with some PDFs when used with the redo_ocr option. # This happens with some PDFs when used with the redo_ocr option.
# This is not the end of the world, we'll just use what we already # This is not the end of the world, we'll just use what we already
# have in the document. # have in the document.
self.text = get_text_from_pdf(document_path) self.text = text_original
# Also, no archived file. # Also, no archived file.
if not self.text: if not self.text:
# However, if we don't have anything, fail: # However, if we don't have anything, fail:
@ -169,7 +172,7 @@ class RasterisedDocumentParser(DocumentParser):
'warning', 'warning',
f"Document {document_path} does not have any text." f"Document {document_path} does not have any text."
f"This is probably an error or you tried to add an image " f"This is probably an error or you tried to add an image "
f"without text.") f"without text, or something is wrong with this document.")
self.text = "" self.text = ""