mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
catch encrypted pdf documents
This commit is contained in:
parent
9e9b9ae631
commit
6a04e95f69
@ -7,7 +7,7 @@ import ocrmypdf
|
|||||||
import pdftotext
|
import pdftotext
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from ocrmypdf import InputFileError
|
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||||
|
|
||||||
from documents.parsers import DocumentParser, ParseError, run_convert
|
from documents.parsers import DocumentParser, ParseError, run_convert
|
||||||
|
|
||||||
@ -83,10 +83,11 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type):
|
||||||
if settings.OCR_MODE == "skip_noarchive":
|
text_original = get_text_from_pdf(document_path)
|
||||||
text = get_text_from_pdf(document_path)
|
has_text = text_original and len(text_original) > 50
|
||||||
if text and len(text) > 50:
|
|
||||||
self.text = text
|
if settings.OCR_MODE == "skip_noarchive" and has_text:
|
||||||
|
self.text = text_original
|
||||||
return
|
return
|
||||||
|
|
||||||
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
||||||
@ -105,6 +106,8 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if settings.OCR_PAGES > 0:
|
if settings.OCR_PAGES > 0:
|
||||||
ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
|
ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
|
||||||
|
|
||||||
|
# Mode selection.
|
||||||
|
|
||||||
if settings.OCR_MODE in ['skip', 'skip_noarchive']:
|
if settings.OCR_MODE in ['skip', 'skip_noarchive']:
|
||||||
ocr_args['skip_text'] = True
|
ocr_args['skip_text'] = True
|
||||||
elif settings.OCR_MODE == 'redo':
|
elif settings.OCR_MODE == 'redo':
|
||||||
@ -149,11 +152,11 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.archive_path = archive_path
|
self.archive_path = archive_path
|
||||||
self.text = get_text_from_pdf(archive_path)
|
self.text = get_text_from_pdf(archive_path)
|
||||||
|
|
||||||
except InputFileError as e:
|
except (InputFileError, EncryptedPdfError) as e:
|
||||||
# This happens with some PDFs when used with the redo_ocr option.
|
# This happens with some PDFs when used with the redo_ocr option.
|
||||||
# This is not the end of the world, we'll just use what we already
|
# This is not the end of the world, we'll just use what we already
|
||||||
# have in the document.
|
# have in the document.
|
||||||
self.text = get_text_from_pdf(document_path)
|
self.text = text_original
|
||||||
# Also, no archived file.
|
# Also, no archived file.
|
||||||
if not self.text:
|
if not self.text:
|
||||||
# However, if we don't have anything, fail:
|
# However, if we don't have anything, fail:
|
||||||
@ -169,7 +172,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
'warning',
|
'warning',
|
||||||
f"Document {document_path} does not have any text."
|
f"Document {document_path} does not have any text."
|
||||||
f"This is probably an error or you tried to add an image "
|
f"This is probably an error or you tried to add an image "
|
||||||
f"without text.")
|
f"without text, or something is wrong with this document.")
|
||||||
self.text = ""
|
self.text = ""
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user