catch encrypted pdf documents

2026-02-05 23:32:46 -06:00 · 2020-12-03 01:02:37 +01:00
parent 9e9b9ae631
commit 6a04e95f69
1 changed files with 12 additions and 9 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -7,7 +7,7 @@ import ocrmypdf
 import pdftotext
 from PIL import Image
 from django.conf import settings
-from ocrmypdf import InputFileError
+from ocrmypdf import InputFileError, EncryptedPdfError

 from documents.parsers import DocumentParser, ParseError, run_convert

@@ -83,11 +83,12 @@ class RasterisedDocumentParser(DocumentParser):
            return None

    def parse(self, document_path, mime_type):
-        if settings.OCR_MODE == "skip_noarchive":
-            text = get_text_from_pdf(document_path)
-            if text and len(text) > 50:
-                self.text = text
-                return
+        text_original = get_text_from_pdf(document_path)
+        has_text = text_original and len(text_original) > 50
+
+        if settings.OCR_MODE == "skip_noarchive" and has_text:
+            self.text = text_original
+            return

        archive_path = os.path.join(self.tempdir, "archive.pdf")

@@ -105,6 +106,8 @@ class RasterisedDocumentParser(DocumentParser):
        if settings.OCR_PAGES > 0:
            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"

+        # Mode selection.
+
        if settings.OCR_MODE in ['skip', 'skip_noarchive']:
            ocr_args['skip_text'] = True
        elif settings.OCR_MODE == 'redo':
@@ -149,11 +152,11 @@ class RasterisedDocumentParser(DocumentParser):
            self.archive_path = archive_path
            self.text = get_text_from_pdf(archive_path)

-        except InputFileError as e:
+        except (InputFileError, EncryptedPdfError) as e:
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
-            self.text = get_text_from_pdf(document_path)
+            self.text = text_original
            # Also, no archived file.
            if not self.text:
                # However, if we don't have anything, fail:
@@ -169,7 +172,7 @@ class RasterisedDocumentParser(DocumentParser):
                'warning',
                f"Document {document_path} does not have any text."
                f"This is probably an error or you tried to add an image "
-                f"without text.")
+                f"without text, or something is wrong with this document.")
            self.text = ""