diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 6b9ec3d93..95c1dbfcc 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -43,10 +43,15 @@ class RasterisedDocumentParser(DocumentParser): def get_page_count(self, document_path, mime_type): page_count = None if mime_type == "application/pdf": - import pikepdf + try: + import pikepdf - with pikepdf.Pdf.open(document_path) as pdf: - page_count = len(pdf.pages) + with pikepdf.Pdf.open(document_path) as pdf: + page_count = len(pdf.pages) + except Exception as e: + self.log.warning( + f"Unable to determine PDF page count {document_path}: {e}", + ) return page_count def extract_metadata(self, document_path, mime_type): diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 45a5939ab..f7490fbbf 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -81,6 +81,24 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ) self.assertEqual(page_count, 6) + def test_get_page_count_password_protected(self): + """ + GIVEN: + - Password protected PDF file + WHEN: + - The number of pages is requested + THEN: + - The method returns None + """ + parser = RasterisedDocumentParser(uuid.uuid4()) + with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm: + page_count = parser.get_page_count( + os.path.join(self.SAMPLE_FILES, "password-protected.pdf"), + "application/pdf", + ) + self.assertEqual(page_count, None) + self.assertIn("Unable to determine PDF page count", cm.output[0]) + def test_thumbnail(self): parser = RasterisedDocumentParser(uuid.uuid4()) thumb = parser.get_thumbnail(