Fix: handle page count exception for pw-protected files (#8240)

This commit is contained in:
shamoon 2024-11-10 03:33:47 -08:00 committed by GitHub
parent c22a80abd3
commit a6f4c75a72
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 26 additions and 3 deletions

View File

@ -43,10 +43,15 @@ class RasterisedDocumentParser(DocumentParser):
def get_page_count(self, document_path, mime_type):
    """Return the number of pages in a PDF document, if it can be determined.

    Args:
        document_path: Path of the document to inspect.
        mime_type: MIME type of the document. Only "application/pdf" is
            inspected; every other type yields ``None``.

    Returns:
        The page count as an ``int``, or ``None`` when the document is not
        a PDF or the PDF could not be opened/read.
    """
    page_count = None
    if mime_type == "application/pdf":
        try:
            # Imported lazily and inside the guard so that any failure while
            # loading or using pikepdf degrades to "unknown page count".
            import pikepdf

            with pikepdf.Pdf.open(document_path) as pdf:
                page_count = len(pdf.pages)
        except Exception as e:
            # Deliberately broad: page count is best-effort metadata, so a
            # file that cannot be opened must not abort the caller. Log the
            # reason and fall through with page_count left as None.
            self.log.warning(
                f"Unable to determine PDF page count {document_path}: {e}",
            )
    return page_count
def extract_metadata(self, document_path, mime_type):

View File

@ -81,6 +81,24 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
)
self.assertEqual(page_count, 6)
def test_get_page_count_password_protected(self):
    """
    GIVEN:
        - A PDF sample file that is password protected
    WHEN:
        - Its page count is requested from the parser
    THEN:
        - None is returned
        - A warning explaining the failure is logged
    """
    sample_path = os.path.join(self.SAMPLE_FILES, "password-protected.pdf")
    tesseract_parser = RasterisedDocumentParser(uuid.uuid4())

    with self.assertLogs(
        "paperless.parsing.tesseract",
        level="WARNING",
    ) as captured:
        result = tesseract_parser.get_page_count(sample_path, "application/pdf")

    self.assertIsNone(result)
    self.assertIn("Unable to determine PDF page count", captured.output[0])
def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(