Feature: page count (#7750)

--------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2026-01-30 23:08:59 -06:00 · 2024-09-25 17:22:12 +02:00
parent 4adf20af1e
commit c92c3e224a
23 changed files with 319 additions and 45 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -41,6 +41,15 @@ class RasterisedDocumentParser(DocumentParser):
        """
        return OcrConfig()

+    def get_page_count(self, document_path, mime_type):
+        """
+        Return the number of pages in the document, if it can be determined.
+
+        Only PDF input ("application/pdf") is inspected. For any other MIME
+        type, or for a PDF that pikepdf cannot open (for example a
+        password-protected or damaged file), None is returned.
+        """
+        if mime_type != "application/pdf":
+            return None
+
+        # Imported lazily so that non-PDF documents never load pikepdf.
+        import pikepdf
+
+        try:
+            with pikepdf.Pdf.open(document_path) as pdf:
+                return len(pdf.pages)
+        except pikepdf.PdfError as e:
+            # The page count is auxiliary information and None already means
+            # "unknown", so report the unreadable PDF instead of raising.
+            import logging
+
+            logging.getLogger(__name__).warning(
+                "Unable to determine PDF page count for %s: %s",
+                document_path,
+                e,
+            )
+            return None
+
    def extract_metadata(self, document_path, mime_type):
        result = []
        if mime_type == "application/pdf":
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -57,6 +57,30 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        self.assertContainsStrings(text.strip(), ["This is a test document."])

+    def test_get_page_count(self):
+        """
+        GIVEN:
+            - A single-page PDF file
+            - A multi-page PDF file
+        WHEN:
+            - The page count of each file is requested
+        THEN:
+            - 1 is returned for the single-page file
+            - 6 is returned for the multi-page file
+        """
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        expected_counts = (
+            ("simple-digital.pdf", 1),
+            ("multi-page-mixed.pdf", 6),
+        )
+
+        for sample_name, expected in expected_counts:
+            sample_path = os.path.join(self.SAMPLE_FILES, sample_name)
+            self.assertEqual(
+                parser.get_page_count(sample_path, "application/pdf"),
+                expected,
+            )
+
    def test_thumbnail(self):
        parser = RasterisedDocumentParser(uuid.uuid4())
        thumb = parser.get_thumbnail(