Feature: page count (#7750)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
s0llvan
2024-09-25 17:22:12 +02:00
committed by GitHub
parent 4adf20af1e
commit c92c3e224a
23 changed files with 319 additions and 45 deletions

View File

@@ -41,6 +41,15 @@ class RasterisedDocumentParser(DocumentParser):
"""
return OcrConfig()
def get_page_count(self, document_path, mime_type):
    """
    Return the number of pages in the document at document_path.

    Only PDF documents ("application/pdf") are inspected; for any other
    mime type None is returned without touching the file.
    """
    if mime_type != "application/pdf":
        return None

    # Imported here so pikepdf is only loaded when a PDF is actually inspected.
    import pikepdf

    with pikepdf.Pdf.open(document_path) as pdf_file:
        return len(pdf_file.pages)
def extract_metadata(self, document_path, mime_type):
result = []
if mime_type == "application/pdf":

View File

@@ -57,6 +57,30 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertContainsStrings(text.strip(), ["This is a test document."])
def test_get_page_count(self):
    """
    GIVEN:
        - A PDF file with a single page
        - A PDF file with multiple pages
    WHEN:
        - The page count of each file is requested
    THEN:
        - 1 is returned for the single-page file
        - 6 is returned for the multi-page file
    """
    rasterised_parser = RasterisedDocumentParser(uuid.uuid4())

    # (sample file name, expected page count), checked in this order.
    expected_counts = (
        ("simple-digital.pdf", 1),
        ("multi-page-mixed.pdf", 6),
    )

    for sample_name, expected in expected_counts:
        sample_path = os.path.join(self.SAMPLE_FILES, sample_name)
        self.assertEqual(
            rasterised_parser.get_page_count(sample_path, "application/pdf"),
            expected,
        )
def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(