Feature: page count (#7750)

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
s0llvan
2024-09-25 17:22:12 +02:00
committed by GitHub
parent 4adf20af1e
commit c92c3e224a
23 changed files with 319 additions and 45 deletions

View File

@@ -387,6 +387,8 @@ def delete_pages(doc_ids: list[int], pages: list[int]):
pdf.remove_unreferenced_resources()
pdf.save()
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
if doc.page_count is not None:
doc.page_count = doc.page_count - len(pages)
doc.save()
update_document_archive_file.delay(document_id=doc.id)
logger.info(f"Deleted pages {pages} from document {doc.id}")

View File

@@ -586,6 +586,7 @@ class ConsumerPlugin(
date = None
thumbnail = None
archive_path = None
page_count = None
try:
self._send_progress(
@@ -621,6 +622,7 @@ class ConsumerPlugin(
)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
page_count = document_parser.get_page_count(self.working_copy, mime_type)
except ParseError as e:
document_parser.cleanup()
@@ -662,7 +664,12 @@ class ConsumerPlugin(
try:
with transaction.atomic():
# store the document.
document = self._store(text=text, date=date, mime_type=mime_type)
document = self._store(
text=text,
date=date,
page_count=page_count,
mime_type=mime_type,
)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
@@ -790,6 +797,7 @@ class ConsumerPlugin(
self,
text: str,
date: Optional[datetime.datetime],
page_count: Optional[int],
mime_type: str,
) -> Document:
# If someone gave us the original filename, use it instead of doc.
@@ -835,6 +843,7 @@ class ConsumerPlugin(
created=create_date,
modified=create_date,
storage_type=storage_type,
page_count=page_count,
original_filename=self.filename,
)

View File

@@ -80,6 +80,7 @@ def get_schema():
has_owner=BOOLEAN(),
viewer_id=KEYWORD(commas=True),
checksum=TEXT(),
page_count=NUMERIC(sortable=True),
original_filename=TEXT(sortable=True),
is_shared=BOOLEAN(),
)
@@ -181,6 +182,7 @@ def update_document(writer: AsyncWriter, doc: Document):
has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum,
page_count=doc.page_count,
original_filename=doc.original_filename,
is_shared=len(viewer_ids) > 0,
)
@@ -247,6 +249,7 @@ class DelayedQuery:
"archive_serial_number": "asn",
"num_notes": "num_notes",
"owner": "owner",
"page_count": "page_count",
}
if field.startswith("-"):

View File

@@ -0,0 +1,62 @@
# Generated by Django 4.2.16 on 2024-09-21 15:44
from pathlib import Path
import pikepdf
from django.conf import settings
from django.db import migrations
from django.db import models
from django.utils.termcolors import colorize as colourise
def source_path(self):
    """
    Return the resolved path of a document's original file.

    The path is built from ``settings.ORIGINALS_DIR`` and the document's
    ``filename``.  Returns ``None`` when the document has no filename.
    """
    if not self.filename:
        return None
    original_name = str(self.filename)
    return Path(settings.ORIGINALS_DIR / original_name).resolve()
def add_number_of_pages_to_page_count(apps, schema_editor):
    """
    Backfill ``page_count`` for every existing PDF document.

    Each document with the ``application/pdf`` mime type is opened with
    pikepdf and its number of pages is stored on the document.  Failures are
    reported on stdout and do not abort the migration, so a single unreadable
    file cannot block the upgrade.

    Args:
        apps: The historical app registry supplied by Django.
        schema_editor: The schema editor supplied by Django (unused).
    """
    Document = apps.get_model("documents", "Document")

    # An empty table simply yields nothing here, so no separate exists()
    # query is needed before iterating.
    for doc in Document.objects.filter(mime_type="application/pdf"):
        print(
            "  {} {} {}".format(
                colourise("*", fg="green"),
                colourise("Calculating number of pages for", fg="white"),
                colourise(doc.filename, fg="cyan"),
            ),
        )

        pdf_path = source_path(doc)
        if pdf_path is None:
            # Without a stored filename there is no file to open; report it
            # explicitly instead of relying on pikepdf to fail on ``None``.
            print(
                f"Error retrieving number of pages for {doc.filename}: "
                "document has no filename",
            )
            continue

        try:
            with pikepdf.Pdf.open(pdf_path) as pdf:
                doc.page_count = len(pdf.pages)
            # Write only the backfilled column so no other field of the row
            # is rewritten by this data migration.
            doc.save(update_fields=["page_count"])
        except Exception as e:  # pragma: no cover
            # Deliberately best-effort: log and continue with the next file.
            print(f"Error retrieving number of pages for {doc.filename}: {e}")
class Migration(migrations.Migration):
    """
    Add the nullable ``page_count`` column to ``Document`` and fill it in.

    The data step runs ``add_number_of_pages_to_page_count``; its reverse is
    a no-op.
    """

    dependencies = [("documents", "1052_document_transaction_id")]

    operations = [
        # Schema step: the new column is nullable so existing rows stay valid.
        migrations.AddField(
            model_name="document",
            name="page_count",
            field=models.PositiveIntegerField(
                blank=False,
                null=True,
                unique=False,
                db_index=False,
            ),
        ),
        # Data step: compute the page count of the existing PDF documents.
        migrations.RunPython(
            code=add_number_of_pages_to_page_count,
            reverse_code=migrations.RunPython.noop,
        ),
    ]

View File

@@ -205,6 +205,18 @@ class Document(SoftDeleteModel, ModelWithOwner):
help_text=_("The checksum of the archived document."),
)
page_count = models.PositiveIntegerField(
_("page count"),
blank=False,
null=True,
unique=False,
db_index=False,
validators=[MinValueValidator(1)],
help_text=_(
"The number of pages of the document.",
),
)
created = models.DateTimeField(_("created"), default=timezone.now, db_index=True)
modified = models.DateTimeField(
@@ -414,6 +426,7 @@ class SavedView(ModelWithOwner):
OWNER = ("owner", _("Owner"))
SHARED = ("shared", _("Shared"))
ASN = ("asn", _("ASN"))
PAGE_COUNT = ("pagecount", _("Pages"))
CUSTOM_FIELD = ("custom_field_%d", ("Custom Field"))
name = models.CharField(_("name"), max_length=128)

View File

@@ -367,6 +367,9 @@ class DocumentParser(LoggingMixin):
def extract_metadata(self, document_path, mime_type):
    """
    Return the metadata entries of the given document.

    This base implementation reports no metadata and returns an empty list.
    """
    no_metadata = []
    return no_metadata
def get_page_count(self, document_path, mime_type):
    """
    Return the number of pages of the given document.

    This base implementation does not determine a page count and always
    returns ``None``.
    """
    unknown_page_count = None
    return unknown_page_count
def parse(self, document_path, mime_type, file_name=None):
    """
    Parse the given document.

    This base implementation has no parsing logic.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

View File

@@ -750,6 +750,7 @@ class DocumentSerializer(
original_file_name = SerializerMethodField()
archived_file_name = SerializerMethodField()
created_date = serializers.DateField(required=False)
page_count = SerializerMethodField()
custom_fields = CustomFieldInstanceSerializer(
many=True,
@@ -770,6 +771,9 @@ class DocumentSerializer(
required=False,
)
def get_page_count(self, obj):
    """Return the ``page_count`` stored on the serialized object."""
    stored_page_count = obj.page_count
    return stored_page_count
def get_original_file_name(self, obj):
    """Return the ``original_filename`` stored on the serialized object."""
    stored_name = obj.original_filename
    return stored_name
@@ -885,6 +889,7 @@ class DocumentSerializer(
"notes",
"custom_fields",
"remove_inbox_tags",
"page_count",
)
list_serializer_class = OwnedObjectListSerializer

View File

@@ -389,6 +389,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
title="B",
filename=sample2,
mime_type="application/pdf",
page_count=8,
)
self.doc2.archive_filename = sample2_archive
self.doc2.save()
@@ -681,14 +682,20 @@ class TestPDFActions(DirectoriesMixin, TestCase):
THEN:
- Save should be called once
- Archive file should be updated once
- The document's page_count should be reduced by the number of deleted pages
"""
doc_ids = [self.doc2.id]
initial_page_count = self.doc2.page_count
pages = [1, 3]
result = bulk_edit.delete_pages(doc_ids, pages)
mock_pdf_save.assert_called_once()
mock_update_archive_file.assert_called_once()
self.assertEqual(result, "OK")
expected_page_count = initial_page_count - len(pages)
self.doc2.refresh_from_db()
self.assertEqual(self.doc2.page_count, expected_page_count)
@mock.patch("documents.tasks.update_document_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):

View File

@@ -0,0 +1,59 @@
import os
import shutil
from pathlib import Path
from django.conf import settings
from documents.tests.utils import TestMigrations
def source_path_before(self):
    """
    Return the path of a document's original file as a string.

    The path joins ``settings.ORIGINALS_DIR`` with the document's
    ``filename``.  Returns ``None`` when the document has no filename.
    """
    if not self.filename:
        return None
    original_name = str(self.filename)
    return os.path.join(settings.ORIGINALS_DIR, original_name)
class TestMigrateDocumentPageCount(TestMigrations):
    """Forward migration: an existing PDF document receives its page count."""

    migrate_from = "1052_document_transaction_id"
    migrate_to = "1053_document_page_count"

    def setUpBeforeMigration(self, apps):
        """Create a PDF document and place a sample file at its source path."""
        Document = apps.get_model("documents", "Document")
        document = Document.objects.create(
            title="test1",
            mime_type="application/pdf",
            filename="file1.pdf",
        )
        self.doc_id = document.id

        # The migration opens the original file, so it has to exist on disk.
        sample_pdf = Path(__file__).parent / "samples" / "simple.pdf"
        shutil.copy(sample_pdf, source_path_before(document))

    def testDocumentPageCountMigrated(self):
        """The sample PDF is expected to be stored with a page count of 1."""
        Document = self.apps.get_model("documents", "Document")
        migrated = Document.objects.get(id=self.doc_id)
        self.assertEqual(migrated.page_count, 1)
class TestMigrateDocumentPageCountBackwards(TestMigrations):
    """Backward migration: the ``page_count`` field is removed again."""

    migrate_from = "1053_document_page_count"
    migrate_to = "1052_document_transaction_id"

    def setUpBeforeMigration(self, apps):
        """Create a PDF document that already carries a page count."""
        Document = apps.get_model("documents", "Document")
        document = Document.objects.create(
            title="test1",
            mime_type="application/pdf",
            filename="file1.pdf",
            page_count=8,
        )
        self.doc_id = document.id

    def test_remove_number_of_pages_to_page_count(self):
        """After migrating back, the model no longer has a ``page_count`` field."""
        Document = self.apps.get_model("documents", "Document")
        field_names = [field.name for field in Document._meta.get_fields()]
        self.assertFalse(
            "page_count" in field_names,
        )

View File

@@ -361,6 +361,7 @@ class DocumentViewSet(
"archive_serial_number",
"num_notes",
"owner",
"page_count",
)
def get_queryset(self):