From ad527fe97ca975d646994af9135cc673e0f6aced Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Tue, 8 Dec 2020 15:28:09 +0100 Subject: [PATCH] reading and displaying PDF metadata --- Pipfile | 1 + Pipfile.lock | 4 +- .../document-detail.component.html | 69 +++++++++++++++++-- .../document-detail.component.ts | 3 + .../app/data/paperless-document-metadata.ts | 10 +-- src/documents/tests/test_api.py | 32 +++++++++ src/documents/views.py | 46 +++++++++++-- 7 files changed, 147 insertions(+), 18 deletions(-) diff --git a/Pipfile b/Pipfile index 830604a8d..48759307c 100644 --- a/Pipfile +++ b/Pipfile @@ -27,6 +27,7 @@ langdetect = "*" pdftotext = "*" pathvalidate = "*" pillow = "*" +pikepdf = "*" python-gnupg = "*" python-dotenv = "*" python-dateutil = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 198351237..1cfccb8ff 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "3c187671ead11714d48b56f4714b145f68814e09edea818610b87f18b4f7f6fd" + "sha256": "3d576f289958226a7583e4c471c7f8c11bff6933bf093185f623cfb381a92412" }, "pipfile-spec": 6, "requires": { @@ -433,7 +433,7 @@ "sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52", "sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef" ], - "markers": "python_version < '3.9'", + "index": "pypi", "version": "==2.2.0" }, "pillow": { diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index 42619845c..e905c35e6 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -15,7 +15,7 @@ Download -
+
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts
index cf16f01c5..329077693 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@@ -24,6 +24,9 @@ import { DocumentTypeEditDialogComponent } from '../manage/document-type-list/do
 })
 export class DocumentDetailComponent implements OnInit {
 
+  public expandOriginalMetadata = false;
+  public expandArchivedMetadata = false;
+
   documentId: number
   document: PaperlessDocument
   metadata: PaperlessDocumentMetadata
diff --git a/src-ui/src/app/data/paperless-document-metadata.ts b/src-ui/src/app/data/paperless-document-metadata.ts
index 22b3f692a..12f0a78d8 100644
--- a/src-ui/src/app/data/paperless-document-metadata.ts
+++ b/src-ui/src/app/data/paperless-document-metadata.ts
@@ -1,11 +1,13 @@
 export interface PaperlessDocumentMetadata {
 
-  paperless__checksum?: string
+  original_checksum?: string
 
-  paperless__mime_type?: string
+  archived_checksum?: string
 
-  paperless__filename?: string
+  original_mime_type?: string
 
-  paperless__has_archive_version?: boolean
+  media_filename?: string
+
+  has_archive_version?: boolean
 
 }
\ No newline at end of file
diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py
index 986094db6..c2f9c950c 100644
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 import tempfile
 from unittest import mock
 
@@ -493,3 +494,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         self.assertEqual(response.status_code, 400)
 
         async_task.assert_not_called()
+
+    def test_get_metadata(self):
+        doc = Document.objects.create(title="test", filename="file.pdf", mime_type="image/png")
+
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), doc.source_path)
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.archive_path)
+
+        response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
+        self.assertEqual(response.status_code, 200)
+
+        meta = response.data
+
+        self.assertEqual(meta['original_mime_type'], "image/png")
+        self.assertTrue(meta['has_archive_version'])
+        self.assertEqual(len(meta['original_metadata']), 0)
+        self.assertGreater(len(meta['archive_metadata']), 0)
+
+    def test_get_metadata_no_archive(self):
+        doc = Document.objects.create(title="test", filename="file.pdf", mime_type="application/pdf")
+
+        shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), doc.source_path)
+
+        response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
+        self.assertEqual(response.status_code, 200)
+
+        meta = response.data
+
+        self.assertEqual(meta['original_mime_type'], "application/pdf")
+        self.assertFalse(meta['has_archive_version'])
+        self.assertGreater(len(meta['original_metadata']), 0)
+        self.assertIsNone(meta['archive_metadata'])
diff --git a/src/documents/views.py b/src/documents/views.py
index 7d587ed3f..e058b0f56 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1,8 +1,11 @@
+import logging
 import os
+import re
 import tempfile
 from datetime import datetime
 from time import mktime
 
+import pikepdf
 from django.conf import settings
 from django.db.models import Count, Max
 from django.http import HttpResponse, HttpResponseBadRequest, Http404
@@ -160,16 +163,77 @@ class DocumentViewSet(RetrieveModelMixin,
             disposition, filename)
         return response
 
+    def get_metadata(self, file, type):
+        """
+        Read the XMP metadata of a document file.
+
+        :param file: path of the file to inspect.
+        :param type: mime type of the file; only "application/pdf"
+            files are actually parsed.
+        :return: None if the file does not exist, otherwise a list
+            with one dict per entry ("namespace", "prefix", "key",
+            "value"). The list is empty for non-PDF files.
+        """
+        if not os.path.isfile(file):
+            return None
+
+        result = []
+        if type != 'application/pdf':
+            return result
+
+        logger = logging.getLogger(__name__)
+        # XMP keys have the form "{namespace-uri}name".
+        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+        # Open the PDF in a context manager so the file handle is
+        # released again instead of leaking one per request.
+        with pikepdf.open(file) as pdf:
+            meta = pdf.open_metadata()
+            for key, value in meta.items():
+                if isinstance(value, list):
+                    value = " ".join([str(e) for e in value])
+                value = str(value)
+
+                m = namespace_pattern.match(key)
+                if m is None:
+                    # Previously surfaced as an opaque AttributeError.
+                    logger.warning(
+                        f"Error while reading metadata {key}: {value}. "
+                        "Error: key is not namespace-qualified"
+                    )
+                    continue
+
+                try:
+                    prefix = meta.REVERSE_NS[m.group(1)]
+                except KeyError as e:
+                    logger.warning(
+                        f"Error while reading metadata {key}: {value}. "
+                        f"Error: unknown namespace {e}"
+                    )
+                    continue
+
+                result.append({
+                    "namespace": m.group(1),
+                    "prefix": prefix,
+                    "key": m.group(2),
+                    "value": value
+                })
+        return result
+
     @action(methods=['get'], detail=True)
     def metadata(self, request, pk=None):
         try:
             doc = Document.objects.get(pk=pk)
             return Response({
-                "paperless__checksum": doc.checksum,
-                "paperless__mime_type": doc.mime_type,
-                "paperless__filename": doc.filename,
-                "paperless__has_archive_version":
-                    os.path.isfile(doc.archive_path)
+                "original_checksum": doc.checksum,
+                "archived_checksum": doc.archive_checksum,
+                "original_mime_type": doc.mime_type,
+                "media_filename": doc.filename,
+                "has_archive_version": os.path.isfile(doc.archive_path),
+                "original_metadata": self.get_metadata(
+                    doc.source_path, doc.mime_type),
+                "archive_metadata": self.get_metadata(
+                    doc.archive_path, "application/pdf")
             })
         except Document.DoesNotExist:
             raise Http404()