diff --git a/Pipfile b/Pipfile
index 830604a8d..48759307c 100644
--- a/Pipfile
+++ b/Pipfile
@@ -27,6 +27,7 @@ langdetect = "*"
pdftotext = "*"
pathvalidate = "*"
pillow = "*"
+pikepdf = "*"
python-gnupg = "*"
python-dotenv = "*"
python-dateutil = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index 198351237..1cfccb8ff 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "3c187671ead11714d48b56f4714b145f68814e09edea818610b87f18b4f7f6fd"
+ "sha256": "3d576f289958226a7583e4c471c7f8c11bff6933bf093185f623cfb381a92412"
},
"pipfile-spec": 6,
"requires": {
@@ -433,7 +433,7 @@
"sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
"sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
],
- "markers": "python_version < '3.9'",
+ "index": "pypi",
"version": "==2.2.0"
},
"pillow": {
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html
index 42619845c..e905c35e6 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.html
+++ b/src-ui/src/app/components/document-detail/document-detail.component.html
@@ -15,7 +15,7 @@
Download
-
+
diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts
index cf16f01c5..329077693 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@@ -24,6 +24,9 @@ import { DocumentTypeEditDialogComponent } from '../manage/document-type-list/do
})
export class DocumentDetailComponent implements OnInit {
+ public expandOriginalMetadata = false;
+ public expandArchivedMetadata = false;
+
documentId: number
document: PaperlessDocument
metadata: PaperlessDocumentMetadata
diff --git a/src-ui/src/app/data/paperless-document-metadata.ts b/src-ui/src/app/data/paperless-document-metadata.ts
index 22b3f692a..12f0a78d8 100644
--- a/src-ui/src/app/data/paperless-document-metadata.ts
+++ b/src-ui/src/app/data/paperless-document-metadata.ts
@@ -1,11 +1,13 @@
// Shape of the response of the /api/documents/<id>/metadata/ endpoint.
export interface PaperlessDocumentMetadata {
- paperless__checksum?: string
+ original_checksum?: string
- paperless__mime_type?: string
+ archived_checksum?: string
- paperless__filename?: string
+ original_mime_type?: string
- paperless__has_archive_version?: boolean
+ media_filename?: string
+
+ has_archive_version?: boolean
+
+ // Metadata entries read from the original file. The backend sends an empty
+ // list for non-PDF files and null when the file does not exist on disk.
+ original_metadata?: { namespace: string, prefix: string, key: string, value: string }[] | null
+
+ // Metadata entries read from the archived PDF; null when there is no
+ // archive file on disk.
+ archive_metadata?: { namespace: string, prefix: string, key: string, value: string }[] | null
}
\ No newline at end of file
diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py
index 986094db6..c2f9c950c 100644
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -1,4 +1,5 @@
import os
+import shutil
import tempfile
from unittest import mock
@@ -493,3 +494,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
async_task.assert_not_called()
+
+    def test_get_metadata(self):
+        """
+        A PNG original with a PDF archive version: the endpoint reports the
+        archive, no metadata entries for the original and at least one
+        entry for the archived PDF.
+        """
+        samples_dir = os.path.join(os.path.dirname(__file__), "samples")
+        document = Document.objects.create(
+            title="test", filename="file.pdf", mime_type="image/png")
+
+        original_sample = os.path.join(
+            samples_dir, "documents", "thumbnails", "0000001.png")
+        archive_sample = os.path.join(samples_dir, "simple.pdf")
+        shutil.copy(original_sample, document.source_path)
+        shutil.copy(archive_sample, document.archive_path)
+
+        result = self.client.get(f"/api/documents/{document.pk}/metadata/")
+        self.assertEqual(result.status_code, 200)
+
+        payload = result.data
+
+        self.assertEqual(payload['original_mime_type'], "image/png")
+        self.assertTrue(payload['has_archive_version'])
+        self.assertEqual(len(payload['original_metadata']), 0)
+        self.assertGreater(len(payload['archive_metadata']), 0)
+
+    def test_get_metadata_no_archive(self):
+        """
+        A PDF original without an archive version: the endpoint reports no
+        archive, at least one metadata entry for the original and None for
+        the archive metadata.
+        """
+        samples_dir = os.path.join(os.path.dirname(__file__), "samples")
+        document = Document.objects.create(
+            title="test", filename="file.pdf", mime_type="application/pdf")
+
+        shutil.copy(
+            os.path.join(samples_dir, "simple.pdf"), document.source_path)
+
+        result = self.client.get(f"/api/documents/{document.pk}/metadata/")
+        self.assertEqual(result.status_code, 200)
+
+        payload = result.data
+
+        self.assertEqual(payload['original_mime_type'], "application/pdf")
+        self.assertFalse(payload['has_archive_version'])
+        self.assertGreater(len(payload['original_metadata']), 0)
+        self.assertIsNone(payload['archive_metadata'])
diff --git a/src/documents/views.py b/src/documents/views.py
index 7d587ed3f..e058b0f56 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1,8 +1,11 @@
+import logging
import os
+import re
import tempfile
from datetime import datetime
from time import mktime
+import pikepdf
from django.conf import settings
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404
@@ -160,16 +163,49 @@ class DocumentViewSet(RetrieveModelMixin,
disposition, filename)
return response
+    def get_metadata(self, file, type):
+        """
+        Read the metadata entries stored in a document file.
+
+        :param file: Path of the file to inspect.
+        :param type: Mime type of the file. Only ``application/pdf`` files
+            are inspected.
+        :return: ``None`` if ``file`` does not exist. Otherwise a list with
+            one dict per metadata entry, holding the keys ``namespace``,
+            ``prefix``, ``key`` and ``value``. The list is empty for
+            non-PDF files.
+
+        Entries whose key cannot be split into namespace and name, or whose
+        namespace has no known prefix, are logged and skipped. Errors raised
+        while opening the PDF itself are not caught here.
+        """
+        if not os.path.isfile(file):
+            return None
+
+        result = []
+        if type != 'application/pdf':
+            return result
+
+        # Metadata keys have the form "{namespace}name".
+        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+        # Open the PDF as a context manager so that the underlying file is
+        # closed again once the metadata has been read, even on errors.
+        with pikepdf.open(file) as pdf:
+            meta = pdf.open_metadata()
+            for key, value in meta.items():
+                if isinstance(value, list):
+                    value = " ".join([str(e) for e in value])
+                value = str(value)
+                try:
+                    m = namespace_pattern.match(key)
+                    result.append({
+                        "namespace": m.group(1),
+                        "prefix": meta.REVERSE_NS[m.group(1)],
+                        "key": m.group(2),
+                        "value": value
+                    })
+                except Exception as e:
+                    # Best effort: one unreadable entry must not prevent
+                    # the remaining entries from being returned.
+                    logging.getLogger(__name__).warning(
+                        f"Error while reading metadata {key}: {value}. Error: "
+                        f"{e}"
+                    )
+        return result
+
@action(methods=['get'], detail=True)
def metadata(self, request, pk=None):
+        """
+        Return checksums, mime type, file name and the metadata entries of
+        the original and archived file of the document with id ``pk``.
+
+        Raises Http404 if no document with that id exists.
+        """
try:
doc = Document.objects.get(pk=pk)
return Response({
- "paperless__checksum": doc.checksum,
- "paperless__mime_type": doc.mime_type,
- "paperless__filename": doc.filename,
- "paperless__has_archive_version":
- os.path.isfile(doc.archive_path)
+ "original_checksum": doc.checksum,
+ "archived_checksum": doc.archive_checksum,
+ "original_mime_type": doc.mime_type,
+ "media_filename": doc.filename,
+ "has_archive_version": os.path.isfile(doc.archive_path),
+ "original_metadata": self.get_metadata(
+ doc.source_path, doc.mime_type),
+                # The archive file is always read as a PDF, independent of
+                # the mime type of the original document.
+ "archive_metadata": self.get_metadata(
+ doc.archive_path, "application/pdf")
})
except Document.DoesNotExist:
raise Http404()