reading and displaying PDF metadata

This commit is contained in:
jonaswinkler 2020-12-08 15:28:09 +01:00
parent 9da11f29c7
commit ad527fe97c
7 changed files with 147 additions and 18 deletions

View File

@ -27,6 +27,7 @@ langdetect = "*"
pdftotext = "*"
pathvalidate = "*"
pillow = "*"
pikepdf = "*"
python-gnupg = "*"
python-dotenv = "*"
python-dateutil = "*"

4
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "3c187671ead11714d48b56f4714b145f68814e09edea818610b87f18b4f7f6fd"
"sha256": "3d576f289958226a7583e4c471c7f8c11bff6933bf093185f623cfb381a92412"
},
"pipfile-spec": 6,
"requires": {
@ -433,7 +433,7 @@
"sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
"sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
],
"markers": "python_version < '3.9'",
"index": "pypi",
"version": "==2.2.0"
},
"pillow": {

View File

@ -15,7 +15,7 @@
<span class="d-none d-lg-inline"> Download</span>
</a>
<div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version">
<div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.has_archive_version">
<button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button>
<div class="dropdown-menu" ngbDropdownMenu>
<a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a>
@ -72,6 +72,7 @@
<li [ngbNavItem]="3">
<a ngbNavLink>Metadata</a>
<ng-template ngbNavContent>
<table class="table table-borderless">
<tbody>
<tr>
@ -83,23 +84,76 @@
<td>{{document.added | date}}</td>
</tr>
<tr>
<td>MD5 Checksum</td>
<td>{{metadata?.paperless__checksum}}</td>
<td>Original MD5 Checksum</td>
<td>{{metadata?.original_checksum}}</td>
</tr>
<tr>
<td>Archive MD5 Checksum</td>
<td>{{metadata?.archived_checksum}}</td>
</tr>
<tr>
<td>Original mime type</td>
<td>{{metadata?.paperless__mime_type}}</td>
<td>{{metadata?.original_mime_type}}</td>
</tr>
<tr>
<td>Is archived?</td>
<td>{{metadata?.paperless__has_archive_version | yesno}}</td>
<td>{{metadata?.has_archive_version | yesno}}</td>
</tr>
<tr>
<td>Media filename</td>
<td>{{metadata?.paperless__filename}}</td>
<td>{{metadata?.media_filename}}</td>
</tr>
</tbody>
</table>
<h6 *ngIf="metadata?.original_metadata.length > 0">
<button type="button" class="btn btn-outline-secondary btn-sm mr-2"
(click)="expandOriginalMetadata = !expandOriginalMetadata" aria-controls="collapseExample">
<svg class="buttonicon" fill="currentColor" *ngIf="!expandOriginalMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-down" />
</svg>
<svg class="buttonicon" fill="currentColor" *ngIf="expandOriginalMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-up" />
</svg>
</button>
Original document metadata
</h6>
<div #collapse="ngbCollapse" [(ngbCollapse)]="!expandOriginalMetadata">
<table class="table table-borderless">
<tbody>
<tr *ngFor="let m of metadata?.original_metadata">
<td>{{m.prefix}}:{{m.key}}</td>
<td>{{m.value}}</td>
</tr>
</tbody>
</table>
</div>
<h6 *ngIf="metadata?.has_archive_version && metadata?.archive_metadata.length > 0">
<button type="button" class="btn btn-outline-secondary btn-sm mr-2"
(click)="expandArchivedMetadata = !expandArchivedMetadata" aria-controls="collapseExample">
<svg class="buttonicon" fill="currentColor" *ngIf="!expandArchivedMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-down" />
</svg>
<svg class="buttonicon" fill="currentColor" *ngIf="expandArchivedMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-up" />
</svg>
</button>
Archived document metadata
</h6>
<div #collapse="ngbCollapse" [(ngbCollapse)]="!expandArchivedMetadata">
<table class="table table-borderless">
<tbody>
<tr *ngFor="let m of metadata?.archive_metadata">
<td>{{m.prefix}}:{{m.key}}</td>
<td>{{m.value}}</td>
</tr>
</tbody>
</table>
</div>
</ng-template>
</li>
</ul>
@ -107,7 +161,8 @@
<div [ngbNavOutlet]="nav" class="mt-2"></div>
<button type="button" class="btn btn-outline-secondary" (click)="discard()">Discard</button>&nbsp;
<button type="button" class="btn btn-outline-primary" (click)="saveEditNext()" *ngIf="hasNext()">Save & edit next</button>&nbsp;
<button type="button" class="btn btn-outline-primary" (click)="saveEditNext()" *ngIf="hasNext()">Save & edit
next</button>&nbsp;
<button type="submit" class="btn btn-primary">Save</button>&nbsp;
</form>
</div>

View File

@ -24,6 +24,9 @@ import { DocumentTypeEditDialogComponent } from '../manage/document-type-list/do
})
export class DocumentDetailComponent implements OnInit {
public expandOriginalMetadata = false;
public expandArchivedMetadata = false;
documentId: number
document: PaperlessDocument
metadata: PaperlessDocumentMetadata

View File

@ -1,11 +1,13 @@
// Shape of the payload returned by the document "metadata" API endpoint.
// Every field is optional because the object is populated asynchronously.
export interface PaperlessDocumentMetadata {

  // NOTE(review): the paperless__* names below look like the pre-rename
  // spellings of the fields that follow them - confirm whether any caller
  // still needs them before removing.
  paperless__checksum?: string

  original_checksum?: string

  paperless__mime_type?: string

  archived_checksum?: string

  paperless__filename?: string

  original_mime_type?: string

  paperless__has_archive_version?: boolean

  media_filename?: string

  has_archive_version?: boolean

  // Metadata entries read from the original file. The document detail
  // template iterates over these and renders prefix, key and value.
  // The backend sends null when the file does not exist on disk.
  original_metadata?: { namespace: string, prefix: string, key: string, value: string }[] | null

  // Same entry shape as original_metadata, read from the archived PDF;
  // null when there is no archive file.
  archive_metadata?: { namespace: string, prefix: string, key: string, value: string }[] | null

}

View File

@ -1,4 +1,5 @@
import os
import shutil
import tempfile
from unittest import mock
@ -493,3 +494,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
async_task.assert_not_called()
def test_get_metadata(self):
    """Metadata endpoint for a PNG original that has a PDF archive version.

    The PNG original is expected to yield an empty metadata list, while
    the archived PDF is expected to yield at least one entry.
    """
    samples_dir = os.path.join(os.path.dirname(__file__), "samples")
    png_original = os.path.join(
        samples_dir, "documents", "thumbnails", "0000001.png")
    pdf_archive = os.path.join(samples_dir, "simple.pdf")

    document = Document.objects.create(
        title="test", filename="file.pdf", mime_type="image/png")
    # Put real files where the document expects its original and archive.
    shutil.copy(png_original, document.source_path)
    shutil.copy(pdf_archive, document.archive_path)

    result = self.client.get(f"/api/documents/{document.pk}/metadata/")
    self.assertEqual(result.status_code, 200)

    payload = result.data
    self.assertEqual(payload['original_mime_type'], "image/png")
    self.assertTrue(payload['has_archive_version'])
    self.assertEqual(len(payload['original_metadata']), 0)
    self.assertGreater(len(payload['archive_metadata']), 0)
def test_get_metadata_no_archive(self):
    """Metadata endpoint for a PDF original without an archive version.

    The original PDF is expected to yield at least one metadata entry and
    the archive metadata is expected to be None.
    """
    pdf_original = os.path.join(
        os.path.dirname(__file__), "samples", "simple.pdf")

    document = Document.objects.create(
        title="test", filename="file.pdf", mime_type="application/pdf")
    # Only the original is copied; no archive file is created.
    shutil.copy(pdf_original, document.source_path)

    result = self.client.get(f"/api/documents/{document.pk}/metadata/")
    self.assertEqual(result.status_code, 200)

    payload = result.data
    self.assertEqual(payload['original_mime_type'], "application/pdf")
    self.assertFalse(payload['has_archive_version'])
    self.assertGreater(len(payload['original_metadata']), 0)
    self.assertIsNone(payload['archive_metadata'])

View File

@ -1,8 +1,11 @@
import logging
import os
import re
import tempfile
from datetime import datetime
from time import mktime
import pikepdf
from django.conf import settings
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404
@ -160,16 +163,49 @@ class DocumentViewSet(RetrieveModelMixin,
disposition, filename)
return response
def get_metadata(self, file, type):
    """Read the metadata entries embedded in a document file.

    Args:
        file: Path of the file to inspect.
        type: Mime type of the file. Only ``'application/pdf'`` is
            inspected. (The name shadows the builtin, but it is part of
            the signature and is kept for callers.)

    Returns:
        ``None`` if ``file`` does not exist. Otherwise a list of dicts
        with the keys ``namespace``, ``prefix``, ``key`` and ``value``,
        one per metadata item that could be decoded. The list is empty
        for any mime type other than PDF.
    """
    if not os.path.isfile(file):
        return None

    result = []
    if type != 'application/pdf':
        return result

    # Metadata keys have the form "{namespace-uri}name".
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
    logger = logging.getLogger(__name__)

    # Open the PDF as a context manager so the underlying file handle is
    # released when we are done, instead of leaking one per request.
    with pikepdf.open(file) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                result.append({
                    "namespace": m.group(1),
                    "prefix": meta.REVERSE_NS[m.group(1)],
                    "key": m.group(2),
                    "value": value
                })
            except Exception as e:
                # Deliberately best-effort: a key that does not match the
                # pattern or has an unknown namespace is logged and
                # skipped, so one bad entry does not hide the rest.
                logger.warning(
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
    return result
# Detail-level GET action: responds with checksums, mime type, media
# filename, archive presence and embedded file metadata for the document
# whose primary key is ``pk``. Raises Http404 if no such document exists.
@action(methods=['get'], detail=True)
def metadata(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
return Response({
# NOTE(review): the four "paperless__*" entries have no trailing comma
# before the next key, so this listing appears to interleave removed and
# added diff lines - presumably these are the old key names that the
# entries further down replace. Confirm against the real file.
"paperless__checksum": doc.checksum,
"paperless__mime_type": doc.mime_type,
"paperless__filename": doc.filename,
"paperless__has_archive_version":
os.path.isfile(doc.archive_path)
"original_checksum": doc.checksum,
"archived_checksum": doc.archive_checksum,
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
# An archive version counts as present only if the file is on disk.
"has_archive_version": os.path.isfile(doc.archive_path),
# get_metadata returns None when the given path is not an existing file.
"original_metadata": self.get_metadata(
doc.source_path, doc.mime_type),
# The archive file is always read as a PDF.
"archive_metadata": self.get_metadata(
doc.archive_path, "application/pdf")
})
except Document.DoesNotExist:
raise Http404()