reading and displaying PDF metadata

This commit is contained in:
jonaswinkler 2020-12-08 15:28:09 +01:00
parent 9da11f29c7
commit ad527fe97c
7 changed files with 147 additions and 18 deletions

View File

@ -27,6 +27,7 @@ langdetect = "*"
pdftotext = "*" pdftotext = "*"
pathvalidate = "*" pathvalidate = "*"
pillow = "*" pillow = "*"
pikepdf = "*"
python-gnupg = "*" python-gnupg = "*"
python-dotenv = "*" python-dotenv = "*"
python-dateutil = "*" python-dateutil = "*"

4
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "3c187671ead11714d48b56f4714b145f68814e09edea818610b87f18b4f7f6fd" "sha256": "3d576f289958226a7583e4c471c7f8c11bff6933bf093185f623cfb381a92412"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@ -433,7 +433,7 @@
"sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52", "sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
"sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef" "sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
], ],
"markers": "python_version < '3.9'", "index": "pypi",
"version": "==2.2.0" "version": "==2.2.0"
}, },
"pillow": { "pillow": {

View File

@ -15,7 +15,7 @@
<span class="d-none d-lg-inline"> Download</span> <span class="d-none d-lg-inline"> Download</span>
</a> </a>
<div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version"> <div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.has_archive_version">
<button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button> <button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button>
<div class="dropdown-menu" ngbDropdownMenu> <div class="dropdown-menu" ngbDropdownMenu>
<a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a> <a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a>
@ -72,6 +72,7 @@
<li [ngbNavItem]="3"> <li [ngbNavItem]="3">
<a ngbNavLink>Metadata</a> <a ngbNavLink>Metadata</a>
<ng-template ngbNavContent> <ng-template ngbNavContent>
<table class="table table-borderless"> <table class="table table-borderless">
<tbody> <tbody>
<tr> <tr>
@ -83,23 +84,76 @@
<td>{{document.added | date}}</td> <td>{{document.added | date}}</td>
</tr> </tr>
<tr> <tr>
<td>MD5 Checksum</td> <td>Original MD5 Checksum</td>
<td>{{metadata?.paperless__checksum}}</td> <td>{{metadata?.original_checksum}}</td>
</tr>
<tr>
<td>Archive MD5 Checksum</td>
<td>{{metadata?.archived_checksum}}</td>
</tr> </tr>
<tr> <tr>
<td>Original mime type</td> <td>Original mime type</td>
<td>{{metadata?.paperless__mime_type}}</td> <td>{{metadata?.original_mime_type}}</td>
</tr> </tr>
<tr> <tr>
<td>Is archived?</td> <td>Is archived?</td>
<td>{{metadata?.paperless__has_archive_version | yesno}}</td> <td>{{metadata?.has_archive_version | yesno}}</td>
</tr> </tr>
<tr> <tr>
<td>Media filename</td> <td>Media filename</td>
<td>{{metadata?.paperless__filename}}</td> <td>{{metadata?.media_filename}}</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
<h6 *ngIf="metadata?.original_metadata.length > 0">
<button type="button" class="btn btn-outline-secondary btn-sm mr-2"
(click)="expandOriginalMetadata = !expandOriginalMetadata" aria-controls="collapseExample">
<svg class="buttonicon" fill="currentColor" *ngIf="!expandOriginalMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-down" />
</svg>
<svg class="buttonicon" fill="currentColor" *ngIf="expandOriginalMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-up" />
</svg>
</button>
Original document metadata
</h6>
<div #collapse="ngbCollapse" [(ngbCollapse)]="!expandOriginalMetadata">
<table class="table table-borderless">
<tbody>
<tr *ngFor="let m of metadata?.original_metadata">
<td>{{m.prefix}}:{{m.key}}</td>
<td>{{m.value}}</td>
</tr>
</tbody>
</table>
</div>
<h6 *ngIf="metadata?.has_archive_version && metadata?.archive_metadata.length > 0">
<button type="button" class="btn btn-outline-secondary btn-sm mr-2"
(click)="expandArchivedMetadata = !expandArchivedMetadata" aria-controls="collapseExample">
<svg class="buttonicon" fill="currentColor" *ngIf="!expandArchivedMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-down" />
</svg>
<svg class="buttonicon" fill="currentColor" *ngIf="expandArchivedMetadata">
<use xlink:href="assets/bootstrap-icons.svg#caret-up" />
</svg>
</button>
Archived document metadata
</h6>
<div #collapse="ngbCollapse" [(ngbCollapse)]="!expandArchivedMetadata">
<table class="table table-borderless">
<tbody>
<tr *ngFor="let m of metadata?.archive_metadata">
<td>{{m.prefix}}:{{m.key}}</td>
<td>{{m.value}}</td>
</tr>
</tbody>
</table>
</div>
</ng-template> </ng-template>
</li> </li>
</ul> </ul>
@ -107,7 +161,8 @@
<div [ngbNavOutlet]="nav" class="mt-2"></div> <div [ngbNavOutlet]="nav" class="mt-2"></div>
<button type="button" class="btn btn-outline-secondary" (click)="discard()">Discard</button>&nbsp; <button type="button" class="btn btn-outline-secondary" (click)="discard()">Discard</button>&nbsp;
<button type="button" class="btn btn-outline-primary" (click)="saveEditNext()" *ngIf="hasNext()">Save & edit next</button>&nbsp; <button type="button" class="btn btn-outline-primary" (click)="saveEditNext()" *ngIf="hasNext()">Save & edit
next</button>&nbsp;
<button type="submit" class="btn btn-primary">Save</button>&nbsp; <button type="submit" class="btn btn-primary">Save</button>&nbsp;
</form> </form>
</div> </div>

View File

@ -24,6 +24,9 @@ import { DocumentTypeEditDialogComponent } from '../manage/document-type-list/do
}) })
export class DocumentDetailComponent implements OnInit { export class DocumentDetailComponent implements OnInit {
public expandOriginalMetadata = false;
public expandArchivedMetadata = false;
documentId: number documentId: number
document: PaperlessDocument document: PaperlessDocument
metadata: PaperlessDocumentMetadata metadata: PaperlessDocumentMetadata

View File

@ -1,11 +1,13 @@
export interface PaperlessDocumentMetadata { export interface PaperlessDocumentMetadata {
paperless__checksum?: string original_checksum?: string
paperless__mime_type?: string archived_checksum?: string
paperless__filename?: string original_mime_type?: string
paperless__has_archive_version?: boolean media_filename?: string
has_archive_version?: boolean
} }

View File

@ -1,4 +1,5 @@
import os import os
import shutil
import tempfile import tempfile
from unittest import mock from unittest import mock
@ -493,3 +494,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400) self.assertEqual(response.status_code, 400)
async_task.assert_not_called() async_task.assert_not_called()
def test_get_metadata(self):
    """Metadata endpoint for a PNG original that has a PDF archive version.

    The original is an image, so no original metadata entries are
    expected, while the archived PDF must yield at least one entry.
    """
    samples_dir = os.path.join(os.path.dirname(__file__), "samples")
    original_sample = os.path.join(
        samples_dir, "documents", "thumbnails", "0000001.png")
    archive_sample = os.path.join(samples_dir, "simple.pdf")

    doc = Document.objects.create(
        title="test",
        filename="file.pdf",
        mime_type="image/png",
    )
    shutil.copy(original_sample, doc.source_path)
    shutil.copy(archive_sample, doc.archive_path)

    response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
    self.assertEqual(response.status_code, 200)

    payload = response.data
    self.assertEqual(payload['original_mime_type'], "image/png")
    self.assertTrue(payload['has_archive_version'])
    self.assertEqual(len(payload['original_metadata']), 0)
    self.assertGreater(len(payload['archive_metadata']), 0)
def test_get_metadata_no_archive(self):
    """Metadata endpoint for a PDF original without an archive version.

    The original PDF must yield at least one metadata entry, and the
    archive metadata must be reported as None because no archive file
    was created.
    """
    pdf_sample = os.path.join(
        os.path.dirname(__file__), "samples", "simple.pdf")

    doc = Document.objects.create(
        title="test",
        filename="file.pdf",
        mime_type="application/pdf",
    )
    shutil.copy(pdf_sample, doc.source_path)

    response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
    self.assertEqual(response.status_code, 200)

    payload = response.data
    self.assertEqual(payload['original_mime_type'], "application/pdf")
    self.assertFalse(payload['has_archive_version'])
    self.assertGreater(len(payload['original_metadata']), 0)
    self.assertIsNone(payload['archive_metadata'])

View File

@ -1,8 +1,11 @@
import logging
import os import os
import re
import tempfile import tempfile
from datetime import datetime from datetime import datetime
from time import mktime from time import mktime
import pikepdf
from django.conf import settings from django.conf import settings
from django.db.models import Count, Max from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404 from django.http import HttpResponse, HttpResponseBadRequest, Http404
@ -160,16 +163,49 @@ class DocumentViewSet(RetrieveModelMixin,
disposition, filename) disposition, filename)
return response return response
def get_metadata(self, file, type):
    """Read the metadata entries of a PDF file.

    Args:
        file: Path of the file to inspect.
        type: MIME type of the file. Only 'application/pdf' is parsed.

    Returns:
        None if ``file`` is not an existing regular file. Otherwise a list
        of dicts with the keys "namespace", "prefix", "key" and "value",
        one per metadata entry that could be decoded. The list is empty
        for any MIME type other than 'application/pdf'.
    """
    if not os.path.isfile(file):
        return None

    result = []
    if type != 'application/pdf':
        return result

    # Metadata keys look like "{namespace-uri}name".
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
    logger = logging.getLogger(__name__)

    # Open the PDF in a context manager so the underlying file handle is
    # released once the metadata has been read, even if reading fails.
    with pikepdf.open(file) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                result.append({
                    "namespace": m.group(1),
                    "prefix": meta.REVERSE_NS[m.group(1)],
                    "key": m.group(2),
                    "value": value
                })
            except Exception as e:
                # Best effort: an entry whose key does not match the
                # pattern or whose namespace has no known prefix is
                # logged and skipped instead of failing the request.
                logger.warning(
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
    return result
@action(methods=['get'], detail=True)
def metadata(self, request, pk=None):
    """Return checksums, file information and embedded metadata of a document.

    Responds with a mapping containing the original and archived
    checksums, the original MIME type, the media filename, whether an
    archive file exists on disk, and the metadata entries read from the
    original and archive files via ``get_metadata``.

    Raises:
        Http404: if no document with primary key ``pk`` exists.
    """
    # Keep the try body limited to the lookup that can actually raise
    # Document.DoesNotExist.
    try:
        doc = Document.objects.get(pk=pk)
    except Document.DoesNotExist:
        raise Http404()

    return Response({
        "original_checksum": doc.checksum,
        "archived_checksum": doc.archive_checksum,
        "original_mime_type": doc.mime_type,
        "media_filename": doc.filename,
        "has_archive_version": os.path.isfile(doc.archive_path),
        "original_metadata": self.get_metadata(
            doc.source_path, doc.mime_type),
        # The archive file is always treated as a PDF.
        "archive_metadata": self.get_metadata(
            doc.archive_path, "application/pdf")
    })