mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
reading and displaying PDF metadata
This commit is contained in:
parent
9da11f29c7
commit
ad527fe97c
1
Pipfile
1
Pipfile
@ -27,6 +27,7 @@ langdetect = "*"
|
||||
pdftotext = "*"
|
||||
pathvalidate = "*"
|
||||
pillow = "*"
|
||||
pikepdf = "*"
|
||||
python-gnupg = "*"
|
||||
python-dotenv = "*"
|
||||
python-dateutil = "*"
|
||||
|
4
Pipfile.lock
generated
4
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "3c187671ead11714d48b56f4714b145f68814e09edea818610b87f18b4f7f6fd"
|
||||
"sha256": "3d576f289958226a7583e4c471c7f8c11bff6933bf093185f623cfb381a92412"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@ -433,7 +433,7 @@
|
||||
"sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
|
||||
"sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
|
||||
],
|
||||
"markers": "python_version < '3.9'",
|
||||
"index": "pypi",
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"pillow": {
|
||||
|
@ -15,7 +15,7 @@
|
||||
<span class="d-none d-lg-inline"> Download</span>
|
||||
</a>
|
||||
|
||||
<div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version">
|
||||
<div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.has_archive_version">
|
||||
<button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button>
|
||||
<div class="dropdown-menu" ngbDropdownMenu>
|
||||
<a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a>
|
||||
@ -72,6 +72,7 @@
|
||||
<li [ngbNavItem]="3">
|
||||
<a ngbNavLink>Metadata</a>
|
||||
<ng-template ngbNavContent>
|
||||
|
||||
<table class="table table-borderless">
|
||||
<tbody>
|
||||
<tr>
|
||||
@ -83,23 +84,76 @@
|
||||
<td>{{document.added | date}}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>MD5 Checksum</td>
|
||||
<td>{{metadata?.paperless__checksum}}</td>
|
||||
<td>Original MD5 Checksum</td>
|
||||
<td>{{metadata?.original_checksum}}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Archive MD5 Checksum</td>
|
||||
<td>{{metadata?.archived_checksum}}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Original mime type</td>
|
||||
<td>{{metadata?.paperless__mime_type}}</td>
|
||||
<td>{{metadata?.original_mime_type}}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Is archived?</td>
|
||||
<td>{{metadata?.paperless__has_archive_version | yesno}}</td>
|
||||
<td>{{metadata?.has_archive_version | yesno}}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Media filename</td>
|
||||
<td>{{metadata?.paperless__filename}}</td>
|
||||
<td>{{metadata?.media_filename}}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h6 *ngIf="metadata?.original_metadata.length > 0">
|
||||
<button type="button" class="btn btn-outline-secondary btn-sm mr-2"
|
||||
(click)="expandOriginalMetadata = !expandOriginalMetadata" aria-controls="collapseExample">
|
||||
<svg class="buttonicon" fill="currentColor" *ngIf="!expandOriginalMetadata">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#caret-down" />
|
||||
</svg>
|
||||
<svg class="buttonicon" fill="currentColor" *ngIf="expandOriginalMetadata">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#caret-up" />
|
||||
</svg>
|
||||
</button>
|
||||
Original document metadata
|
||||
</h6>
|
||||
|
||||
<div #collapse="ngbCollapse" [(ngbCollapse)]="!expandOriginalMetadata">
|
||||
<table class="table table-borderless">
|
||||
<tbody>
|
||||
<tr *ngFor="let m of metadata?.original_metadata">
|
||||
<td>{{m.prefix}}:{{m.key}}</td>
|
||||
<td>{{m.value}}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<h6 *ngIf="metadata?.has_archive_version && metadata?.archive_metadata.length > 0">
|
||||
<button type="button" class="btn btn-outline-secondary btn-sm mr-2"
|
||||
(click)="expandArchivedMetadata = !expandArchivedMetadata" aria-controls="collapseExample">
|
||||
<svg class="buttonicon" fill="currentColor" *ngIf="!expandArchivedMetadata">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#caret-down" />
|
||||
</svg>
|
||||
<svg class="buttonicon" fill="currentColor" *ngIf="expandArchivedMetadata">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#caret-up" />
|
||||
</svg>
|
||||
</button>
|
||||
Archived document metadata
|
||||
</h6>
|
||||
|
||||
<div #collapse="ngbCollapse" [(ngbCollapse)]="!expandArchivedMetadata">
|
||||
<table class="table table-borderless">
|
||||
<tbody>
|
||||
<tr *ngFor="let m of metadata?.archive_metadata">
|
||||
<td>{{m.prefix}}:{{m.key}}</td>
|
||||
<td>{{m.value}}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
</ng-template>
|
||||
</li>
|
||||
</ul>
|
||||
@ -107,7 +161,8 @@
|
||||
<div [ngbNavOutlet]="nav" class="mt-2"></div>
|
||||
|
||||
<button type="button" class="btn btn-outline-secondary" (click)="discard()">Discard</button>
|
||||
<button type="button" class="btn btn-outline-primary" (click)="saveEditNext()" *ngIf="hasNext()">Save & edit next</button>
|
||||
<button type="button" class="btn btn-outline-primary" (click)="saveEditNext()" *ngIf="hasNext()">Save & edit
|
||||
next</button>
|
||||
<button type="submit" class="btn btn-primary">Save</button>
|
||||
</form>
|
||||
</div>
|
||||
|
@ -24,6 +24,9 @@ import { DocumentTypeEditDialogComponent } from '../manage/document-type-list/do
|
||||
})
|
||||
export class DocumentDetailComponent implements OnInit {
|
||||
|
||||
public expandOriginalMetadata = false;
|
||||
public expandArchivedMetadata = false;
|
||||
|
||||
documentId: number
|
||||
document: PaperlessDocument
|
||||
metadata: PaperlessDocumentMetadata
|
||||
|
@ -1,11 +1,13 @@
|
||||
export interface PaperlessDocumentMetadata {
|
||||
|
||||
paperless__checksum?: string
|
||||
original_checksum?: string
|
||||
|
||||
paperless__mime_type?: string
|
||||
archived_checksum?: string
|
||||
|
||||
paperless__filename?: string
|
||||
original_mime_type?: string
|
||||
|
||||
paperless__has_archive_version?: boolean
|
||||
media_filename?: string
|
||||
|
||||
has_archive_version?: boolean
|
||||
|
||||
}
|
@ -1,4 +1,5 @@
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from unittest import mock
|
||||
|
||||
@ -493,3 +494,34 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 400)
|
||||
|
||||
async_task.assert_not_called()
|
||||
|
||||
def test_get_metadata(self):
    """Metadata endpoint for a PNG original that has a PDF archive copy.

    The original is an image, so its metadata list is expected to be
    empty, while the archived PDF is expected to yield at least one entry.
    """
    samples_dir = os.path.join(os.path.dirname(__file__), "samples")
    original_sample = os.path.join(
        samples_dir, "documents", "thumbnails", "0000001.png")
    archive_sample = os.path.join(samples_dir, "simple.pdf")

    document = Document.objects.create(
        title="test",
        filename="file.pdf",
        mime_type="image/png",
    )

    # Put real files where the view looks for the original and the archive.
    shutil.copy(original_sample, document.source_path)
    shutil.copy(archive_sample, document.archive_path)

    response = self.client.get(f"/api/documents/{document.pk}/metadata/")
    self.assertEqual(response.status_code, 200)

    payload = response.data

    self.assertEqual(payload['original_mime_type'], "image/png")
    self.assertTrue(payload['has_archive_version'])
    self.assertEqual(len(payload['original_metadata']), 0)
    self.assertGreater(len(payload['archive_metadata']), 0)
|
||||
|
||||
def test_get_metadata_no_archive(self):
    """Metadata endpoint for a PDF original without an archive copy.

    Only the original file is placed on disk, so the response is expected
    to report no archive version and ``None`` for the archive metadata.
    """
    pdf_sample = os.path.join(
        os.path.dirname(__file__), "samples", "simple.pdf")

    document = Document.objects.create(
        title="test",
        filename="file.pdf",
        mime_type="application/pdf",
    )

    # Only the original exists; no file is copied to the archive path.
    shutil.copy(pdf_sample, document.source_path)

    response = self.client.get(f"/api/documents/{document.pk}/metadata/")
    self.assertEqual(response.status_code, 200)

    payload = response.data

    self.assertEqual(payload['original_mime_type'], "application/pdf")
    self.assertFalse(payload['has_archive_version'])
    self.assertGreater(len(payload['original_metadata']), 0)
    self.assertIsNone(payload['archive_metadata'])
|
||||
|
@ -1,8 +1,11 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
import pikepdf
|
||||
from django.conf import settings
|
||||
from django.db.models import Count, Max
|
||||
from django.http import HttpResponse, HttpResponseBadRequest, Http404
|
||||
@ -160,16 +163,49 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
disposition, filename)
|
||||
return response
|
||||
|
||||
def get_metadata(self, file, type):
    """Collect the XMP metadata entries of a document file.

    Args:
        file: Path of the file to inspect.
        type: Mime type of ``file``. Only ``'application/pdf'`` is read;
            any other type produces an empty list. (The name shadows the
            builtin but is kept so keyword callers keep working.)

    Returns:
        ``None`` if ``file`` does not exist on disk. Otherwise a list of
        dicts with the keys ``namespace``, ``prefix``, ``key`` and
        ``value``, one per metadata entry that could be parsed.
    """
    if not os.path.isfile(file):
        return None

    result = []
    if type != 'application/pdf':
        return result

    # Metadata keys are expected in the form "{namespace-uri}local-name".
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
    logger = logging.getLogger(__name__)

    # Open the PDF in a context manager so the underlying file handle is
    # released even when reading the metadata raises.
    with pikepdf.open(file) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                result.append({
                    "namespace": m.group(1),
                    "prefix": meta.REVERSE_NS[m.group(1)],
                    "key": m.group(2),
                    "value": value
                })
            except Exception as e:
                # Best effort: a key that does not match the pattern or
                # has an unknown namespace is logged and skipped so one
                # bad entry does not hide the remaining metadata.
                logger.warning(
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
    return result
|
||||
|
||||
@action(methods=['get'], detail=True)
|
||||
def metadata(self, request, pk=None):
|
||||
try:
|
||||
doc = Document.objects.get(pk=pk)
|
||||
return Response({
|
||||
"paperless__checksum": doc.checksum,
|
||||
"paperless__mime_type": doc.mime_type,
|
||||
"paperless__filename": doc.filename,
|
||||
"paperless__has_archive_version":
|
||||
os.path.isfile(doc.archive_path)
|
||||
"original_checksum": doc.checksum,
|
||||
"archived_checksum": doc.archive_checksum,
|
||||
"original_mime_type": doc.mime_type,
|
||||
"media_filename": doc.filename,
|
||||
"has_archive_version": os.path.isfile(doc.archive_path),
|
||||
"original_metadata": self.get_metadata(
|
||||
doc.source_path, doc.mime_type),
|
||||
"archive_metadata": self.get_metadata(
|
||||
doc.archive_path, "application/pdf")
|
||||
})
|
||||
except Document.DoesNotExist:
|
||||
raise Http404()
|
||||
|
Loading…
x
Reference in New Issue
Block a user