moved metadata extraction to the parsers

This commit is contained in:
jonaswinkler 2020-12-10 14:57:53 +01:00
parent 0cc22017de
commit 2f7bb01f34
3 changed files with 40 additions and 28 deletions

View File

@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin):
def __init__(self, logging_group):
super().__init__()
self.logging_group = logging_group
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp(
prefix="paperless-", dir=settings.SCRATCH_DIR)
@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin):
self.text = None
self.date = None
def extract_metadata(self, document_path, mime_type):
    """Return the metadata entries of a document.

    The base parser reports no metadata: it ignores both arguments and
    returns a fresh empty list. Parsers that can read embedded metadata
    override this method.
    """
    return list()
def parse(self, document_path, mime_type):
    """Parse the document at ``document_path``.

    The base implementation does nothing useful: it always raises
    NotImplementedError.
    """
    raise NotImplementedError

View File

@ -1,11 +1,8 @@
import logging
import os
import re
import tempfile
from datetime import datetime
from time import mktime
import pikepdf
from django.conf import settings
from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404
@ -42,6 +39,7 @@ from .filters import (
LogFilterSet
)
from .models import Correspondent, Document, Log, Tag, DocumentType
from .parsers import get_parser_class_for_mime_type
from .serialisers import (
CorrespondentSerializer,
DocumentSerializer,
@ -163,34 +161,16 @@ class DocumentViewSet(RetrieveModelMixin,
disposition, filename)
return response
def get_metadata(self, file, mime_type):
    """Collect the metadata of a stored file via the matching parser.

    Args:
        file: Path of the file to inspect.
        mime_type: MIME type used to select the parser class.

    Returns:
        None when ``file`` does not exist on disk, an empty list when no
        parser class is registered for ``mime_type``, and otherwise
        whatever the parser's ``extract_metadata`` returns.
    """
    if not os.path.isfile(file):
        return None

    parser_class = get_parser_class_for_mime_type(mime_type)
    if not parser_class:
        return []

    # NOTE(review): if the parser derives from DocumentParser, its
    # constructor creates a scratch directory; confirm that directory is
    # removed once the metadata has been read.
    parser = parser_class(logging_group=None)
    return parser.extract_metadata(file, mime_type)
@action(methods=['get'], detail=True)
def metadata(self, request, pk=None):

View File

@ -5,6 +5,7 @@ import subprocess
import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
def extract_metadata(self, document_path, mime_type):
    """Read the embedded metadata of a PDF as a flat list of entries.

    Args:
        document_path: Path of the document to inspect.
        mime_type: MIME type of the document; only 'application/pdf'
            is inspected.

    Returns:
        A list of dicts with the keys "namespace", "prefix", "key" and
        "value", one per metadata item whose key could be split into a
        namespace and a name. Empty for any non-PDF type.
    """
    # Nothing to read for non-PDF inputs; skip opening the file at all.
    if mime_type != 'application/pdf':
        return []

    # Metadata keys are expected in the form "{namespace}name".
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
    result = []

    # The context manager closes the PDF even if reading the metadata
    # raises; the previous code never closed the file handle.
    with pikepdf.open(document_path) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            # Multi-valued entries are flattened to one string.
            if isinstance(value, list):
                value = " ".join(str(e) for e in value)
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                result.append({
                    "namespace": m.group(1),
                    "prefix": meta.REVERSE_NS[m.group(1)],
                    "key": m.group(2),
                    "value": value
                })
            except Exception as e:
                # Best effort: an entry whose key does not match the
                # pattern, or whose namespace lookup fails, is logged
                # and skipped rather than aborting the whole read.
                self.log(
                    "warning",
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
    return result
def get_thumbnail(self, document_path, mime_type):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.