moved metadata extraction to the parsers

This commit is contained in:
jonaswinkler 2020-12-10 14:57:53 +01:00
parent 2f5614c04b
commit 0c6c4a62d8
3 changed files with 40 additions and 28 deletions

View File

@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin):
def __init__(self, logging_group): def __init__(self, logging_group):
super().__init__() super().__init__()
self.logging_group = logging_group self.logging_group = logging_group
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp( self.tempdir = tempfile.mkdtemp(
prefix="paperless-", dir=settings.SCRATCH_DIR) prefix="paperless-", dir=settings.SCRATCH_DIR)
@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin):
self.text = None self.text = None
self.date = None self.date = None
def extract_metadata(self, document_path, mime_type):
    """
    Return the metadata entries found in the given document.

    This default implementation ignores both ``document_path`` and
    ``mime_type`` and reports no metadata: it always returns a new, empty
    list.
    """
    return list()
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type):
raise NotImplementedError() raise NotImplementedError()

View File

@ -1,11 +1,8 @@
import logging
import os import os
import re
import tempfile import tempfile
from datetime import datetime from datetime import datetime
from time import mktime from time import mktime
import pikepdf
from django.conf import settings from django.conf import settings
from django.db.models import Count, Max from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404 from django.http import HttpResponse, HttpResponseBadRequest, Http404
@ -42,6 +39,7 @@ from .filters import (
LogFilterSet LogFilterSet
) )
from .models import Correspondent, Document, Log, Tag, DocumentType from .models import Correspondent, Document, Log, Tag, DocumentType
from .parsers import get_parser_class_for_mime_type
from .serialisers import ( from .serialisers import (
CorrespondentSerializer, CorrespondentSerializer,
DocumentSerializer, DocumentSerializer,
@ -163,34 +161,16 @@ class DocumentViewSet(RetrieveModelMixin,
disposition, filename) disposition, filename)
return response return response
def get_metadata(self, file, type): def get_metadata(self, file, mime_type):
if not os.path.isfile(file): if not os.path.isfile(file):
return None return None
namespace_pattern = re.compile(r"\{(.*)\}(.*)") parser_class = get_parser_class_for_mime_type(mime_type)
if parser_class:
result = [] parser = parser_class(logging_group=None)
if type == 'application/pdf': return parser.extract_metadata(file, mime_type)
pdf = pikepdf.open(file) else:
meta = pdf.open_metadata() return []
for key, value in meta.items():
if isinstance(value, list):
value = " ".join([str(e) for e in value])
value = str(value)
try:
m = namespace_pattern.match(key)
result.append({
"namespace": m.group(1),
"prefix": meta.REVERSE_NS[m.group(1)],
"key": m.group(2),
"value": value
})
except Exception as e:
logging.getLogger(__name__).warning(
f"Error while reading metadata {key}: {value}. Error: "
f"{e}"
)
return result
@action(methods=['get'], detail=True) @action(methods=['get'], detail=True)
def metadata(self, request, pk=None): def metadata(self, request, pk=None):

View File

@ -5,6 +5,7 @@ import subprocess
import ocrmypdf import ocrmypdf
import pdftotext import pdftotext
import pikepdf
from PIL import Image from PIL import Image
from django.conf import settings from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError from ocrmypdf import InputFileError, EncryptedPdfError
@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
""" """
def extract_metadata(self, document_path, mime_type):
    """
    Collect the metadata entries pikepdf reports for a PDF document.

    Only documents with the mime type ``application/pdf`` are inspected;
    for every other mime type an empty list is returned without touching
    the file.

    Metadata keys are expected to look like ``{namespace}name``. Each
    entry that can be resolved is returned as a dict with the keys
    ``namespace``, ``prefix`` (looked up in the metadata object's
    ``REVERSE_NS`` mapping), ``key`` and ``value``. List values are joined
    with single spaces, and every value is converted to ``str``.

    Entries that cannot be resolved (for example, the key does not match
    the expected pattern or the namespace has no known prefix) are skipped
    and reported with a warning via ``self.log``.

    :param document_path: path of the document to read.
    :param mime_type: mime type of the document.
    :return: list of metadata dicts, possibly empty.
    """
    if mime_type != 'application/pdf':
        return []

    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
    result = []

    # Open the PDF as a context manager so that the underlying file is
    # closed again, even when reading the metadata raises. The metadata is
    # consumed entirely inside the block, while the PDF is still open.
    with pikepdf.open(document_path) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                result.append({
                    "namespace": m.group(1),
                    "prefix": meta.REVERSE_NS[m.group(1)],
                    "key": m.group(2),
                    "value": value
                })
            except Exception as e:
                # Deliberately best-effort: one unreadable entry must not
                # prevent the remaining metadata from being returned.
                self.log(
                    "warning",
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
    return result
def get_thumbnail(self, document_path, mime_type): def get_thumbnail(self, document_path, mime_type):
""" """
The thumbnail of a PDF is just a 500px wide image of the first page. The thumbnail of a PDF is just a 500px wide image of the first page.