diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 36ede3cce..228e2c86e 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin):
     def __init__(self, logging_group):
         super().__init__()
         self.logging_group = logging_group
+        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
         self.tempdir = tempfile.mkdtemp(
             prefix="paperless-", dir=settings.SCRATCH_DIR)
 
@@ -217,6 +218,15 @@ class DocumentParser(LoggingMixin):
         self.text = None
         self.date = None
 
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Return the metadata embedded in the document as a list of dicts.
+
+        The base implementation reports no metadata; parsers that can read
+        metadata override this method.
+        """
+        return []
+
     def parse(self, document_path, mime_type):
         raise NotImplementedError()
 
diff --git a/src/documents/views.py b/src/documents/views.py
index 8dbb61dc7..b42ae1f96 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -1,11 +1,9 @@
-import logging
 import os
-import re
+import shutil
 import tempfile
 from datetime import datetime
 from time import mktime
 
-import pikepdf
 from django.conf import settings
 from django.db.models import Count, Max
 from django.http import HttpResponse, HttpResponseBadRequest, Http404
@@ -42,6 +40,7 @@ from .filters import (
     LogFilterSet
 )
 from .models import Correspondent, Document, Log, Tag, DocumentType
+from .parsers import get_parser_class_for_mime_type
 from .serialisers import (
     CorrespondentSerializer,
     DocumentSerializer,
@@ -163,34 +162,27 @@ class DocumentViewSet(RetrieveModelMixin,
             disposition, filename)
         return response
 
-    def get_metadata(self, file, type):
+    def get_metadata(self, file, mime_type):
+        """
+        Return the metadata of ``file`` as reported by the parser registered
+        for ``mime_type``: None if the file does not exist, an empty list if
+        no parser handles the mime type.
+        """
         if not os.path.isfile(file):
             return None
 
-        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
-
-        result = []
-        if type == 'application/pdf':
-            pdf = pikepdf.open(file)
-            meta = pdf.open_metadata()
-            for key, value in meta.items():
-                if isinstance(value, list):
-                    value = " ".join([str(e) for e in value])
-                value = str(value)
-                try:
-                    m = namespace_pattern.match(key)
-                    result.append({
-                        "namespace": m.group(1),
-                        "prefix": meta.REVERSE_NS[m.group(1)],
-                        "key": m.group(2),
-                        "value": value
-                    })
-                except Exception as e:
-                    logging.getLogger(__name__).warning(
-                        f"Error while reading metadata {key}: {value}. Error: "
-                        f"{e}"
-                    )
-        return result
+        parser_class = get_parser_class_for_mime_type(mime_type)
+        if not parser_class:
+            return []
+
+        parser = parser_class(logging_group=None)
+        try:
+            return parser.extract_metadata(file, mime_type)
+        finally:
+            # Every parser instance creates its own scratch directory in
+            # __init__; remove it so metadata requests do not pile up
+            # leftover directories in SCRATCH_DIR.
+            shutil.rmtree(parser.tempdir, ignore_errors=True)
 
     @action(methods=['get'], detail=True)
     def metadata(self, request, pk=None):
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index ebd706cdd..1cf6a769c 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -5,6 +5,7 @@ import subprocess
 
 import ocrmypdf
 import pdftotext
+import pikepdf
 from PIL import Image
 from django.conf import settings
 from ocrmypdf import InputFileError, EncryptedPdfError
@@ -18,6 +19,45 @@ class RasterisedDocumentParser(DocumentParser):
     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
     """
 
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Read the XMP metadata of a PDF document.
+
+        Returns a list of dicts with the keys "namespace", "prefix", "key"
+        and "value", one per metadata entry. Entries that cannot be read
+        are logged as warnings and skipped. Any mime type other than
+        application/pdf yields an empty list.
+        """
+        result = []
+        if mime_type != 'application/pdf':
+            return result
+
+        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+        # Open the PDF in a with block so the file handle is released once
+        # the metadata has been read, even if reading fails.
+        with pikepdf.open(document_path) as pdf:
+            meta = pdf.open_metadata()
+            for key, value in meta.items():
+                if isinstance(value, list):
+                    value = " ".join([str(e) for e in value])
+                value = str(value)
+                try:
+                    m = namespace_pattern.match(key)
+                    result.append({
+                        "namespace": m.group(1),
+                        "prefix": meta.REVERSE_NS[m.group(1)],
+                        "key": m.group(2),
+                        "value": value
+                    })
+                except Exception as e:
+                    self.log(
+                        "warning",
+                        f"Error while reading metadata {key}: {value}. Error: "
+                        f"{e}"
+                    )
+        return result
+
     def get_thumbnail(self, document_path, mime_type):
         """
         The thumbnail of a PDF is just a 500px wide image of the first page.