mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	moved metadata extraction to the parsers
This commit is contained in:
		@@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin):
 | 
			
		||||
    def __init__(self, logging_group):
 | 
			
		||||
        super().__init__()
 | 
			
		||||
        self.logging_group = logging_group
 | 
			
		||||
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
 | 
			
		||||
        self.tempdir = tempfile.mkdtemp(
 | 
			
		||||
            prefix="paperless-", dir=settings.SCRATCH_DIR)
 | 
			
		||||
 | 
			
		||||
@@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin):
 | 
			
		||||
        self.text = None
 | 
			
		||||
        self.date = None
 | 
			
		||||
 | 
			
		||||
    def extract_metadata(self, document_path, mime_type):
        """
        Return the metadata entries found in the given document.

        This base implementation does not inspect the document at all and
        always reports no metadata.

        :param document_path: path of the document (unused here).
        :param mime_type: MIME type of the document (unused here).
        :return: a new, empty list.
        """
        no_metadata = []
        return no_metadata
 | 
			
		||||
 | 
			
		||||
    def parse(self, document_path, mime_type):
        """
        Parse the given document.

        This implementation provides no parsing logic: calling it always
        raises.

        :param document_path: path of the document to parse.
        :param mime_type: MIME type of the document.
        :raises NotImplementedError: always.
        """
        raise NotImplementedError
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,11 +1,8 @@
 | 
			
		||||
import logging
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
import tempfile
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
from time import mktime
 | 
			
		||||
 | 
			
		||||
import pikepdf
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.db.models import Count, Max
 | 
			
		||||
from django.http import HttpResponse, HttpResponseBadRequest, Http404
 | 
			
		||||
@@ -42,6 +39,7 @@ from .filters import (
 | 
			
		||||
    LogFilterSet
 | 
			
		||||
)
 | 
			
		||||
from .models import Correspondent, Document, Log, Tag, DocumentType
 | 
			
		||||
from .parsers import get_parser_class_for_mime_type
 | 
			
		||||
from .serialisers import (
 | 
			
		||||
    CorrespondentSerializer,
 | 
			
		||||
    DocumentSerializer,
 | 
			
		||||
@@ -163,34 +161,16 @@ class DocumentViewSet(RetrieveModelMixin,
 | 
			
		||||
            disposition, filename)
 | 
			
		||||
        return response
 | 
			
		||||
 | 
			
		||||
    def get_metadata(self, file, mime_type):
        """
        Return the metadata of ``file`` by delegating to the parser that is
        registered for ``mime_type``.

        :param file: path of the document on disk.
        :param mime_type: MIME type used to look up the parser class.
        :return: ``None`` when ``file`` is not an existing regular file; the
            result of the parser's ``extract_metadata()`` when a parser class
            is found for ``mime_type``; an empty list otherwise.
        """
        if not os.path.isfile(file):
            return None

        parser_class = get_parser_class_for_mime_type(mime_type)
        if not parser_class:
            # No parser is available for this MIME type.
            return []

        # The parser is created without a logging group.
        # NOTE(review): the parser constructor creates a scratch directory
        # via tempfile.mkdtemp(); confirm that it gets removed again after
        # this call, otherwise every metadata request leaves one behind.
        parser = parser_class(logging_group=None)
        return parser.extract_metadata(file, mime_type)
 | 
			
		||||
 | 
			
		||||
    @action(methods=['get'], detail=True)
 | 
			
		||||
    def metadata(self, request, pk=None):
 | 
			
		||||
 
 | 
			
		||||
@@ -5,6 +5,7 @@ import subprocess
 | 
			
		||||
 | 
			
		||||
import ocrmypdf
 | 
			
		||||
import pdftotext
 | 
			
		||||
import pikepdf
 | 
			
		||||
from PIL import Image
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from ocrmypdf import InputFileError, EncryptedPdfError
 | 
			
		||||
@@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
 | 
			
		||||
    """
 | 
			
		||||
 | 
			
		||||
    def extract_metadata(self, document_path, mime_type):
        """
        Read the metadata of a PDF document, as exposed by pikepdf's
        ``open_metadata()``.

        :param document_path: path of the document to inspect.
        :param mime_type: MIME type of the document. Anything other than
            ``application/pdf`` yields an empty list without touching the
            file.
        :return: a list of dicts with the keys ``namespace``, ``prefix``,
            ``key`` and ``value``, one per metadata entry that could be
            decoded. Entries that fail to decode are logged as warnings and
            skipped.
        """
        if mime_type != 'application/pdf':
            return []

        # Metadata keys are expected in the form "{namespace}name".
        namespace_pattern = re.compile(r"\{(.*)\}(.*)")

        result = []
        # Open the PDF in a context manager so that the underlying file is
        # closed again, even if reading the metadata raises.
        with pikepdf.open(document_path) as pdf:
            meta = pdf.open_metadata()
            for key, value in meta.items():
                if isinstance(value, list):
                    value = " ".join([str(e) for e in value])
                value = str(value)
                try:
                    m = namespace_pattern.match(key)
                    result.append({
                        "namespace": m.group(1),
                        "prefix": meta.REVERSE_NS[m.group(1)],
                        "key": m.group(2),
                        "value": value
                    })
                except Exception as e:
                    # Deliberately best-effort: a key that does not match
                    # the pattern, or a namespace without a known prefix,
                    # only costs us that single entry.
                    self.log(
                        "warning",
                        f"Error while reading metadata {key}: {value}. Error: "
                        f"{e}"
                    )
        return result
 | 
			
		||||
 | 
			
		||||
    def get_thumbnail(self, document_path, mime_type):
 | 
			
		||||
        """
 | 
			
		||||
        The thumbnail of a PDF is just a 500px wide image of the first page.
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user