mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
moved metadata extraction to the parsers
This commit is contained in:
parent
0cc22017de
commit
2f7bb01f34
@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin):
|
||||
def __init__(self, logging_group):
|
||||
super().__init__()
|
||||
self.logging_group = logging_group
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
self.tempdir = tempfile.mkdtemp(
|
||||
prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||
|
||||
@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin):
|
||||
self.text = None
|
||||
self.date = None
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
    """
    Return the metadata entries for a document.

    This base implementation ignores both arguments and reports no
    metadata; parser subclasses override it to supply real entries.
    """
    no_metadata = []
    return no_metadata
|
||||
|
||||
def parse(self, document_path, mime_type):
    """
    Placeholder for parsing a document.

    This implementation does no work and always raises
    NotImplementedError.
    """
    raise NotImplementedError
|
||||
|
||||
|
@ -1,11 +1,8 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
import pikepdf
|
||||
from django.conf import settings
|
||||
from django.db.models import Count, Max
|
||||
from django.http import HttpResponse, HttpResponseBadRequest, Http404
|
||||
@ -42,6 +39,7 @@ from .filters import (
|
||||
LogFilterSet
|
||||
)
|
||||
from .models import Correspondent, Document, Log, Tag, DocumentType
|
||||
from .parsers import get_parser_class_for_mime_type
|
||||
from .serialisers import (
|
||||
CorrespondentSerializer,
|
||||
DocumentSerializer,
|
||||
@ -163,34 +161,16 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
disposition, filename)
|
||||
return response
|
||||
|
||||
def get_metadata(self, file, type):
|
||||
def get_metadata(self, file, mime_type):
|
||||
if not os.path.isfile(file):
|
||||
return None
|
||||
|
||||
namespace_pattern = re.compile(r"\{(.*)\}(.*)")
|
||||
|
||||
result = []
|
||||
if type == 'application/pdf':
|
||||
pdf = pikepdf.open(file)
|
||||
meta = pdf.open_metadata()
|
||||
for key, value in meta.items():
|
||||
if isinstance(value, list):
|
||||
value = " ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = namespace_pattern.match(key)
|
||||
result.append({
|
||||
"namespace": m.group(1),
|
||||
"prefix": meta.REVERSE_NS[m.group(1)],
|
||||
"key": m.group(2),
|
||||
"value": value
|
||||
})
|
||||
except Exception as e:
|
||||
logging.getLogger(__name__).warning(
|
||||
f"Error while reading metadata {key}: {value}. Error: "
|
||||
f"{e}"
|
||||
)
|
||||
return result
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
if parser_class:
|
||||
parser = parser_class(logging_group=None)
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
else:
|
||||
return []
|
||||
|
||||
@action(methods=['get'], detail=True)
|
||||
def metadata(self, request, pk=None):
|
||||
|
@ -5,6 +5,7 @@ import subprocess
|
||||
|
||||
import ocrmypdf
|
||||
import pdftotext
|
||||
import pikepdf
|
||||
from PIL import Image
|
||||
from django.conf import settings
|
||||
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||
@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
    """
    Read the metadata of a PDF and return it as a list of dicts.

    Each entry has the keys "namespace", "prefix", "key" and "value".
    Metadata keys are expected in the form "{namespace}name"; list
    values are joined with spaces and every value is converted to str.

    For any mime type other than 'application/pdf' an empty list is
    returned and the file is not opened.

    Entries that cannot be decoded (key not in "{namespace}name" form,
    or namespace missing from meta.REVERSE_NS) are logged as a warning
    and skipped, so one bad entry does not discard the rest.
    """
    if mime_type != 'application/pdf':
        return []

    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    result = []

    # Open the PDF in a context manager so the underlying file handle
    # is released even if reading the metadata raises.
    with pikepdf.open(document_path) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                m = namespace_pattern.match(key)
                result.append({
                    "namespace": m.group(1),
                    "prefix": meta.REVERSE_NS[m.group(1)],
                    "key": m.group(2),
                    "value": value
                })
            except Exception as e:
                # Deliberate best-effort: a malformed entry is
                # reported and skipped rather than failing the call.
                self.log(
                    "warning",
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
    return result
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type):
|
||||
"""
|
||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||
|
Loading…
x
Reference in New Issue
Block a user