mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-01 11:19:32 -05:00
moved metadata extraction to the parsers
This commit is contained in:
parent
2f5614c04b
commit
0c6c4a62d8
@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin):
|
|||||||
def __init__(self, logging_group):
|
def __init__(self, logging_group):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.logging_group = logging_group
|
self.logging_group = logging_group
|
||||||
|
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||||
self.tempdir = tempfile.mkdtemp(
|
self.tempdir = tempfile.mkdtemp(
|
||||||
prefix="paperless-", dir=settings.SCRATCH_DIR)
|
prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||||
|
|
||||||
@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin):
|
|||||||
self.text = None
|
self.text = None
|
||||||
self.date = None
|
self.date = None
|
||||||
|
|
||||||
|
def extract_metadata(self, document_path, mime_type):
    """
    Return the metadata entries found in the given document.

    This default implementation ignores both arguments and reports no
    metadata at all; parsers that can read metadata from their file
    types override it.

    :param document_path: path of the document to inspect (unused here).
    :param mime_type: mime type of the document (unused here).
    :return: an empty list.
    """
    no_metadata = []
    return no_metadata
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
    """
    Parse the given document.

    The base class provides no implementation and always raises;
    concrete parsers are expected to override this method.

    :param document_path: path of the document to parse.
    :param mime_type: mime type of the document.
    :raises NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError()
|
||||||
|
|
||||||
|
@ -1,11 +1,8 @@
|
|||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import tempfile
|
import tempfile
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from time import mktime
|
from time import mktime
|
||||||
|
|
||||||
import pikepdf
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.db.models import Count, Max
|
from django.db.models import Count, Max
|
||||||
from django.http import HttpResponse, HttpResponseBadRequest, Http404
|
from django.http import HttpResponse, HttpResponseBadRequest, Http404
|
||||||
@ -42,6 +39,7 @@ from .filters import (
|
|||||||
LogFilterSet
|
LogFilterSet
|
||||||
)
|
)
|
||||||
from .models import Correspondent, Document, Log, Tag, DocumentType
|
from .models import Correspondent, Document, Log, Tag, DocumentType
|
||||||
|
from .parsers import get_parser_class_for_mime_type
|
||||||
from .serialisers import (
|
from .serialisers import (
|
||||||
CorrespondentSerializer,
|
CorrespondentSerializer,
|
||||||
DocumentSerializer,
|
DocumentSerializer,
|
||||||
@ -163,34 +161,16 @@ class DocumentViewSet(RetrieveModelMixin,
|
|||||||
disposition, filename)
|
disposition, filename)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def get_metadata(self, file, mime_type):
    """
    Collect the metadata of a file by delegating to its parser.

    :param file: path of the file to inspect.
    :param mime_type: mime type used to select the parser class.
    :return: ``None`` if ``file`` does not exist, an empty list if no
        parser class is available for ``mime_type``, otherwise whatever
        the parser's ``extract_metadata`` returns.
    """
    import shutil

    if not os.path.isfile(file):
        return None

    parser_class = get_parser_class_for_mime_type(mime_type)
    if not parser_class:
        return []

    parser = parser_class(logging_group=None)
    try:
        return parser.extract_metadata(file, mime_type)
    finally:
        # Instantiating a parser creates a scratch directory
        # (tempfile.mkdtemp in DocumentParser.__init__). Remove it here
        # so that metadata requests do not pile up directories.
        # NOTE(review): if the parser class offers a dedicated cleanup
        # method, calling that instead would be preferable - confirm.
        scratch_dir = getattr(parser, "tempdir", None)
        if scratch_dir:
            shutil.rmtree(scratch_dir, ignore_errors=True)
|
|
||||||
|
|
||||||
@action(methods=['get'], detail=True)
|
@action(methods=['get'], detail=True)
|
||||||
def metadata(self, request, pk=None):
|
def metadata(self, request, pk=None):
|
||||||
|
@ -5,6 +5,7 @@ import subprocess
|
|||||||
|
|
||||||
import ocrmypdf
|
import ocrmypdf
|
||||||
import pdftotext
|
import pdftotext
|
||||||
|
import pikepdf
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from ocrmypdf import InputFileError, EncryptedPdfError
|
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||||
@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def extract_metadata(self, document_path, mime_type):
    """
    Read the metadata embedded in a PDF document.

    Only ``application/pdf`` is inspected; every other mime type yields
    an empty list. Entries that cannot be processed are logged as
    warnings and skipped, so a single bad entry does not discard the
    rest.

    :param document_path: path of the document to inspect.
    :param mime_type: mime type of the document.
    :return: a list of dicts with the keys ``namespace``, ``prefix``,
        ``key`` and ``value``.
    """
    if mime_type != 'application/pdf':
        return []

    # Metadata keys are expected in the form "{namespace}name".
    namespace_pattern = re.compile(r"\{(.*)\}(.*)")
    result = []

    # Use a context manager so the PDF's file handle is released even if
    # reading the metadata fails.
    with pikepdf.open(document_path) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)
            try:
                # A key that does not match the pattern (m is None) or a
                # namespace without a known prefix raises here and is
                # reported below instead of aborting the whole read.
                m = namespace_pattern.match(key)
                result.append({
                    "namespace": m.group(1),
                    "prefix": meta.REVERSE_NS[m.group(1)],
                    "key": m.group(2),
                    "value": value
                })
            except Exception as e:
                self.log(
                    "warning",
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
    return result
|
||||||
|
|
||||||
def get_thumbnail(self, document_path, mime_type):
|
def get_thumbnail(self, document_path, mime_type):
|
||||||
"""
|
"""
|
||||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user