moved metadata extraction to the parsers

This commit is contained in:
jonaswinkler
2020-12-10 14:57:53 +01:00
parent 0cc22017de
commit 2f7bb01f34
3 changed files with 40 additions and 28 deletions

View File

@@ -5,6 +5,7 @@ import subprocess
import ocrmypdf
import pdftotext
import pikepdf
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError
@@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
def extract_metadata(self, document_path, mime_type):
    """
    Collect the metadata entries of a PDF document.

    For ``application/pdf`` documents the file is opened with pikepdf and
    every entry returned by ``open_metadata()`` is converted to a dict with
    the keys ``namespace``, ``prefix``, ``key`` and ``value``. List values
    are joined with single spaces; every value is stringified.

    Entries whose key is not of the form ``{namespace}name``, or whose
    namespace has no registered prefix, are skipped and reported through
    ``self.log`` at warning level (best effort: one bad entry does not
    abort the extraction).

    :param document_path: path of the document to inspect
    :param mime_type: mime type of the document
    :return: list of metadata dicts; empty for any mime type other than
        ``application/pdf``
    """
    result = []
    if mime_type != 'application/pdf':
        return result

    namespace_pattern = re.compile(r"\{(.*)\}(.*)")

    # Open the PDF as a context manager so the underlying file handle is
    # released even if reading the metadata raises.
    with pikepdf.open(document_path) as pdf:
        meta = pdf.open_metadata()
        for key, value in meta.items():
            if isinstance(value, list):
                value = " ".join([str(e) for e in value])
            value = str(value)

            m = namespace_pattern.match(key)
            if m is None:
                # Key carries no "{namespace}" part; nothing to split.
                self.log(
                    "warning",
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"key is not of the form {{namespace}}name"
                )
                continue

            namespace, name = m.groups()
            try:
                prefix = meta.REVERSE_NS[namespace]
            except KeyError as e:
                # Namespace without a registered prefix: skip this entry.
                self.log(
                    "warning",
                    f"Error while reading metadata {key}: {value}. Error: "
                    f"{e}"
                )
                continue

            result.append({
                "namespace": namespace,
                "prefix": prefix,
                "key": name,
                "value": value
            })
    return result
def get_thumbnail(self, document_path, mime_type):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.