moved metadata extraction to the parsers

2025-12-24 02:05:48 -06:00 · 2020-12-10 14:57:53 +01:00
parent 0cc22017de
commit 2f7bb01f34
3 changed files with 40 additions and 28 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -5,6 +5,7 @@ import subprocess

 import ocrmypdf
 import pdftotext
+import pikepdf
 from PIL import Image
 from django.conf import settings
 from ocrmypdf import InputFileError, EncryptedPdfError
@@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser):
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

+    def extract_metadata(self, document_path, mime_type):
+        """
+        Collect the XMP metadata entries of a PDF document.
+
+        :param document_path: path of the document to inspect; it is only
+            opened when ``mime_type`` is ``application/pdf``.
+        :param mime_type: mime type of the document.
+        :return: a list of dicts with the keys ``namespace``, ``prefix``,
+            ``key`` and ``value``, one per metadata entry that could be
+            read. The list is empty for any mime type other than PDF.
+
+        Entries that cannot be processed are logged as warnings and left
+        out of the result rather than aborting the extraction.
+        """
+        if mime_type != 'application/pdf':
+            return []
+
+        # Metadata keys are matched as "{namespace}name".
+        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+        result = []
+
+        # Open the PDF in a context manager so the underlying file handle
+        # is released even if reading the metadata raises.
+        with pikepdf.open(document_path) as pdf:
+            meta = pdf.open_metadata()
+            for key, value in meta.items():
+                # List values are flattened into one space-separated string.
+                if isinstance(value, list):
+                    value = " ".join([str(e) for e in value])
+                value = str(value)
+                try:
+                    m = namespace_pattern.match(key)
+                    result.append({
+                        "namespace": m.group(1),
+                        "prefix": meta.REVERSE_NS[m.group(1)],
+                        "key": m.group(2),
+                        "value": value
+                    })
+                except Exception as e:
+                    # Deliberately best-effort: a key that does not match
+                    # the pattern or has no known prefix must not prevent
+                    # the remaining entries from being returned.
+                    self.log(
+                        "warning",
+                        f"Error while reading metadata {key}: {value}. Error: "
+                        f"{e}"
+                    )
+        return result
+
    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.