mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	moved metadata extraction to the parsers
This commit is contained in:
		| @@ -210,6 +210,7 @@ class DocumentParser(LoggingMixin): | ||||
|     def __init__(self, logging_group): | ||||
|         super().__init__() | ||||
|         self.logging_group = logging_group | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|         self.tempdir = tempfile.mkdtemp( | ||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|  | ||||
| @@ -217,6 +218,9 @@ class DocumentParser(LoggingMixin): | ||||
|         self.text = None | ||||
|         self.date = None | ||||
|  | ||||
|     def extract_metadata(self, document_path, mime_type): | ||||
|         return [] | ||||
|  | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|   | ||||
| @@ -1,11 +1,8 @@ | ||||
| import logging | ||||
| import os | ||||
| import re | ||||
| import tempfile | ||||
| from datetime import datetime | ||||
| from time import mktime | ||||
|  | ||||
| import pikepdf | ||||
| from django.conf import settings | ||||
| from django.db.models import Count, Max | ||||
| from django.http import HttpResponse, HttpResponseBadRequest, Http404 | ||||
| @@ -42,6 +39,7 @@ from .filters import ( | ||||
|     LogFilterSet | ||||
| ) | ||||
| from .models import Correspondent, Document, Log, Tag, DocumentType | ||||
| from .parsers import get_parser_class_for_mime_type | ||||
| from .serialisers import ( | ||||
|     CorrespondentSerializer, | ||||
|     DocumentSerializer, | ||||
| @@ -163,34 +161,16 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|             disposition, filename) | ||||
|         return response | ||||
|  | ||||
|     def get_metadata(self, file, type): | ||||
|     def get_metadata(self, file, mime_type): | ||||
|         if not os.path.isfile(file): | ||||
|             return None | ||||
|  | ||||
|         namespace_pattern = re.compile(r"\{(.*)\}(.*)") | ||||
|  | ||||
|         result = [] | ||||
|         if type == 'application/pdf': | ||||
|             pdf = pikepdf.open(file) | ||||
|             meta = pdf.open_metadata() | ||||
|             for key, value in meta.items(): | ||||
|                 if isinstance(value, list): | ||||
|                     value = " ".join([str(e) for e in value]) | ||||
|                 value = str(value) | ||||
|                 try: | ||||
|                     m = namespace_pattern.match(key) | ||||
|                     result.append({ | ||||
|                         "namespace": m.group(1), | ||||
|                         "prefix": meta.REVERSE_NS[m.group(1)], | ||||
|                         "key": m.group(2), | ||||
|                         "value": value | ||||
|                     }) | ||||
|                 except Exception as e: | ||||
|                     logging.getLogger(__name__).warning( | ||||
|                         f"Error while reading metadata {key}: {value}. Error: " | ||||
|                         f"{e}" | ||||
|                     ) | ||||
|         return result | ||||
|         parser_class = get_parser_class_for_mime_type(mime_type) | ||||
|         if parser_class: | ||||
|             parser = parser_class(logging_group=None) | ||||
|             return parser.extract_metadata(file, mime_type) | ||||
|         else: | ||||
|             return [] | ||||
|  | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def metadata(self, request, pk=None): | ||||
|   | ||||
| @@ -5,6 +5,7 @@ import subprocess | ||||
|  | ||||
| import ocrmypdf | ||||
| import pdftotext | ||||
| import pikepdf | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from ocrmypdf import InputFileError, EncryptedPdfError | ||||
| @@ -18,6 +19,33 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||
|     """ | ||||
|  | ||||
|     def extract_metadata(self, document_path, mime_type): | ||||
|         namespace_pattern = re.compile(r"\{(.*)\}(.*)") | ||||
|  | ||||
|         result = [] | ||||
|         if mime_type == 'application/pdf': | ||||
|             pdf = pikepdf.open(document_path) | ||||
|             meta = pdf.open_metadata() | ||||
|             for key, value in meta.items(): | ||||
|                 if isinstance(value, list): | ||||
|                     value = " ".join([str(e) for e in value]) | ||||
|                 value = str(value) | ||||
|                 try: | ||||
|                     m = namespace_pattern.match(key) | ||||
|                     result.append({ | ||||
|                         "namespace": m.group(1), | ||||
|                         "prefix": meta.REVERSE_NS[m.group(1)], | ||||
|                         "key": m.group(2), | ||||
|                         "value": value | ||||
|                     }) | ||||
|                 except Exception as e: | ||||
|                     self.log( | ||||
|                         "warning", | ||||
|                         f"Error while reading metadata {key}: {value}. Error: " | ||||
|                         f"{e}" | ||||
|                     ) | ||||
|         return result | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler