Supply file_name to the document parsers so the Tika parser can pass it to Gotenberg for PDF conversion

This commit is contained in:
jonaswinkler 2021-01-01 22:19:43 +01:00
parent de32addf76
commit 40ef375c15
6 changed files with 10 additions and 11 deletions

View File

@ -1,6 +1,5 @@
import datetime import datetime
import hashlib import hashlib
import logging
import os import os
import magic import magic
@ -130,7 +129,7 @@ class Consumer(LoggingMixin):
try: try:
self.log("debug", "Parsing {}...".format(self.filename)) self.log("debug", "Parsing {}...".format(self.filename))
document_parser.parse(self.path, mime_type) document_parser.parse(self.path, mime_type, self.filename)
self.log("debug", f"Generating thumbnail for {self.filename}...") self.log("debug", f"Generating thumbnail for {self.filename}...")
thumbnail = document_parser.get_optimised_thumbnail( thumbnail = document_parser.get_optimised_thumbnail(

View File

@ -267,7 +267,7 @@ class DocumentParser(LoggingMixin):
def extract_metadata(self, document_path, mime_type): def extract_metadata(self, document_path, mime_type):
return [] return []
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type, file_name=None):
raise NotImplementedError() raise NotImplementedError()
def get_archive_path(self): def get_archive_path(self):

View File

@ -177,7 +177,7 @@ class DummyParser(DocumentParser):
def get_optimised_thumbnail(self, document_path, mime_type): def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb return self.fake_thumb
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type, file_name=None):
self.text = "The Text" self.text = "The Text"
@ -194,7 +194,7 @@ class FaultyParser(DocumentParser):
def get_optimised_thumbnail(self, document_path, mime_type): def get_optimised_thumbnail(self, document_path, mime_type):
return self.fake_thumb return self.fake_thumb
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type, file_name=None):
raise ParseError("Does not compute.") raise ParseError("Does not compute.")

View File

@ -88,7 +88,7 @@ class RasterisedDocumentParser(DocumentParser):
f"Error while calculating DPI for image {image}: {e}") f"Error while calculating DPI for image {image}: {e}")
return None return None
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type, file_name=None):
mode = settings.OCR_MODE mode = settings.OCR_MODE
text_original = get_text_from_pdf(document_path) text_original = get_text_from_pdf(document_path)

View File

@ -32,6 +32,6 @@ class TextDocumentParser(DocumentParser):
return out_path return out_path
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type, file_name=None):
with open(document_path, 'r') as f: with open(document_path, 'r') as f:
self.text = f.read() self.text = f.read()

View File

@ -39,7 +39,7 @@ class TikaDocumentParser(DocumentParser):
} for key in parsed['metadata'] } for key in parsed['metadata']
] ]
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type, file_name=None):
self.log("info", f"Sending {document_path} to Tika server") self.log("info", f"Sending {document_path} to Tika server")
tika_server = settings.PAPERLESS_TIKA_ENDPOINT tika_server = settings.PAPERLESS_TIKA_ENDPOINT
@ -60,15 +60,15 @@ class TikaDocumentParser(DocumentParser):
self.log("warning", f"Unable to extract date for document " self.log("warning", f"Unable to extract date for document "
f"{document_path}: {e}") f"{document_path}: {e}")
self.archive_path = self.convert_to_pdf(document_path) self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path): def convert_to_pdf(self, document_path, file_name):
pdf_path = os.path.join(self.tempdir, "convert.pdf") pdf_path = os.path.join(self.tempdir, "convert.pdf")
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/convert/office" url = gotenberg_server + "/convert/office"
self.log("info", f"Converting {document_path} to PDF as {pdf_path}") self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
files = {"files": open(document_path, "rb")} files = {"files": (file_name, open(document_path, "rb"))}
headers = {} headers = {}
try: try: