diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 5a06194b7..abd12d802 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,6 +1,5 @@ import datetime import hashlib -import logging import os import magic @@ -130,7 +129,7 @@ class Consumer(LoggingMixin): try: self.log("debug", "Parsing {}...".format(self.filename)) - document_parser.parse(self.path, mime_type) + document_parser.parse(self.path, mime_type, self.filename) self.log("debug", f"Generating thumbnail for {self.filename}...") thumbnail = document_parser.get_optimised_thumbnail( diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 65d321ce5..e14607bd0 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -267,7 +267,7 @@ class DocumentParser(LoggingMixin): def extract_metadata(self, document_path, mime_type): return [] - def parse(self, document_path, mime_type): + def parse(self, document_path, mime_type, file_name=None): raise NotImplementedError() def get_archive_path(self): diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 795ca7f95..90c034f9e 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -177,7 +177,7 @@ class DummyParser(DocumentParser): def get_optimised_thumbnail(self, document_path, mime_type): return self.fake_thumb - def parse(self, document_path, mime_type): + def parse(self, document_path, mime_type, file_name=None): self.text = "The Text" @@ -194,7 +194,7 @@ class FaultyParser(DocumentParser): def get_optimised_thumbnail(self, document_path, mime_type): return self.fake_thumb - def parse(self, document_path, mime_type): + def parse(self, document_path, mime_type, file_name=None): raise ParseError("Does not compute.") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index fc8702eac..31e956284 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -88,7 +88,7 @@ class RasterisedDocumentParser(DocumentParser): f"Error while calculating DPI for image {image}: {e}") return None - def parse(self, document_path, mime_type): + def parse(self, document_path, mime_type, file_name=None): mode = settings.OCR_MODE text_original = get_text_from_pdf(document_path) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index a38bd7a91..c1afe07fc 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -32,6 +32,6 @@ class TextDocumentParser(DocumentParser): return out_path - def parse(self, document_path, mime_type): + def parse(self, document_path, mime_type, file_name=None): with open(document_path, 'r') as f: self.text = f.read() diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 56792fb51..13e937daa 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -39,7 +39,7 @@ class TikaDocumentParser(DocumentParser): } for key in parsed['metadata'] ] - def parse(self, document_path, mime_type): + def parse(self, document_path, mime_type, file_name=None): self.log("info", f"Sending {document_path} to Tika server") tika_server = settings.PAPERLESS_TIKA_ENDPOINT @@ -60,15 +60,15 @@ class TikaDocumentParser(DocumentParser): self.log("warning", f"Unable to extract date for document " f"{document_path}: {e}") - self.archive_path = self.convert_to_pdf(document_path) + self.archive_path = self.convert_to_pdf(document_path, file_name) - def convert_to_pdf(self, document_path): + def convert_to_pdf(self, document_path, file_name): pdf_path = os.path.join(self.tempdir, "convert.pdf") gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT url = gotenberg_server + "/convert/office" self.log("info", f"Converting {document_path} to PDF as {pdf_path}") - files = {"files": open(document_path, "rb")} + files = {"files": (file_name, open(document_path, "rb"))} headers = {} try: