mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
supply file_name for tika parser
This commit is contained in:
parent
de32addf76
commit
40ef375c15
@ -1,6 +1,5 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import magic
|
import magic
|
||||||
@ -130,7 +129,7 @@ class Consumer(LoggingMixin):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
self.log("debug", "Parsing {}...".format(self.filename))
|
self.log("debug", "Parsing {}...".format(self.filename))
|
||||||
document_parser.parse(self.path, mime_type)
|
document_parser.parse(self.path, mime_type, self.filename)
|
||||||
|
|
||||||
self.log("debug", f"Generating thumbnail for {self.filename}...")
|
self.log("debug", f"Generating thumbnail for {self.filename}...")
|
||||||
thumbnail = document_parser.get_optimised_thumbnail(
|
thumbnail = document_parser.get_optimised_thumbnail(
|
||||||
|
@ -267,7 +267,7 @@ class DocumentParser(LoggingMixin):
|
|||||||
def extract_metadata(self, document_path, mime_type):
|
def extract_metadata(self, document_path, mime_type):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
def get_archive_path(self):
|
def get_archive_path(self):
|
||||||
|
@ -177,7 +177,7 @@ class DummyParser(DocumentParser):
|
|||||||
def get_optimised_thumbnail(self, document_path, mime_type):
|
def get_optimised_thumbnail(self, document_path, mime_type):
|
||||||
return self.fake_thumb
|
return self.fake_thumb
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
self.text = "The Text"
|
self.text = "The Text"
|
||||||
|
|
||||||
|
|
||||||
@ -194,7 +194,7 @@ class FaultyParser(DocumentParser):
|
|||||||
def get_optimised_thumbnail(self, document_path, mime_type):
|
def get_optimised_thumbnail(self, document_path, mime_type):
|
||||||
return self.fake_thumb
|
return self.fake_thumb
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
raise ParseError("Does not compute.")
|
raise ParseError("Does not compute.")
|
||||||
|
|
||||||
|
|
||||||
|
@ -88,7 +88,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
f"Error while calculating DPI for image {image}: {e}")
|
f"Error while calculating DPI for image {image}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
mode = settings.OCR_MODE
|
mode = settings.OCR_MODE
|
||||||
|
|
||||||
text_original = get_text_from_pdf(document_path)
|
text_original = get_text_from_pdf(document_path)
|
||||||
|
@ -32,6 +32,6 @@ class TextDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
return out_path
|
return out_path
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
with open(document_path, 'r') as f:
|
with open(document_path, 'r') as f:
|
||||||
self.text = f.read()
|
self.text = f.read()
|
||||||
|
@ -39,7 +39,7 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
} for key in parsed['metadata']
|
} for key in parsed['metadata']
|
||||||
]
|
]
|
||||||
|
|
||||||
def parse(self, document_path, mime_type):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
self.log("info", f"Sending {document_path} to Tika server")
|
self.log("info", f"Sending {document_path} to Tika server")
|
||||||
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
|
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
|
||||||
|
|
||||||
@ -60,15 +60,15 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
self.log("warning", f"Unable to extract date for document "
|
self.log("warning", f"Unable to extract date for document "
|
||||||
f"{document_path}: {e}")
|
f"{document_path}: {e}")
|
||||||
|
|
||||||
self.archive_path = self.convert_to_pdf(document_path)
|
self.archive_path = self.convert_to_pdf(document_path, file_name)
|
||||||
|
|
||||||
def convert_to_pdf(self, document_path):
|
def convert_to_pdf(self, document_path, file_name):
|
||||||
pdf_path = os.path.join(self.tempdir, "convert.pdf")
|
pdf_path = os.path.join(self.tempdir, "convert.pdf")
|
||||||
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
|
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
|
||||||
url = gotenberg_server + "/convert/office"
|
url = gotenberg_server + "/convert/office"
|
||||||
|
|
||||||
self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
|
self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
|
||||||
files = {"files": open(document_path, "rb")}
|
files = {"files": (file_name, open(document_path, "rb"))}
|
||||||
headers = {}
|
headers = {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user