Uli Fahrer 2dcacaee14 fix(tika): adapt to Gotenberg 7 API
This commit adapts to the latest breaking changes from Gotenberg 7.
It also freezes the usage of the Gotenberg server to v7.x. Doing
this prevents further breaking changes leaking in our code base.

* refs #1250
2021-08-27 08:32:16 +02:00

90 lines
2.9 KiB
Python

import os
import requests
import dateutil.parser
from django.conf import settings
from documents.parsers import DocumentParser, ParseError, \
make_thumbnail_from_pdf
from tika import parser
class TikaDocumentParser(DocumentParser):
"""
This parser sends documents to a local tika server
"""
logging_name = "paperless.parsing.tika"
def get_thumbnail(self, document_path, mime_type, file_name=None):
if not self.archive_path:
self.archive_path = self.convert_to_pdf(document_path, file_name)
return make_thumbnail_from_pdf(
self.archive_path, self.tempdir, self.logging_group)
def extract_metadata(self, document_path, mime_type):
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
try:
parsed = parser.from_file(document_path, tika_server)
except Exception as e:
self.log("warning", f"Error while fetching document metadata for "
f"{document_path}: {e}")
return []
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed['metadata'][key]
} for key in parsed['metadata']
]
def parse(self, document_path, mime_type, file_name=None):
self.log("info", f"Sending {document_path} to Tika server")
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
try:
parsed = parser.from_file(document_path, tika_server)
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{tika_server}: {err}"
)
self.text = parsed["content"].strip()
try:
self.date = dateutil.parser.isoparse(
parsed["metadata"]["Creation-Date"])
except Exception as e:
self.log("warning", f"Unable to extract date for document "
f"{document_path}: {e}")
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):
pdf_path = os.path.join(self.tempdir, "convert.pdf")
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/forms/libreoffice/convert"
self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
files = {"files": (file_name or os.path.basename(document_path),
open(document_path, "rb"))}
headers = {}
try:
response = requests.post(url, files=files, headers=headers)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}"
)
file = open(pdf_path, "wb")
file.write(response.content)
file.close()
return pdf_path