Working around current TIKA Library Bugs

This commit is contained in:
Simon Siebert 2023-07-06 23:26:01 +02:00
parent db48d4c576
commit d875be60d4

View File

@ -1,8 +1,13 @@
import os import os
from pathlib import Path from pathlib import Path
import array
import dateutil.parser
import httpx import httpx
from django.conf import settings from django.conf import settings
from django.utils import timezone
from tika_client import TikaClient from tika_client import TikaClient
from documents.parsers import DocumentParser from documents.parsers import DocumentParser
@ -51,7 +56,9 @@ class TikaDocumentParser(DocumentParser):
try: try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.tika.as_text.from_file(document_path, mime_type) with open(document_path, 'rb') as f:
content = f.read()
parsed = client.tika.as_text.from_buffer(content, mime_type)
except Exception as err: except Exception as err:
raise ParseError( raise ParseError(
f"Could not parse {document_path} with tika server at " f"Could not parse {document_path} with tika server at "
@ -62,7 +69,9 @@ class TikaDocumentParser(DocumentParser):
if self.text is not None: if self.text is not None:
self.text = self.text.strip() self.text = self.text.strip()
self.date = parsed.created tz = timezone.get_current_timezone()
self.date = timezone.make_aware(parsed.created,tz)
self.archive_path = self.convert_to_pdf(document_path, file_name) self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name): def convert_to_pdf(self, document_path, file_name):