Working around current TIKA Library Bugs - lint

This commit is contained in:
Simon Siebert 2023-07-06 23:31:38 +02:00 committed by Trenton Holmes
parent d875be60d4
commit 56fcb3fee1

View File

@@ -1,9 +1,6 @@
import os import os
from pathlib import Path from pathlib import Path
import array
import dateutil.parser
import httpx import httpx
from django.conf import settings from django.conf import settings
from django.utils import timezone from django.utils import timezone
@@ -56,7 +53,7 @@ class TikaDocumentParser(DocumentParser):
try: try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client: with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
with open(document_path, 'rb') as f: with open(document_path, "rb") as f:
content = f.read() content = f.read()
parsed = client.tika.as_text.from_buffer(content, mime_type) parsed = client.tika.as_text.from_buffer(content, mime_type)
except Exception as err: except Exception as err:
@@ -71,7 +68,7 @@ class TikaDocumentParser(DocumentParser):
tz = timezone.get_current_timezone() tz = timezone.get_current_timezone()
self.date = timezone.make_aware(parsed.created,tz) self.date = timezone.make_aware(parsed.created, tz)
self.archive_path = self.convert_to_pdf(document_path, file_name) self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name): def convert_to_pdf(self, document_path, file_name):