Updates the tika client library and handles the changes to it

This commit is contained in:
Trenton H
2023-06-26 07:41:45 -07:00
parent 0d6e79cb93
commit e05b3441de
4 changed files with 10 additions and 19 deletions

View File

@@ -13,7 +13,6 @@ from humanfriendly import format_size
from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika_client import TikaClient
from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser):
with TikaClient(tika_url=self.tika_server) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
if hasattr(parsed, "content") and parsed.content is not None:
if parsed.content is not None:
return parsed.content.strip()
elif TikaKey.Content in parsed.data:
# May not be a completely handled type, but
# the Tika response may still include content
return parsed.data[TikaKey.Content].strip()
return ""
except Exception as err:
raise ParseError(

View File

@@ -4,7 +4,6 @@ from pathlib import Path
import httpx
from django.conf import settings
from tika_client import TikaClient
from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser):
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self.text = None
if hasattr(parsed, "content") and parsed.content is not None:
self.text = parsed.content.strip()
elif TikaKey.Content in parsed.data:
# May not be a completely handled type, but
# the Tika response may still include content
self.text = parsed.data[TikaKey.Content].strip()
self.text = parsed.content
if self.text is not None:
self.text = self.text.strip()
self.date = parsed.created
self.archive_path = self.convert_to_pdf(document_path, file_name)