Adds better error handling and checking when retrieving the content of a document via Tika

Signed-off-by: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
This commit is contained in:
Trenton Holmes
2023-06-18 07:04:53 -07:00
committed by Trenton H
parent 4693632c7d
commit 4782b4da07
5 changed files with 45 additions and 7 deletions

View File

@@ -13,6 +13,7 @@ from humanfriendly import format_size
from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika_client import TikaClient
from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@@ -172,8 +173,12 @@ class MailDocumentParser(DocumentParser):
with TikaClient(tika_url=self.tika_server) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
if "X-TIKA:content" in parsed.data:
return parsed.data["X-TIKA:content"].strip()
if hasattr(parsed, "content") and parsed.content is not None:
return parsed.content.strip()
elif TikaKey.Content in parsed.data:
# May not be a completely handled type, but
# the Tika response may still include content
return parsed.data[TikaKey.Content].strip()
return ""
except Exception as err:
raise ParseError(