mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-07 19:08:32 -05:00
Adds better error handling/checking around getting content of a document via Tika
Signed-off-by: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
This commit is contained in:

committed by
Trenton H

parent
4693632c7d
commit
4782b4da07
@@ -13,6 +13,7 @@ from humanfriendly import format_size
|
||||
from imap_tools import MailAttachment
|
||||
from imap_tools import MailMessage
|
||||
from tika_client import TikaClient
|
||||
+from tika_client.data_models import TikaKey
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
@@ -172,8 +173,12 @@ class MailDocumentParser(DocumentParser):
|
||||
with TikaClient(tika_url=self.tika_server) as client:
|
||||
parsed = client.tika.as_text.from_buffer(html, "text/html")
|
||||
|
||||
-if "X-TIKA:content" in parsed.data:
|
||||
-return parsed.data["X-TIKA:content"].strip()
|
||||
+if hasattr(parsed, "content") and parsed.content is not None:
|
||||
+return parsed.content.strip()
|
||||
+elif TikaKey.Content in parsed.data:
|
||||
+# May not be a completely handled type, but
|
||||
+# the Tika response may still include content
|
||||
+return parsed.data[TikaKey.Content].strip()
|
||||
return ""
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
|
Reference in New Issue
Block a user