Updates the tika client library and handles the changes to it

This commit is contained in:
Trenton H 2023-06-26 07:41:45 -07:00
parent 0d6e79cb93
commit e05b3441de
4 changed files with 10 additions and 19 deletions

View File

@ -66,6 +66,7 @@ scipy = "==1.8.1"
reportlab = "==3.6.12"
# Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/)
cryptography = "==40.0.1"
httpx = "*"
[dev-packages]
# Linting

10
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3"
"sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d"
},
"pipfile-spec": 6,
"requires": {},
@ -712,7 +712,7 @@
"sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd",
"sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"
],
"markers": "python_version >= '3.7'",
"index": "pypi",
"version": "==0.24.1"
},
"humanfriendly": {
@ -1746,11 +1746,11 @@
},
"tika-client": {
"hashes": [
"sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
"sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
"sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2",
"sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d"
],
"index": "pypi",
"version": "==0.1.1"
"version": "==0.2.0"
},
"tornado": {
"hashes": [

View File

@ -13,7 +13,6 @@ from humanfriendly import format_size
from imap_tools import MailAttachment
from imap_tools import MailMessage
from tika_client import TikaClient
from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser):
with TikaClient(tika_url=self.tika_server) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
if hasattr(parsed, "content") and parsed.content is not None:
if parsed.content is not None:
return parsed.content.strip()
elif TikaKey.Content in parsed.data:
# May not be a completely handled type, but
# the Tika response may still include content
return parsed.data[TikaKey.Content].strip()
return ""
except Exception as err:
raise ParseError(

View File

@ -4,7 +4,6 @@ from pathlib import Path
import httpx
from django.conf import settings
from tika_client import TikaClient
from tika_client.data_models import TikaKey
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser):
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self.text = None
if hasattr(parsed, "content") and parsed.content is not None:
self.text = parsed.content.strip()
elif TikaKey.Content in parsed.data:
# May not be a completely handled type, but
# the Tika response may still include content
self.text = parsed.data[TikaKey.Content].strip()
self.text = parsed.content
if self.text is not None:
self.text = self.text.strip()
self.date = parsed.created
self.archive_path = self.convert_to_pdf(document_path, file_name)