diff --git a/Pipfile b/Pipfile index edb0e46a9..8a4ea03b2 100644 --- a/Pipfile +++ b/Pipfile @@ -66,6 +66,7 @@ scipy = "==1.8.1" reportlab = "==3.6.12" # Pin this until piwheels is building a newer version (see https://www.piwheels.org/project/cryptography/) cryptography = "==40.0.1" +httpx = "*" [dev-packages] # Linting diff --git a/Pipfile.lock b/Pipfile.lock index 6bf949a7f..e9403c922 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "db3fc8c37931534327f89c6211581495328b6f6bf2c533df848fa23faa5d0cd3" + "sha256": "e63cdbb928210fc4dcf0554bde381abd0ff956923ae03ab9f6984025cd5a454d" }, "pipfile-spec": 6, "requires": {}, @@ -712,7 +712,7 @@ "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd", "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd" ], - "markers": "python_version >= '3.7'", + "index": "pypi", "version": "==0.24.1" }, "humanfriendly": { @@ -1746,11 +1746,11 @@ }, "tika-client": { "hashes": [ - "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0", - "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d" + "sha256:6110bd73eaa133f9c8eb1ef2566e6c0c8123a0e4efbcfb85b86f8c1b26cb4de2", + "sha256:e8eaa52771c72426f5531c53dcc8dfc5e3bb6e1f91f89fc93674a81bfca59d6d" ], "index": "pypi", - "version": "==0.1.1" + "version": "==0.2.0" }, "tornado": { "hashes": [ diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 1fcc89188..4365d21a4 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -13,7 +13,6 @@ from humanfriendly import format_size from imap_tools import MailAttachment from imap_tools import MailMessage from tika_client import TikaClient -from tika_client.data_models import TikaKey from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -175,12 +174,8 @@ class MailDocumentParser(DocumentParser): with TikaClient(tika_url=self.tika_server) as client: parsed = client.tika.as_text.from_buffer(html, "text/html") - if hasattr(parsed, "content") and parsed.content is not None: + if parsed.content is not None: return parsed.content.strip() - elif TikaKey.Content in parsed.data: - # May not be a completely handled type, but - # the Tika response may still include content - return parsed.data[TikaKey.Content].strip() return "" except Exception as err: raise ParseError( diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 876696633..0558727f5 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -4,7 +4,6 @@ from pathlib import Path import httpx from django.conf import settings from tika_client import TikaClient -from tika_client.data_models import TikaKey from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -59,13 +58,9 @@ class TikaDocumentParser(DocumentParser): f"{settings.TIKA_ENDPOINT}: {err}", ) from err - self.text = None - if hasattr(parsed, "content") and parsed.content is not None: - self.text = parsed.content.strip() - elif TikaKey.Content in parsed.data: - # May not be a completely handled type, but - # the Tika response may still include content - self.text = parsed.data[TikaKey.Content].strip() + self.text = parsed.content + if self.text is not None: + self.text = self.text.strip() self.date = parsed.created self.archive_path = self.convert_to_pdf(document_path, file_name)