diff --git a/Pipfile.lock b/Pipfile.lock index d948729ef..6bf949a7f 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1746,11 +1746,11 @@ }, "tika-client": { "hashes": [ - "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156", - "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610" + "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0", + "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d" ], "index": "pypi", - "version": "==0.0.3" + "version": "==0.1.1" }, "tornado": { "hashes": [ diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 3ec3e64a0..f7daa758e 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -13,6 +13,7 @@ from humanfriendly import format_size from imap_tools import MailAttachment from imap_tools import MailMessage from tika_client import TikaClient +from tika_client.data_models import TikaKey from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -172,8 +173,12 @@ class MailDocumentParser(DocumentParser): with TikaClient(tika_url=self.tika_server) as client: parsed = client.tika.as_text.from_buffer(html, "text/html") - if "X-TIKA:content" in parsed.data: - return parsed.data["X-TIKA:content"].strip() + if hasattr(parsed, "content") and parsed.content is not None: + return parsed.content.strip() + elif TikaKey.Content in parsed.data: + # May not be a completely handled type, but + # the Tika response may still include content + return parsed.data[TikaKey.Content].strip() return "" except Exception as err: raise ParseError( diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index 10447ff53..8b476bfd8 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -4,6 +4,7 @@ from pathlib import Path import httpx from django.conf import settings from tika_client import TikaClient +from tika_client.data_models import TikaKey from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -58,8 +59,15 @@ class TikaDocumentParser(DocumentParser): f"{settings.TIKA_ENDPOINT}: {err}", ) from err - self.text = parsed.content.strip() - self.date = parsed.metadata.created + self.text = None + if hasattr(parsed, "content") and parsed.content is not None: + self.text = parsed.content.strip() + elif TikaKey.Content in parsed.data: + # May not be a completely handled type, but + # the Tika response may still include content + self.text = parsed.data[TikaKey.Content].strip() + + self.date = parsed.created self.archive_path = self.convert_to_pdf(document_path, file_name) def convert_to_pdf(self, document_path, file_name): diff --git a/src/paperless_tika/tests/samples/sample.doc b/src/paperless_tika/tests/samples/sample.doc new file mode 100644 index 000000000..72178a7be Binary files /dev/null and b/src/paperless_tika/tests/samples/sample.doc differ diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py index 9a83614b1..f4c8e0134 100644 --- a/src/paperless_tika/tests/test_live_tika.py +++ b/src/paperless_tika/tests/test_live_tika.py @@ -118,3 +118,28 @@ class TestTikaParserAgainstServer(TestCase): self.assertTrue(b"PDF-" in f.read()[:10]) # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) + + def test_basic_parse_doc(self): + """ + GIVEN: + - An input DOC format document + WHEN: + - The document is parsed + THEN: + - Document content is correct + - Document date is correct + """ + test_file = self.SAMPLE_DIR / "sample.doc" + + self.try_parse_with_wait( + test_file, + "application/msword", + ) + + self.assertIn( + "his is a test document, saved in the older .doc format", + self.parser.text, + ) + self.assertIsNotNone(self.parser.archive_path) + with open(self.parser.archive_path, "rb") as f: + self.assertTrue(b"PDF-" in f.read()[:10])