Adds better error handling/checking around getting content of a document via Tika

Signed-off-by: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
2026-01-02 14:28:14 -06:00 · 2023-06-18 07:04:53 -07:00
parent 4693632c7d
commit 4782b4da07
5 changed files with 45 additions and 7 deletions
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1746,11 +1746,11 @@
        },
        "tika-client": {
            "hashes": [
-                "sha256:43b53816b3783c9c77e16df314cad5ad66ab606391c26ad4bc94a784d473a156",
+                "sha256:29b702d64bbbaa324a75f99062efb3253239762cbf0a3419a47549c2de9379d0",
-                "sha256:e1ef3447b4307059e4a836e3786088498637323733f83a2f807b77f998d77610"
+                "sha256:63a93593068dc0da07108dc47c12cd3ff00f07403cff72c86bea6a89abafbf6d"
            ],
            "index": "pypi",
-            "version": "==0.0.3"
+            "version": "==0.1.1"
        },
        "tornado": {
            "hashes": [
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -13,6 +13,7 @@ from humanfriendly import format_size
 from imap_tools import MailAttachment
 from imap_tools import MailMessage
 from tika_client import TikaClient
 from tika_client.data_models import TikaKey
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
@@ -172,8 +173,12 @@ class MailDocumentParser(DocumentParser):
            with TikaClient(tika_url=self.tika_server) as client:
                parsed = client.tika.as_text.from_buffer(html, "text/html")
-                if "X-TIKA:content" in parsed.data:
+                if hasattr(parsed, "content") and parsed.content is not None:
-                    return parsed.data["X-TIKA:content"].strip()
+                    return parsed.content.strip()
                elif TikaKey.Content in parsed.data:
                    # May not be a completely handled type, but
                    # the Tika response may still include content
                    return parsed.data[TikaKey.Content].strip()
                return ""
        except Exception as err:
            raise ParseError(
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import httpx
 from django.conf import settings
 from tika_client import TikaClient
 from tika_client.data_models import TikaKey
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
@@ -58,8 +59,15 @@ class TikaDocumentParser(DocumentParser):
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err
-        self.text = parsed.content.strip()
+        self.text = None
-        self.date = parsed.metadata.created
+        if hasattr(parsed, "content") and parsed.content is not None:
            self.text = parsed.content.strip()
        elif TikaKey.Content in parsed.data:
            # May not be a completely handled type, but
            # the Tika response may still include content
            self.text = parsed.data[TikaKey.Content].strip()
        self.date = parsed.created
        self.archive_path = self.convert_to_pdf(document_path, file_name)
    def convert_to_pdf(self, document_path, file_name):
--- a/src/paperless_tika/tests/samples/sample.doc
+++ b/src/paperless_tika/tests/samples/sample.doc
--- a/src/paperless_tika/tests/test_live_tika.py
+++ b/src/paperless_tika/tests/test_live_tika.py
@@ -118,3 +118,28 @@ class TestTikaParserAgainstServer(TestCase):
            self.assertTrue(b"PDF-" in f.read()[:10])
        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
    def test_basic_parse_doc(self):
        """
        GIVEN:
            - A sample document in the older DOC (application/msword) format
        WHEN:
            - The document is run through the parser
        THEN:
            - The extracted text contains the expected sentence
            - An archive path is set and that file carries a PDF marker
              within its first 10 bytes
        """
        doc_sample = self.SAMPLE_DIR / "sample.doc"
        self.try_parse_with_wait(doc_sample, "application/msword")

        # NOTE(review): the expected text starts with "his", presumably a
        # truncated "This"; as a substring check it matches either way.
        expected_fragment = "his is a test document, saved in the older .doc format"
        self.assertIn(expected_fragment, self.parser.text)

        archive_file = self.parser.archive_path
        self.assertIsNotNone(archive_file)

        # Only the head of the archive is inspected for the PDF signature.
        with open(archive_file, "rb") as archive:
            leading_bytes = archive.read()[:10]
        self.assertTrue(b"PDF-" in leading_bytes)