Adds more robust error handling and checks when extracting the content of a document via Tika

Signed-off-by: Trenton Holmes <797416+stumpylog@users.noreply.github.com>
2025-12-18 01:41:14 -06:00 · 2023-06-18 07:04:53 -07:00
parent 204149afb5
commit 48ab961c68
5 changed files with 45 additions and 7 deletions
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import httpx
 from django.conf import settings
 from tika_client import TikaClient
+from tika_client.data_models import TikaKey

 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
@@ -58,8 +59,15 @@ class TikaDocumentParser(DocumentParser):
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

-        self.text = parsed.content.strip()
-        self.date = parsed.metadata.created
+        self.text = None
+        if hasattr(parsed, "content") and parsed.content is not None:
+            self.text = parsed.content.strip()
+        elif TikaKey.Content in parsed.data:
+            # The parsed result may not expose content for this document type,
+            # but the raw Tika response data may still include it
+            self.text = parsed.data[TikaKey.Content].strip()
+
+        self.date = parsed.created
        self.archive_path = self.convert_to_pdf(document_path, file_name)

    def convert_to_pdf(self, document_path, file_name):