Rewrites the email parsing to be clearer and more concise.

Adds tests that use mocked httpx responses to stand in for a server, so they can run even offline
2025-12-20 01:45:58 -06:00 · 2023-06-01 14:50:08 -07:00
parent 6e65558ea4
commit 2c1cd25be4
9 changed files with 701 additions and 823 deletions
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -1,10 +1,9 @@
 import os
 from pathlib import Path

-import dateutil.parser
 import httpx
 from django.conf import settings
-from tika import parser
+from tika_client import TikaClient

 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
@@ -29,55 +28,38 @@ class TikaDocumentParser(DocumentParser):
        )

    def extract_metadata(self, document_path, mime_type):
-        tika_server = settings.TIKA_ENDPOINT
-
-        # tika does not support a PathLike, only strings
-        # ensure this is a string
-        document_path = str(document_path)
-
        try:
-            parsed = parser.from_file(document_path, tika_server)
+            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
+                parsed = client.metadata.from_file(document_path, mime_type)
+                return [
+                    {
+                        "namespace": "",
+                        "prefix": "",
+                        "key": key,
+                        "value": parsed.data[key],
+                    }
+                    for key in parsed.data
+                ]
        except Exception as e:
            self.log.warning(
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

-        return [
-            {
-                "namespace": "",
-                "prefix": "",
-                "key": key,
-                "value": parsed["metadata"][key],
-            }
-            for key in parsed["metadata"]
-        ]
-
-    def parse(self, document_path: Path, mime_type, file_name=None):
+    def parse(self, document_path: Path, mime_type: str, file_name=None):
        self.log.info(f"Sending {document_path} to Tika server")
-        tika_server = settings.TIKA_ENDPOINT
-
-        # tika does not support a PathLike, only strings
-        # ensure this is a string
-        document_path = str(document_path)

        try:
-            parsed = parser.from_file(document_path, tika_server)
+            with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
+                parsed = client.tika.as_text.from_file(document_path, mime_type)
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
-                f"{tika_server}: {err}",
+                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

-        self.text = parsed["content"].strip()
-
-        try:
-            self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
-        except Exception as e:
-            self.log.warning(
-                f"Unable to extract date for document {document_path}: {e}",
-            )
-
+        self.text = parsed.content.strip()
+        self.date = parsed.metadata.created
        self.archive_path = self.convert_to_pdf(document_path, file_name)

    def convert_to_pdf(self, document_path, file_name):
--- a/src/paperless_tika/tests/test_live_tika.py
+++ b/src/paperless_tika/tests/test_live_tika.py
@@ -9,7 +9,10 @@ from django.test import TestCase
 from paperless_tika.parsers import TikaDocumentParser


-@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
+@pytest.mark.skipif(
+    "PAPERLESS_CI_TEST" not in os.environ,
+    reason="No Gotenberg/Tika servers to test with",
+)
 class TestTikaParserAgainstServer(TestCase):
    """
    This test case tests the Tika parsing against a live tika server,
@@ -25,7 +28,7 @@ class TestTikaParserAgainstServer(TestCase):
    def tearDown(self) -> None:
        self.parser.cleanup()

-    def try_parse_with_wait(self, test_file, mime_type):
+    def try_parse_with_wait(self, test_file: Path, mime_type: str):
        """
        For whatever reason, the image started during the test pipeline likes to
        segfault sometimes, when run with the exact files that usually pass.
--- a/src/paperless_tika/tests/test_tika_parser.py
+++ b/src/paperless_tika/tests/test_tika_parser.py
@@ -5,34 +5,38 @@ from unittest import mock

 from django.test import TestCase
 from django.test import override_settings
-from requests import Response
+from httpx import Request
+from httpx import Response
 from rest_framework import status

 from documents.parsers import ParseError
 from paperless_tika.parsers import TikaDocumentParser
+from paperless_tika.tests.utils import HttpxMockMixin


-class TestTikaParser(TestCase):
+class TestTikaParser(HttpxMockMixin, TestCase):
    def setUp(self) -> None:
        self.parser = TikaDocumentParser(logging_group=None)

    def tearDown(self) -> None:
        self.parser.cleanup()

-    @mock.patch("paperless_tika.parsers.parser.from_file")
-    @mock.patch("paperless_tika.parsers.requests.post")
-    def test_parse(self, post, from_file):
-        from_file.return_value = {
-            "content": "the content",
-            "metadata": {"Creation-Date": "2020-11-21"},
-        }
-        response = Response()
-        response._content = b"PDF document"
-        response.status_code = status.HTTP_200_OK
-        post.return_value = response
+    def test_parse(self):
+        # Pretend parse response
+        self.httpx_mock.add_response(
+            json={
+                "Content-Type": "application/vnd.oasis.opendocument.text",
+                "X-TIKA:Parsed-By": [],
+                "X-TIKA:content": "the content",
+                "dcterms:created": "2020-11-21T00:00:00",
+            },
+        )
+        # Pretend convert to PDF response
+        self.httpx_mock.add_response(content=b"PDF document")
+
+        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
+        file.touch()

-        file = os.path.join(self.parser.tempdir, "input.odt")
-        Path(file).touch()
        self.parser.parse(file, "application/vnd.oasis.opendocument.text")

        self.assertEqual(self.parser.text, "the content")
@@ -42,26 +46,28 @@ class TestTikaParser(TestCase):

        self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21))

-    @mock.patch("paperless_tika.parsers.parser.from_file")
-    def test_metadata(self, from_file):
-        from_file.return_value = {
-            "metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"},
-        }
+    def test_metadata(self):
+        self.httpx_mock.add_response(
+            json={
+                "Content-Type": "application/vnd.oasis.opendocument.text",
+                "X-TIKA:Parsed-By": [],
+                "Some-key": "value",
+                "dcterms:created": "2020-11-21T00:00:00",
+            },
+        )

-        file = os.path.join(self.parser.tempdir, "input.odt")
-        Path(file).touch()
+        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
+        file.touch()

        metadata = self.parser.extract_metadata(
            file,
            "application/vnd.oasis.opendocument.text",
        )

-        self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
+        self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
        self.assertTrue("Some-key" in [m["key"] for m in metadata])

-    @mock.patch("paperless_tika.parsers.parser.from_file")
-    @mock.patch("paperless_tika.parsers.requests.post")
-    def test_convert_failure(self, post, from_file):
+    def test_convert_failure(self):
        """
        GIVEN:
            - Document needs to be converted to PDF
@@ -70,22 +76,16 @@ class TestTikaParser(TestCase):
        THEN:
            - Parse error is raised
        """
-        from_file.return_value = {
-            "content": "the content",
-            "metadata": {"Creation-Date": "2020-11-21"},
-        }
-        response = Response()
-        response._content = b"PDF document"
-        response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
-        post.return_value = response
+        # Pretend convert to PDF response
+        self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)

-        file = os.path.join(self.parser.tempdir, "input.odt")
-        Path(file).touch()
+        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
+        file.touch()

        with self.assertRaises(ParseError):
            self.parser.convert_to_pdf(file, None)

-    @mock.patch("paperless_tika.parsers.requests.post")
+    @mock.patch("paperless_tika.parsers.httpx.post")
    def test_request_pdf_a_format(self, post: mock.Mock):
        """
        GIVEN:
@@ -95,12 +95,11 @@ class TestTikaParser(TestCase):
        THEN:
            - Request to Gotenberg contains the expected PDF/A format string
        """
-        file = os.path.join(self.parser.tempdir, "input.odt")
-        Path(file).touch()
+        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
+        file.touch()

-        response = Response()
-        response._content = b"PDF document"
-        response.status_code = status.HTTP_200_OK
+        response = Response(status_code=status.HTTP_200_OK)
+        response.request = Request("POST", "/somewhere/")
        post.return_value = response

        for setting, expected_key in [