Rewrites the email parsing to be more clear and concise.

Adds testing to use httpx mocked responses to stand in as a server even offline
This commit is contained in:
Trenton H
2023-06-01 14:50:08 -07:00
parent 6e65558ea4
commit 2c1cd25be4
9 changed files with 701 additions and 823 deletions

View File

@@ -9,7 +9,10 @@ from django.test import TestCase
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
@pytest.mark.skipif(
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestTikaParserAgainstServer(TestCase):
"""
This test case tests the Tika parsing against a live tika server,
@@ -25,7 +28,7 @@ class TestTikaParserAgainstServer(TestCase):
def tearDown(self) -> None:
self.parser.cleanup()
def try_parse_with_wait(self, test_file, mime_type):
def try_parse_with_wait(self, test_file: Path, mime_type: str):
"""
For whatever reason, the image started during the test pipeline likes to
segfault sometimes, when run with the exact files that usually pass.

View File

@@ -5,34 +5,38 @@ from unittest import mock
from django.test import TestCase
from django.test import override_settings
from requests import Response
from httpx import Request
from httpx import Response
from rest_framework import status
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class TestTikaParser(TestCase):
class TestTikaParser(HttpxMockMixin, TestCase):
def setUp(self) -> None:
self.parser = TikaDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
@mock.patch("paperless_tika.parsers.parser.from_file")
@mock.patch("paperless_tika.parsers.requests.post")
def test_parse(self, post, from_file):
from_file.return_value = {
"content": "the content",
"metadata": {"Creation-Date": "2020-11-21"},
}
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_200_OK
post.return_value = response
def test_parse(self):
# Pretend parse response
self.httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
"X-TIKA:content": "the content",
"dcterms:created": "2020-11-21T00:00:00",
},
)
# Pretend convert to PDF response
self.httpx_mock.add_response(content=b"PDF document")
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
self.parser.parse(file, "application/vnd.oasis.opendocument.text")
self.assertEqual(self.parser.text, "the content")
@@ -42,26 +46,28 @@ class TestTikaParser(TestCase):
self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21))
@mock.patch("paperless_tika.parsers.parser.from_file")
def test_metadata(self, from_file):
from_file.return_value = {
"metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"},
}
def test_metadata(self):
self.httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
"Some-key": "value",
"dcterms:created": "2020-11-21T00:00:00",
},
)
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
metadata = self.parser.extract_metadata(
file,
"application/vnd.oasis.opendocument.text",
)
self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
self.assertTrue("Some-key" in [m["key"] for m in metadata])
@mock.patch("paperless_tika.parsers.parser.from_file")
@mock.patch("paperless_tika.parsers.requests.post")
def test_convert_failure(self, post, from_file):
def test_convert_failure(self):
"""
GIVEN:
- Document needs to be converted to PDF
@@ -70,22 +76,16 @@ class TestTikaParser(TestCase):
THEN:
- Parse error is raised
"""
from_file.return_value = {
"content": "the content",
"metadata": {"Creation-Date": "2020-11-21"},
}
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
post.return_value = response
# Pretend convert to PDF response
self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
with self.assertRaises(ParseError):
self.parser.convert_to_pdf(file, None)
@mock.patch("paperless_tika.parsers.requests.post")
@mock.patch("paperless_tika.parsers.httpx.post")
def test_request_pdf_a_format(self, post: mock.Mock):
"""
GIVEN:
@@ -95,12 +95,11 @@ class TestTikaParser(TestCase):
THEN:
- Request to Gotenberg contains the expected PDF/A format string
"""
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_200_OK
response = Response(status_code=status.HTTP_200_OK)
response.request = Request("POST", "/somewhere/")
post.return_value = response
for setting, expected_key in [