mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-16 00:36:22 +00:00
Rewrites the email parsing to be more clear and concise.
Adds testing to use httpx mocked responses to stand in as a server even offline
This commit is contained in:
@@ -9,7 +9,10 @@ from django.test import TestCase
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
|
||||
@pytest.mark.skipif(
|
||||
"PAPERLESS_CI_TEST" not in os.environ,
|
||||
reason="No Gotenberg/Tika servers to test with",
|
||||
)
|
||||
class TestTikaParserAgainstServer(TestCase):
|
||||
"""
|
||||
This test case tests the Tika parsing against a live tika server,
|
||||
@@ -25,7 +28,7 @@ class TestTikaParserAgainstServer(TestCase):
|
||||
def tearDown(self) -> None:
|
||||
self.parser.cleanup()
|
||||
|
||||
def try_parse_with_wait(self, test_file, mime_type):
|
||||
def try_parse_with_wait(self, test_file: Path, mime_type: str):
|
||||
"""
|
||||
For whatever reason, the image started during the test pipeline likes to
|
||||
segfault sometimes, when run with the exact files that usually pass.
|
||||
|
@@ -5,34 +5,38 @@ from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
from requests import Response
|
||||
from httpx import Request
|
||||
from httpx import Response
|
||||
from rest_framework import status
|
||||
|
||||
from documents.parsers import ParseError
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
from paperless_tika.tests.utils import HttpxMockMixin
|
||||
|
||||
|
||||
class TestTikaParser(TestCase):
|
||||
class TestTikaParser(HttpxMockMixin, TestCase):
|
||||
def setUp(self) -> None:
|
||||
self.parser = TikaDocumentParser(logging_group=None)
|
||||
|
||||
def tearDown(self) -> None:
|
||||
self.parser.cleanup()
|
||||
|
||||
@mock.patch("paperless_tika.parsers.parser.from_file")
|
||||
@mock.patch("paperless_tika.parsers.requests.post")
|
||||
def test_parse(self, post, from_file):
|
||||
from_file.return_value = {
|
||||
"content": "the content",
|
||||
"metadata": {"Creation-Date": "2020-11-21"},
|
||||
}
|
||||
response = Response()
|
||||
response._content = b"PDF document"
|
||||
response.status_code = status.HTTP_200_OK
|
||||
post.return_value = response
|
||||
def test_parse(self):
|
||||
# Pretend parse response
|
||||
self.httpx_mock.add_response(
|
||||
json={
|
||||
"Content-Type": "application/vnd.oasis.opendocument.text",
|
||||
"X-TIKA:Parsed-By": [],
|
||||
"X-TIKA:content": "the content",
|
||||
"dcterms:created": "2020-11-21T00:00:00",
|
||||
},
|
||||
)
|
||||
# Pretend convert to PDF response
|
||||
self.httpx_mock.add_response(content=b"PDF document")
|
||||
|
||||
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
|
||||
file.touch()
|
||||
|
||||
file = os.path.join(self.parser.tempdir, "input.odt")
|
||||
Path(file).touch()
|
||||
self.parser.parse(file, "application/vnd.oasis.opendocument.text")
|
||||
|
||||
self.assertEqual(self.parser.text, "the content")
|
||||
@@ -42,26 +46,28 @@ class TestTikaParser(TestCase):
|
||||
|
||||
self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21))
|
||||
|
||||
@mock.patch("paperless_tika.parsers.parser.from_file")
|
||||
def test_metadata(self, from_file):
|
||||
from_file.return_value = {
|
||||
"metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"},
|
||||
}
|
||||
def test_metadata(self):
|
||||
self.httpx_mock.add_response(
|
||||
json={
|
||||
"Content-Type": "application/vnd.oasis.opendocument.text",
|
||||
"X-TIKA:Parsed-By": [],
|
||||
"Some-key": "value",
|
||||
"dcterms:created": "2020-11-21T00:00:00",
|
||||
},
|
||||
)
|
||||
|
||||
file = os.path.join(self.parser.tempdir, "input.odt")
|
||||
Path(file).touch()
|
||||
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
|
||||
file.touch()
|
||||
|
||||
metadata = self.parser.extract_metadata(
|
||||
file,
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
)
|
||||
|
||||
self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
|
||||
self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
|
||||
self.assertTrue("Some-key" in [m["key"] for m in metadata])
|
||||
|
||||
@mock.patch("paperless_tika.parsers.parser.from_file")
|
||||
@mock.patch("paperless_tika.parsers.requests.post")
|
||||
def test_convert_failure(self, post, from_file):
|
||||
def test_convert_failure(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Document needs to be converted to PDF
|
||||
@@ -70,22 +76,16 @@ class TestTikaParser(TestCase):
|
||||
THEN:
|
||||
- Parse error is raised
|
||||
"""
|
||||
from_file.return_value = {
|
||||
"content": "the content",
|
||||
"metadata": {"Creation-Date": "2020-11-21"},
|
||||
}
|
||||
response = Response()
|
||||
response._content = b"PDF document"
|
||||
response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
|
||||
post.return_value = response
|
||||
# Pretend convert to PDF response
|
||||
self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
|
||||
|
||||
file = os.path.join(self.parser.tempdir, "input.odt")
|
||||
Path(file).touch()
|
||||
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
|
||||
file.touch()
|
||||
|
||||
with self.assertRaises(ParseError):
|
||||
self.parser.convert_to_pdf(file, None)
|
||||
|
||||
@mock.patch("paperless_tika.parsers.requests.post")
|
||||
@mock.patch("paperless_tika.parsers.httpx.post")
|
||||
def test_request_pdf_a_format(self, post: mock.Mock):
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -95,12 +95,11 @@ class TestTikaParser(TestCase):
|
||||
THEN:
|
||||
- Request to Gotenberg contains the expected PDF/A format string
|
||||
"""
|
||||
file = os.path.join(self.parser.tempdir, "input.odt")
|
||||
Path(file).touch()
|
||||
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
|
||||
file.touch()
|
||||
|
||||
response = Response()
|
||||
response._content = b"PDF document"
|
||||
response.status_code = status.HTTP_200_OK
|
||||
response = Response(status_code=status.HTTP_200_OK)
|
||||
response.request = Request("POST", "/somewhere/")
|
||||
post.return_value = response
|
||||
|
||||
for setting, expected_key in [
|
||||
|
Reference in New Issue
Block a user