Rewrites the email parsing to be more clear and concise.

Adds testing to use httpx mocked responses to stand in as a server even offline
This commit is contained in:
Trenton H
2023-06-01 14:50:08 -07:00
parent 6e65558ea4
commit 2c1cd25be4
9 changed files with 701 additions and 823 deletions

View File

@@ -1,10 +1,9 @@
import os
from pathlib import Path
import dateutil.parser
import httpx
from django.conf import settings
from tika import parser
from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
@@ -29,55 +28,38 @@ class TikaDocumentParser(DocumentParser):
)
def extract_metadata(self, document_path, mime_type):
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.metadata.from_file(document_path, mime_type)
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed.data[key],
}
for key in parsed.data
]
except Exception as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
)
return []
return [
{
"namespace": "",
"prefix": "",
"key": key,
"value": parsed["metadata"][key],
}
for key in parsed["metadata"]
]
def parse(self, document_path: Path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type: str, file_name=None):
self.log.info(f"Sending {document_path} to Tika server")
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.tika.as_text.from_file(document_path, mime_type)
except Exception as err:
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{tika_server}: {err}",
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
self.text = parsed["content"].strip()
try:
self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
except Exception as e:
self.log.warning(
f"Unable to extract date for document {document_path}: {e}",
)
self.text = parsed.content.strip()
self.date = parsed.metadata.created
self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):

View File

@@ -9,7 +9,10 @@ from django.test import TestCase
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
@pytest.mark.skipif(
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestTikaParserAgainstServer(TestCase):
"""
This test case tests the Tika parsing against a live tika server,
@@ -25,7 +28,7 @@ class TestTikaParserAgainstServer(TestCase):
def tearDown(self) -> None:
self.parser.cleanup()
def try_parse_with_wait(self, test_file, mime_type):
def try_parse_with_wait(self, test_file: Path, mime_type: str):
"""
For whatever reason, the image started during the test pipeline likes to
segfault sometimes, when run with the exact files that usually pass.

View File

@@ -5,34 +5,38 @@ from unittest import mock
from django.test import TestCase
from django.test import override_settings
from requests import Response
from httpx import Request
from httpx import Response
from rest_framework import status
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class TestTikaParser(TestCase):
class TestTikaParser(HttpxMockMixin, TestCase):
def setUp(self) -> None:
self.parser = TikaDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
@mock.patch("paperless_tika.parsers.parser.from_file")
@mock.patch("paperless_tika.parsers.requests.post")
def test_parse(self, post, from_file):
from_file.return_value = {
"content": "the content",
"metadata": {"Creation-Date": "2020-11-21"},
}
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_200_OK
post.return_value = response
def test_parse(self):
# Pretend parse response
self.httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
"X-TIKA:content": "the content",
"dcterms:created": "2020-11-21T00:00:00",
},
)
# Pretend convert to PDF response
self.httpx_mock.add_response(content=b"PDF document")
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
self.parser.parse(file, "application/vnd.oasis.opendocument.text")
self.assertEqual(self.parser.text, "the content")
@@ -42,26 +46,28 @@ class TestTikaParser(TestCase):
self.assertEqual(self.parser.date, datetime.datetime(2020, 11, 21))
@mock.patch("paperless_tika.parsers.parser.from_file")
def test_metadata(self, from_file):
from_file.return_value = {
"metadata": {"Creation-Date": "2020-11-21", "Some-key": "value"},
}
def test_metadata(self):
self.httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
"Some-key": "value",
"dcterms:created": "2020-11-21T00:00:00",
},
)
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
metadata = self.parser.extract_metadata(
file,
"application/vnd.oasis.opendocument.text",
)
self.assertTrue("Creation-Date" in [m["key"] for m in metadata])
self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
self.assertTrue("Some-key" in [m["key"] for m in metadata])
@mock.patch("paperless_tika.parsers.parser.from_file")
@mock.patch("paperless_tika.parsers.requests.post")
def test_convert_failure(self, post, from_file):
def test_convert_failure(self):
"""
GIVEN:
- Document needs to be converted to PDF
@@ -70,22 +76,16 @@ class TestTikaParser(TestCase):
THEN:
- Parse error is raised
"""
from_file.return_value = {
"content": "the content",
"metadata": {"Creation-Date": "2020-11-21"},
}
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_500_INTERNAL_SERVER_ERROR
post.return_value = response
# Pretend convert to PDF response
self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
with self.assertRaises(ParseError):
self.parser.convert_to_pdf(file, None)
@mock.patch("paperless_tika.parsers.requests.post")
@mock.patch("paperless_tika.parsers.httpx.post")
def test_request_pdf_a_format(self, post: mock.Mock):
"""
GIVEN:
@@ -95,12 +95,11 @@ class TestTikaParser(TestCase):
THEN:
- Request to Gotenberg contains the expected PDF/A format string
"""
file = os.path.join(self.parser.tempdir, "input.odt")
Path(file).touch()
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
response = Response()
response._content = b"PDF document"
response.status_code = status.HTTP_200_OK
response = Response(status_code=status.HTTP_200_OK)
response.request = Request("POST", "/somewhere/")
post.return_value = response
for setting, expected_key in [