Chore: Initial conversion to pytest fixtures (#7110)

This commit is contained in:
Trenton H
2024-07-08 07:46:20 -07:00
committed by GitHub
parent 1b9cf5121b
commit 3cf73a77ac
17 changed files with 1051 additions and 753 deletions

View File

@@ -0,0 +1,40 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_tika.parsers import TikaDocumentParser
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """
    Yield a new ``TikaDocumentParser`` and call ``cleanup()`` on it afterwards.

    The parser is built *before* entering the ``try`` block: if the
    constructor raises, there is nothing to clean up, and the original
    exception must not be masked by an ``UnboundLocalError`` coming from the
    ``finally`` clause referencing a name that was never bound.
    """
    parser = TikaDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        # Runs whether the test passed, failed or errored
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Return the resolved path of the ``samples`` directory next to this file."""
    this_dir = Path(__file__).parent
    samples = this_dir / Path("samples")
    return samples.resolve()
@pytest.fixture(scope="session")
def sample_odt_file(sample_dir: Path) -> Path:
    """Return the path of ``sample.odt`` inside ``sample_dir``."""
    return sample_dir.joinpath("sample.odt")
@pytest.fixture(scope="session")
def sample_docx_file(sample_dir: Path) -> Path:
    """Return the path of ``sample.docx`` inside ``sample_dir``."""
    return sample_dir.joinpath("sample.docx")
@pytest.fixture(scope="session")
def sample_doc_file(sample_dir: Path) -> Path:
    """Return the path of ``sample.doc`` inside ``sample_dir``."""
    return sample_dir.joinpath("sample.doc")
@pytest.fixture(scope="session")
def sample_broken_odt(sample_dir: Path) -> Path:
    """Return the path of ``multi-part-broken.odt`` inside ``sample_dir``."""
    return sample_dir.joinpath("multi-part-broken.odt")

View File

@@ -1,9 +1,7 @@
import os
from pathlib import Path
from typing import Final
import pytest
from django.test import TestCase
from documents.tests.utils import util_call_with_backoff
from paperless_tika.parsers import TikaDocumentParser
@@ -13,22 +11,19 @@ from paperless_tika.parsers import TikaDocumentParser
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestTikaParserAgainstServer(TestCase):
@pytest.mark.django_db()
class TestTikaParserAgainstServer:
"""
This test case tests Tika parsing against a live Tika server,
if the environment contains the correct value indicating such a server
is available.
"""
SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()
def setUp(self) -> None:
self.parser = TikaDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
def test_basic_parse_odt(self):
def test_basic_parse_odt(
self,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
"""
GIVEN:
- An input ODT format document
@@ -38,26 +33,26 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct
- Document date is correct
"""
test_file = self.SAMPLE_DIR / Path("sample.odt")
util_call_with_backoff(
self.parser.parse,
[test_file, "application/vnd.oasis.opendocument.text"],
tika_parser.parse,
[sample_odt_file, "application/vnd.oasis.opendocument.text"],
)
self.assertEqual(
self.parser.text,
"This is an ODT test document, created September 14, 2022",
assert (
tika_parser.text
== "This is an ODT test document, created September 14, 2022"
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
# PDFs begin with the bytes %PDF-x.y
self.assertTrue(b"PDF-" in f.read()[:10])
assert tika_parser.archive_path is not None
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(self):
def test_basic_parse_docx(
self,
tika_parser: TikaDocumentParser,
sample_docx_file: Path,
):
"""
GIVEN:
- An input DOCX format document
@@ -67,27 +62,29 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct
- Document date is correct
"""
test_file = self.SAMPLE_DIR / Path("sample.docx")
util_call_with_backoff(
self.parser.parse,
tika_parser.parse,
[
test_file,
sample_docx_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
],
)
self.assertEqual(
self.parser.text,
"This is an DOCX test document, also made September 14, 2022",
assert (
tika_parser.text
== "This is an DOCX test document, also made September 14, 2022"
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10])
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert b"PDF-" in f.read()[:10]
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_doc(self):
def test_basic_parse_doc(
self,
tika_parser: TikaDocumentParser,
sample_doc_file: Path,
):
"""
GIVEN:
- An input DOC format document
@@ -97,22 +94,24 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct
- Document date is correct
"""
test_file = self.SAMPLE_DIR / "sample.doc"
util_call_with_backoff(
self.parser.parse,
[test_file, "application/msword"],
tika_parser.parse,
[sample_doc_file, "application/msword"],
)
self.assertIn(
"his is a test document, saved in the older .doc format",
self.parser.text,
assert (
"This is a test document, saved in the older .doc format"
in tika_parser.text
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10])
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert b"PDF-" in f.read()[:10]
def test_tika_fails_multi_part(self):
def test_tika_fails_multi_part(
self,
tika_parser: TikaDocumentParser,
sample_broken_odt: Path,
):
"""
GIVEN:
- An input ODT format document
@@ -125,13 +124,11 @@ class TestTikaParserAgainstServer(TestCase):
See also:
- https://issues.apache.org/jira/browse/TIKA-4110
"""
test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
util_call_with_backoff(
self.parser.parse,
[test_file, "application/vnd.oasis.opendocument.text"],
tika_parser.parse,
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10])
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert b"PDF-" in f.read()[:10]

View File

@@ -1,30 +1,30 @@
import datetime
import os
import zoneinfo
from http import HTTPStatus
from pathlib import Path
from django.test import TestCase
from django.test import override_settings
import pytest
from httpx import codes
from httpx._multipart import DataField
from rest_framework import status
from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class TestTikaParser(HttpxMockMixin, TestCase):
def setUp(self) -> None:
self.parser = TikaDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
@override_settings(TIME_ZONE="America/Chicago")
def test_parse(self):
@pytest.mark.django_db()
class TestTikaParser:
def test_parse(
self,
httpx_mock: HTTPXMock,
settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
settings.TIME_ZONE = "America/Chicago"
# Pretend parse response
self.httpx_mock.add_response(
httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
@@ -33,30 +33,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
},
)
# Pretend convert to PDF response
self.httpx_mock.add_response(content=b"PDF document")
httpx_mock.add_response(content=b"PDF document")
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
self.parser.parse(file, "application/vnd.oasis.opendocument.text")
assert tika_parser.text == "the content"
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert f.read() == b"PDF document"
self.assertEqual(self.parser.text, "the content")
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertEqual(f.read(), b"PDF document")
self.assertEqual(
self.parser.date,
datetime.datetime(
2020,
11,
21,
tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
),
assert tika_parser.date == datetime.datetime(
2020,
11,
21,
tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
)
def test_metadata(self):
self.httpx_mock.add_response(
def test_metadata(
self,
httpx_mock: HTTPXMock,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
httpx_mock.add_response(
json={
"Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [],
@@ -65,18 +64,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
},
)
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
metadata = self.parser.extract_metadata(
file,
metadata = tika_parser.extract_metadata(
sample_odt_file,
"application/vnd.oasis.opendocument.text",
)
self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
self.assertTrue("Some-key" in [m["key"] for m in metadata])
assert "dcterms:created" in [m["key"] for m in metadata]
assert "Some-key" in [m["key"] for m in metadata]
def test_convert_failure(self):
def test_convert_failure(
self,
httpx_mock: HTTPXMock,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
"""
GIVEN:
- Document needs to be converted to PDF
@@ -86,15 +87,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
- Parse error is raised
"""
# Pretend convert to PDF response
self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
with pytest.raises(ParseError):
tika_parser.convert_to_pdf(sample_odt_file, None)
with self.assertRaises(ParseError):
self.parser.convert_to_pdf(file, None)
def test_request_pdf_a_format(self):
@pytest.mark.parametrize(
("setting_value", "expected_form_value"),
[
("pdfa", "PDF/A-2b"),
("pdfa-1", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"),
("pdfa-3", "PDF/A-3b"),
],
)
def test_request_pdf_a_format(
self,
setting_value: str,
expected_form_value: str,
httpx_mock: HTTPXMock,
settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
"""
GIVEN:
- Document needs to be converted to PDF
@@ -103,31 +118,21 @@ class TestTikaParser(HttpxMockMixin, TestCase):
THEN:
- Request to Gotenberg contains the expected PDF/A format string
"""
file = Path(os.path.join(self.parser.tempdir, "input.odt"))
file.touch()
settings.OCR_OUTPUT_TYPE = setting_value
httpx_mock.add_response(
status_code=codes.OK,
content=b"PDF document",
method="POST",
)
for setting, expected_key in [
("pdfa", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"),
("pdfa-1", "PDF/A-2b"),
("pdfa-3", "PDF/A-3b"),
]:
with override_settings(OCR_OUTPUT_TYPE=setting):
self.httpx_mock.add_response(
status_code=codes.OK,
content=b"PDF document",
method="POST",
)
tika_parser.convert_to_pdf(sample_odt_file, None)
self.parser.convert_to_pdf(file, None)
request = httpx_mock.get_request()
found = False
for field in request.stream.fields:
if isinstance(field, DataField) and field.name == "pdfa":
assert field.value == expected_form_value
found = True
assert found, "pdfFormat was not found"
request = self.httpx_mock.get_request()
found = False
for field in request.stream.fields:
if isinstance(field, DataField) and field.name == "pdfa":
self.assertEqual(field.value, expected_key)
found = True
break
self.assertTrue(found)
self.httpx_mock.reset(assert_all_responses_were_requested=False)
httpx_mock.reset(assert_all_responses_were_requested=False)

View File

@@ -1,11 +0,0 @@
import pytest
from pytest_httpx import HTTPXMock
class HttpxMockMixin:
    """Mixin that exposes the ``httpx_mock`` fixture as ``self.httpx_mock``."""

    @pytest.fixture(autouse=True)
    def httpx_mock_auto(self, httpx_mock: HTTPXMock):
        """
        Bridge the pytest fixture into unittest style test classes.

        Being an autouse fixture, this stores the injected mock on the
        instance so test methods can reach it through ``self``.
        """
        self.httpx_mock = httpx_mock