diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a199ce7ad..6b97837c8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -82,6 +82,17 @@ jobs:
       matrix:
         python-version: ['3.8', '3.9', '3.10']
       fail-fast: false
+    services:
+      tika:
+        image: ghcr.io/paperless-ngx/tika:latest
+        ports:
+          - "9998:9998/tcp"
+      gotenberg:
+        image: docker.io/gotenberg/gotenberg:7.4
+        ports:
+          - "3000:3000/tcp"
+    env:
+      TIKA_LIVE: 1
     steps:
       -
         name: Checkout
@@ -91,7 +102,7 @@ jobs:
       -
         name: Install pipenv
         run: |
-          pipx install pipenv==2022.8.5
+          pipx install pipenv
           pipenv --version
       -
         name: Set up Python
@@ -117,7 +128,7 @@ jobs:
         name: Tests
         run: |
           cd src/
-          pipenv run pytest
+          pipenv run pytest -rfEp
       -
         name: Get changed files
         id: changed-files-specific
diff --git a/src/paperless_tika/tests/samples/sample.docx b/src/paperless_tika/tests/samples/sample.docx
new file mode 100644
index 000000000..be6f33313
Binary files /dev/null and b/src/paperless_tika/tests/samples/sample.docx differ
diff --git a/src/paperless_tika/tests/samples/sample.odt b/src/paperless_tika/tests/samples/sample.odt
new file mode 100644
index 000000000..f0c291aa4
Binary files /dev/null and b/src/paperless_tika/tests/samples/sample.odt differ
diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py
new file mode 100644
index 000000000..e1af7cf86
--- /dev/null
+++ b/src/paperless_tika/tests/test_live_tika.py
@@ -0,0 +1,87 @@
+import datetime
+import os
+from pathlib import Path
+from typing import Final
+
+import pytest
+from django.test import TestCase
+from paperless_tika.parsers import TikaDocumentParser
+
+
+@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
+class TestTikaParserAgainstServer(TestCase):
+    """
+    This test case tests the Tika parsing against a live tika server.
+
+    The whole class is skipped unless the TIKA_LIVE environment variable
+    is set (to any value), indicating such a server is available.
+    """
+
+    SAMPLE_DIR: Final[Path] = (Path(__file__).parent / "samples").resolve()
+
+    def setUp(self) -> None:
+        # unittest runs this before every test, so each test gets its own parser
+        self.parser = TikaDocumentParser(logging_group=None)
+
+    def tearDown(self) -> None:
+        self.parser.cleanup()
+
+    def _assert_archive_is_pdf(self) -> None:
+        """
+        Asserts the parser produced an archive file which looks like a PDF
+        """
+        archive_path = self.parser.archive_path
+        self.assertIsNotNone(archive_path)
+        with open(archive_path, "rb") as f:
+            # PDFs begin with the bytes %PDF-x.y, so only the start of the
+            # file is read rather than the whole archive
+            self.assertIn(b"PDF-", f.read(10))
+
+    def test_basic_parse_odt(self):
+        """
+        GIVEN:
+            - An input ODT format document
+        WHEN:
+            - The document is parsed
+        THEN:
+            - Document content is correct
+            - Document archive is a PDF
+        """
+        test_file = self.SAMPLE_DIR / "sample.odt"
+
+        self.parser.parse(test_file, "application/vnd.oasis.opendocument.text")
+
+        self.assertEqual(
+            self.parser.text,
+            "This is an ODT test document, created September 14, 2022",
+        )
+        self._assert_archive_is_pdf()
+
+        # TODO: Unsure what can set the Creation-Date field in a document.
+        # When possible, also assert self.parser.date is
+        # datetime.datetime(2022, 9, 14)
+
+    def test_basic_parse_docx(self):
+        """
+        GIVEN:
+            - An input DOCX format document
+        WHEN:
+            - The document is parsed
+        THEN:
+            - Document content is correct
+            - Document archive is a PDF
+        """
+        test_file = self.SAMPLE_DIR / "sample.docx"
+
+        self.parser.parse(
+            test_file,
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+
+        self.assertEqual(
+            self.parser.text,
+            "This is an DOCX test document, also made September 14, 2022",
+        )
+        self._assert_archive_is_pdf()
+
+        # TODO: Assert self.parser.date here too, see test_basic_parse_odt