Enables basic live testing against a Tika server with actual sample documents, to catch errors that mocking won't catch

This commit is contained in:
Trenton Holmes 2022-09-14 08:39:08 -07:00 committed by Trenton H
parent 5357775d42
commit 9c0c734b34
4 changed files with 91 additions and 2 deletions

View File

@ -82,6 +82,17 @@ jobs:
matrix:
python-version: ['3.8', '3.9', '3.10']
fail-fast: false
services:
tika:
image: ghcr.io/paperless-ngx/tika:latest
ports:
- "9998:9998/tcp"
gotenberg:
image: docker.io/gotenberg/gotenberg:7.4
ports:
- "3000:3000/tcp"
env:
TIKA_LIVE: 1
steps:
-
name: Checkout
@ -91,7 +102,7 @@ jobs:
-
name: Install pipenv
run: |
pipx install pipenv==2022.8.5
pipx install pipenv
pipenv --version
-
name: Set up Python
@ -117,7 +128,7 @@ jobs:
name: Tests
run: |
cd src/
pipenv run pytest
pipenv run pytest -rfEp
-
name: Get changed files
id: changed-files-specific

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,78 @@
import datetime
import os
from pathlib import Path
from typing import Final
import pytest
from django.test import TestCase
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
class TestTikaParserAgainstServer(TestCase):
    """
    Runs TikaDocumentParser against a live Tika server using real sample
    documents.

    The whole class is skipped unless the TIKA_LIVE environment variable
    is present, which signals that such a server is available.
    """

    # Directory holding the sample documents, located next to this module
    SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()

    def setUp(self) -> None:
        """Create a fresh parser for each test."""
        self.parser = TikaDocumentParser(logging_group=None)

    def tearDown(self) -> None:
        """Let the parser clean up after each test."""
        self.parser.cleanup()

    def _assert_archive_is_pdf(self) -> None:
        """
        Assert that the parser produced an archive file which looks like a PDF.

        Only the first 10 bytes are read, as the PDF header ("%PDF-x.y")
        sits at the very start of the file.
        """
        self.assertIsNotNone(self.parser.archive_path)

        with open(self.parser.archive_path, "rb") as f:
            header = f.read(10)

        # assertIn reports the actual header bytes on failure
        self.assertIn(b"PDF-", header)

    def test_basic_parse_odt(self):
        """
        GIVEN:
            - An input ODT format document
        WHEN:
            - The document is parsed
        THEN:
            - Document content is correct
            - An archive file is created and looks like a PDF
        """
        test_file = self.SAMPLE_DIR / "sample.odt"

        self.parser.parse(test_file, "application/vnd.oasis.opendocument.text")

        self.assertEqual(
            self.parser.text,
            "This is an ODT test document, created September 14, 2022",
        )

        self._assert_archive_is_pdf()

        # TODO: Unsure what can set the Creation-Date field in a document,
        # enable the date check when possible
        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))

    def test_basic_parse_docx(self):
        """
        GIVEN:
            - An input DOCX format document
        WHEN:
            - The document is parsed
        THEN:
            - Document content is correct
            - An archive file is created and looks like a PDF
        """
        test_file = self.SAMPLE_DIR / "sample.docx"

        self.parser.parse(
            test_file,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

        self.assertEqual(
            self.parser.text,
            "This is an DOCX test document, also made September 14, 2022",
        )

        self._assert_archive_is_pdf()

        # TODO: Unsure what can set the Creation-Date field in a document,
        # enable the date check when possible
        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))