mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-15 10:13:15 -05:00
Enables basic live testing against a Tika server, using actual sample documents, to catch additional errors that mocking would miss
This commit is contained in:
parent
5357775d42
commit
9c0c734b34
15
.github/workflows/ci.yml
vendored
15
.github/workflows/ci.yml
vendored
@ -82,6 +82,17 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
python-version: ['3.8', '3.9', '3.10']
|
python-version: ['3.8', '3.9', '3.10']
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
|
services:
|
||||||
|
tika:
|
||||||
|
image: ghcr.io/paperless-ngx/tika:latest
|
||||||
|
ports:
|
||||||
|
- "9998:9998/tcp"
|
||||||
|
gotenberg:
|
||||||
|
image: docker.io/gotenberg/gotenberg:7.4
|
||||||
|
ports:
|
||||||
|
- "3000:3000/tcp"
|
||||||
|
env:
|
||||||
|
TIKA_LIVE: 1
|
||||||
steps:
|
steps:
|
||||||
-
|
-
|
||||||
name: Checkout
|
name: Checkout
|
||||||
@ -91,7 +102,7 @@ jobs:
|
|||||||
-
|
-
|
||||||
name: Install pipenv
|
name: Install pipenv
|
||||||
run: |
|
run: |
|
||||||
pipx install pipenv==2022.8.5
|
pipx install pipenv
|
||||||
pipenv --version
|
pipenv --version
|
||||||
-
|
-
|
||||||
name: Set up Python
|
name: Set up Python
|
||||||
@ -117,7 +128,7 @@ jobs:
|
|||||||
name: Tests
|
name: Tests
|
||||||
run: |
|
run: |
|
||||||
cd src/
|
cd src/
|
||||||
pipenv run pytest
|
pipenv run pytest -rfEp
|
||||||
-
|
-
|
||||||
name: Get changed files
|
name: Get changed files
|
||||||
id: changed-files-specific
|
id: changed-files-specific
|
||||||
|
BIN
src/paperless_tika/tests/samples/sample.docx
Normal file
BIN
src/paperless_tika/tests/samples/sample.docx
Normal file
Binary file not shown.
BIN
src/paperless_tika/tests/samples/sample.odt
Normal file
BIN
src/paperless_tika/tests/samples/sample.odt
Normal file
Binary file not shown.
78
src/paperless_tika/tests/test_live_tika.py
Normal file
78
src/paperless_tika/tests/test_live_tika.py
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
import datetime
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Final
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from django.test import TestCase
|
||||||
|
from paperless_tika.parsers import TikaDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
class TestTikaParserAgainstServer(TestCase):
    """
    This test case tests the Tika parsing against a live tika server.

    The whole case is skipped unless the TIKA_LIVE variable is present in
    the environment, which indicates such a server is available.
    """

    # Directory holding the sample documents, located next to this module
    SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()

    def setUp(self) -> None:
        self.parser = TikaDocumentParser(logging_group=None)

    def tearDown(self) -> None:
        self.parser.cleanup()

    def _assert_archive_is_pdf(self) -> None:
        """
        Asserts the parser set an archive path and that the file it points
        to looks like a PDF.
        """
        self.assertIsNotNone(self.parser.archive_path)
        with open(self.parser.archive_path, "rb") as f:
            # PDFs begin with the bytes %PDF-x.y, so only the first few
            # bytes need to be read.  assertIn reports the actual bytes
            # on failure, unlike assertTrue
            self.assertIn(b"PDF-", f.read(10))

    def test_basic_parse_odt(self):
        """
        GIVEN:
            - An input ODT format document
        WHEN:
            - The document is parsed
        THEN:
            - Document content is correct
            - An archive PDF is produced
        """
        test_file = self.SAMPLE_DIR / "sample.odt"

        self.parser.parse(test_file, "application/vnd.oasis.opendocument.text")

        self.assertEqual(
            self.parser.text,
            "This is an ODT test document, created September 14, 2022",
        )
        self._assert_archive_is_pdf()

        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))

    def test_basic_parse_docx(self):
        """
        GIVEN:
            - An input DOCX format document
        WHEN:
            - The document is parsed
        THEN:
            - Document content is correct
            - An archive PDF is produced
        """
        test_file = self.SAMPLE_DIR / "sample.docx"

        self.parser.parse(
            test_file,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )

        self.assertEqual(
            self.parser.text,
            "This is an DOCX test document, also made September 14, 2022",
        )
        self._assert_archive_is_pdf()

        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
|
Loading…
x
Reference in New Issue
Block a user