mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-21 10:29:29 -05:00
138 lines
4.2 KiB
Python
138 lines
4.2 KiB
Python
import os
|
|
from pathlib import Path
|
|
from typing import Final
|
|
|
|
import pytest
|
|
from django.test import TestCase
|
|
|
|
from documents.tests.utils import util_call_with_backoff
|
|
from paperless_tika.parsers import TikaDocumentParser
|
|
|
|
|
|
@pytest.mark.skipif(
    "PAPERLESS_CI_TEST" not in os.environ,
    reason="No Gotenberg/Tika servers to test with",
)
class TestTikaParserAgainstServer(TestCase):
    """
    This test case tests the Tika parsing against a live tika server,
    if the environment contains the correct value indicating such a server
    is available.

    The whole class is skipped unless PAPERLESS_CI_TEST is present in the
    environment.
    """

    # Directory holding the sample documents, located next to this test file
    SAMPLE_DIR: Final[Path] = (Path(__file__).parent / "samples").resolve()

    def setUp(self) -> None:
        """Create a fresh parser for every test."""
        self.parser = TikaDocumentParser(logging_group=None)

    def tearDown(self) -> None:
        """Let the parser clean up after every test."""
        self.parser.cleanup()

    def _assert_archive_is_pdf(self) -> None:
        """
        Assert that the parser set an archive path and that the file at
        that path carries a PDF marker within its first 10 bytes.
        """
        self.assertIsNotNone(self.parser.archive_path)
        with open(self.parser.archive_path, "rb") as f:
            # PDFs begin with the bytes PDF-x.y, so reading the start of
            # the file is enough; no need to load the whole archive
            self.assertIn(b"PDF-", f.read(10))

    def test_basic_parse_odt(self) -> None:
        """
        GIVEN:
            - An input ODT format document
        WHEN:
            - The document is parsed
        THEN:
            - Document content is correct
            - The archive file is a PDF
        """
        test_file = self.SAMPLE_DIR / "sample.odt"

        util_call_with_backoff(
            self.parser.parse,
            [test_file, "application/vnd.oasis.opendocument.text"],
        )

        self.assertEqual(
            self.parser.text,
            "This is an ODT test document, created September 14, 2022",
        )
        self._assert_archive_is_pdf()

        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))

    def test_basic_parse_docx(self) -> None:
        """
        GIVEN:
            - An input DOCX format document
        WHEN:
            - The document is parsed
        THEN:
            - Document content is correct
            - The archive file is a PDF
        """
        test_file = self.SAMPLE_DIR / "sample.docx"

        util_call_with_backoff(
            self.parser.parse,
            [
                test_file,
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            ],
        )

        self.assertEqual(
            self.parser.text,
            "This is an DOCX test document, also made September 14, 2022",
        )
        self._assert_archive_is_pdf()

        # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))

    def test_basic_parse_doc(self) -> None:
        """
        GIVEN:
            - An input DOC format document
        WHEN:
            - The document is parsed
        THEN:
            - Document content contains the expected text
            - The archive file is a PDF
        """
        test_file = self.SAMPLE_DIR / "sample.doc"

        util_call_with_backoff(
            self.parser.parse,
            [test_file, "application/msword"],
        )

        # Substring check only: the parsed text is not compared in full
        self.assertIn(
            "his is a test document, saved in the older .doc format",
            self.parser.text,
        )
        self._assert_archive_is_pdf()

    def test_tika_fails_multi_part(self) -> None:
        """
        GIVEN:
            - An input ODT format document
            - The document is known to crash Tika when uploaded via multi-part form data
        WHEN:
            - The document is parsed
        THEN:
            - Parsing completes
            - The archive file is a PDF

        See also:
            - https://issues.apache.org/jira/browse/TIKA-4110
        """
        test_file = self.SAMPLE_DIR / "multi-part-broken.odt"

        util_call_with_backoff(
            self.parser.parse,
            [test_file, "application/vnd.oasis.opendocument.text"],
        )

        self._assert_archive_is_pdf()
|