import os import time from pathlib import Path from typing import Final import pytest from django.test import TestCase from paperless_tika.parsers import TikaDocumentParser @pytest.mark.skipif( "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) class TestTikaParserAgainstServer(TestCase): """ This test case tests the Tika parsing against a live tika server, if the environment contains the correct value indicating such a server is available. """ SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve() def setUp(self) -> None: self.parser = TikaDocumentParser(logging_group=None) def tearDown(self) -> None: self.parser.cleanup() def try_parse_with_wait(self, test_file: Path, mime_type: str): """ For whatever reason, the image started during the test pipeline likes to segfault sometimes, when run with the exact files that usually pass. So, this function will retry the parsing up to 3 times, with larger backoff periods between each attempt, in hopes the issue resolves itself during one attempt to parse. This will wait the following: - Attempt 1 - 20s following failure - Attempt 2 - 40s following failure - Attempt 3 - 80s following failure """ succeeded = False retry_time = 20.0 retry_count = 0 max_retry_count = 3 while retry_count < max_retry_count and not succeeded: try: self.parser.parse(test_file, mime_type) succeeded = True except Exception as e: print(f"{e} during try #{retry_count}", flush=True) retry_count = retry_count + 1 time.sleep(retry_time) retry_time = retry_time * 2.0 self.assertTrue( succeeded, "Continued Tika server errors after multiple retries", ) def test_basic_parse_odt(self): """ GIVEN: - An input ODT format document WHEN: - The document is parsed THEN: - Document content is correct - Document date is correct """ test_file = self.SAMPLE_DIR / Path("sample.odt") self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text") self.assertEqual( self.parser.text, "This is an ODT test document, created September 14, 2022", ) self.assertIsNotNone(self.parser.archive_path) with open(self.parser.archive_path, "rb") as f: # PDFs begin with the bytes PDF-x.y self.assertTrue(b"PDF-" in f.read()[:10]) # TODO: Unsure what can set the Creation-Date field in a document, enable when possible # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) def test_basic_parse_docx(self): """ GIVEN: - An input DOCX format document WHEN: - The document is parsed THEN: - Document content is correct - Document date is correct """ test_file = self.SAMPLE_DIR / Path("sample.docx") self.try_parse_with_wait( test_file, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ) self.assertEqual( self.parser.text, "This is an DOCX test document, also made September 14, 2022", ) self.assertIsNotNone(self.parser.archive_path) with open(self.parser.archive_path, "rb") as f: self.assertTrue(b"PDF-" in f.read()[:10]) # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))