diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py index e1af7cf86..b5efc7cef 100644 --- a/src/paperless_tika/tests/test_live_tika.py +++ b/src/paperless_tika/tests/test_live_tika.py @@ -1,10 +1,11 @@ -import datetime import os +import time from pathlib import Path from typing import Final import pytest from django.test import TestCase +from documents.parsers import ParseError from paperless_tika.parsers import TikaDocumentParser @@ -24,6 +25,44 @@ class TestTikaParserAgainstServer(TestCase): def tearDown(self) -> None: self.parser.cleanup() + def try_parse_with_wait(self, test_file, mime_type): + """ + For whatever reason, the image started during the test pipeline likes to + segfault sometimes, when run with the exact files that usually pass. + + So, this function will retry the parsing up to 3 times, with larger backoff + periods between each attempt, in hopes the issue resolves itself during + one attempt to parse. + + This will wait the following: + - Attempt 1 - 20s following failure + - Attempt 2 - 40s following failure + - Attempt 3 - 80s following failure + + """ + succeeded = False + retry_time = 20.0 + retry_count = 0 + max_retry_count = 3 + + while retry_count < max_retry_count and not succeeded: + try: + self.parser.parse(test_file, mime_type) + + succeeded = True + except Exception as e: + print(f"{e} during try #{retry_count}", flush=True) + + retry_count = retry_count + 1 + + time.sleep(retry_time) + retry_time = retry_time * 2.0 + + self.assertTrue( + succeeded, + "Continued Tika server errors after multiple retries", + ) + def test_basic_parse_odt(self): """ GIVEN: @@ -36,7 +75,7 @@ class TestTikaParserAgainstServer(TestCase): """ test_file = self.SAMPLE_DIR / Path("sample.odt") - self.parser.parse(test_file, "application/vnd.oasis.opendocument.text") + self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text") self.assertEqual( self.parser.text, @@ -62,7 +101,7 @@ class TestTikaParserAgainstServer(TestCase): """ test_file = self.SAMPLE_DIR / Path("sample.docx") - self.parser.parse( + self.try_parse_with_wait( test_file, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", )