From a1697ff21c4d8eb58c72762f87045d3fcaaf1aa4 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:25:20 -0700 Subject: [PATCH] Combine and extend the utility for calling the live services to be more robust against failures, reporting, etc --- src/documents/tests/utils.py | 62 +++++++++++++++++++ src/paperless_mail/tests/test_parsers_live.py | 51 +-------------- src/paperless_tika/tests/test_live_tika.py | 60 +++++------------- 3 files changed, 79 insertions(+), 94 deletions(-) diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index 483d3b12d..c679a8f02 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -1,14 +1,21 @@ import shutil import tempfile +import time +import warnings from collections import namedtuple from contextlib import contextmanager from os import PathLike from pathlib import Path +from typing import Any +from typing import Callable from typing import Iterator +from typing import List from typing import Tuple from typing import Union from unittest import mock +import httpx +import pytest from django.apps import apps from django.db import connection from django.db.migrations.executor import MigrationExecutor @@ -78,6 +85,61 @@ def paperless_environment(): remove_dirs(dirs) +def util_call_with_backoff( + method_or_callable: Callable, + args: Union[List, Tuple], + *, + skip_on_503=True, +) -> Tuple[bool, Any]: + """ + For whatever reason, the images started during the test pipeline like to + segfault sometimes, crash and otherwise fail randomly, when run with the + exact files that usually pass. + + So, this function will retry the given method/function up to 3 times, with larger backoff + periods between each attempt, in hopes the issue resolves itself during + one attempt to parse. + + This will wait the following: + - Attempt 1 - 20s following failure + - Attempt 2 - 40s following failure + - Attempt 3 - 80s following failure + + """ + result = None + succeeded = False + retry_time = 20.0 + retry_count = 0 + status_codes = [] + max_retry_count = 3 + + while retry_count < max_retry_count and not succeeded: + try: + result = method_or_callable(*args) + + succeeded = True + except httpx.HTTPError as exc: + warnings.warn(f"HTTP Exception for {exc.request.url} - {exc}") + + if isinstance(exc, httpx.HTTPStatusError): + status_codes.append(exc.response.status_code) + + retry_count = retry_count + 1 + + time.sleep(retry_time) + retry_time = retry_time * 2.0 + + if ( + not succeeded + and status_codes + and skip_on_503 + and all(element == httpx.codes.SERVICE_UNAVAILABLE for element in status_codes) + ): + pytest.skip("Repeated HTTP 503 for service") + + return succeeded, result + + class DirectoriesMixin: def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/src/paperless_mail/tests/test_parsers_live.py b/src/paperless_mail/tests/test_parsers_live.py index 208383b15..c58c1dfbc 100644 --- a/src/paperless_mail/tests/test_parsers_live.py +++ b/src/paperless_mail/tests/test_parsers_live.py @@ -1,5 +1,4 @@ import os -import time from unittest import mock import httpx @@ -10,6 +9,7 @@ from pdfminer.high_level import extract_text from PIL import Image from documents.tests.utils import FileSystemAssertsMixin +from documents.tests.utils import util_call_with_backoff from paperless_mail.tests.test_parsers import BaseMailParserTestCase @@ -79,51 +79,6 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): def imagehash(file, hash_size=18): return f"{average_hash(Image.open(file), hash_size)}" - def util_call_with_backoff(self, method_or_callable, args): - """ - For whatever reason, the image started during the test pipeline likes to - segfault sometimes, when run with the exact files that usually pass. - - So, this function will retry the parsing up to 3 times, with larger backoff - periods between each attempt, in hopes the issue resolves itself during - one attempt to parse. - - This will wait the following: - - Attempt 1 - 20s following failure - - Attempt 2 - 40s following failure - - Attempt 3 - 80s following failure - - """ - result = None - succeeded = False - retry_time = 20.0 - retry_count = 0 - max_retry_count = 3 - - while retry_count < max_retry_count and not succeeded: - try: - result = method_or_callable(*args) - - succeeded = True - except httpx.HTTPError as e: - # Retry on HTTP errors - print(f"{e} during try #{retry_count}", flush=True) - - retry_count = retry_count + 1 - - time.sleep(retry_time) - retry_time = retry_time * 2.0 - except Exception: - # Not on other error - raise - - self.assertTrue( - succeeded, - "Continued Tika server errors after multiple retries", - ) - - return result - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock): """ @@ -187,7 +142,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): self.SAMPLE_DIR / "html.eml", ) - pdf_path = self.util_call_with_backoff( + _, pdf_path = util_call_with_backoff( self.parser.generate_pdf, [msg], ) @@ -210,7 +165,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): - gotenberg is called and the resulting file is returned and look as expected. """ - self.util_call_with_backoff( + util_call_with_backoff( self.parser.parse, [self.SAMPLE_DIR / "html.eml", "message/rfc822"], ) diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py index f4c8e0134..f34278467 100644 --- a/src/paperless_tika/tests/test_live_tika.py +++ b/src/paperless_tika/tests/test_live_tika.py @@ -1,11 +1,11 @@ import os -import time from pathlib import Path from typing import Final import pytest from django.test import TestCase +from documents.tests.utils import util_call_with_backoff from paperless_tika.parsers import TikaDocumentParser @@ -28,44 +28,6 @@ class TestTikaParserAgainstServer(TestCase): def tearDown(self) -> None: self.parser.cleanup() - def try_parse_with_wait(self, test_file: Path, mime_type: str): - """ - For whatever reason, the image started during the test pipeline likes to - segfault sometimes, when run with the exact files that usually pass. - - So, this function will retry the parsing up to 3 times, with larger backoff - periods between each attempt, in hopes the issue resolves itself during - one attempt to parse. - - This will wait the following: - - Attempt 1 - 20s following failure - - Attempt 2 - 40s following failure - - Attempt 3 - 80s following failure - - """ - succeeded = False - retry_time = 20.0 - retry_count = 0 - max_retry_count = 3 - - while retry_count < max_retry_count and not succeeded: - try: - self.parser.parse(test_file, mime_type) - - succeeded = True - except Exception as e: - print(f"{e} during try #{retry_count}", flush=True) - - retry_count = retry_count + 1 - - time.sleep(retry_time) - retry_time = retry_time * 2.0 - - self.assertTrue( - succeeded, - "Continued Tika server errors after multiple retries", - ) - def test_basic_parse_odt(self): """ GIVEN: @@ -78,7 +40,10 @@ class TestTikaParserAgainstServer(TestCase): """ test_file = self.SAMPLE_DIR / Path("sample.odt") - self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text") + util_call_with_backoff( + self.parser.parse, + [test_file, "application/vnd.oasis.opendocument.text"], + ) self.assertEqual( self.parser.text, @@ -104,9 +69,12 @@ class TestTikaParserAgainstServer(TestCase): """ test_file = self.SAMPLE_DIR / Path("sample.docx") - self.try_parse_with_wait( - test_file, - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + util_call_with_backoff( + self.parser.parse, + [ + test_file, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ], ) self.assertEqual( @@ -131,9 +99,9 @@ class TestTikaParserAgainstServer(TestCase): """ test_file = self.SAMPLE_DIR / "sample.doc" - self.try_parse_with_wait( - test_file, - "application/msword", + util_call_with_backoff( + self.parser.parse, + [test_file, "application/msword"], ) self.assertIn(