Combine and extend the utility for calling the live services to be more robust against failures, reporting, etc

This commit is contained in:
Trenton H 2023-08-29 10:25:20 -07:00
parent b9fdf68be3
commit a1697ff21c
3 changed files with 79 additions and 94 deletions

View File

@ -1,14 +1,21 @@
import shutil import shutil
import tempfile import tempfile
import time
import warnings
from collections import namedtuple from collections import namedtuple
from contextlib import contextmanager from contextlib import contextmanager
from os import PathLike from os import PathLike
from pathlib import Path from pathlib import Path
from typing import Any
from typing import Callable
from typing import Iterator from typing import Iterator
from typing import List
from typing import Tuple from typing import Tuple
from typing import Union from typing import Union
from unittest import mock from unittest import mock
import httpx
import pytest
from django.apps import apps from django.apps import apps
from django.db import connection from django.db import connection
from django.db.migrations.executor import MigrationExecutor from django.db.migrations.executor import MigrationExecutor
@ -78,6 +85,61 @@ def paperless_environment():
remove_dirs(dirs) remove_dirs(dirs)
def util_call_with_backoff(
    method_or_callable: Callable,
    args: Union[List, Tuple],
    *,
    skip_on_503=True,
) -> Tuple[bool, Any]:
    """
    Call ``method_or_callable(*args)``, retrying with backoff on HTTP errors.

    For whatever reason, the images started during the test pipeline like to
    segfault sometimes, crash and otherwise fail randomly, when run with the
    exact files that usually pass.

    So, this function will try the given method/function up to 3 times, with
    larger backoff periods between each attempt, in hopes the issue resolves
    itself during one attempt.

    This will wait the following:
        - Attempt 1 fails - wait 20s, then retry
        - Attempt 2 fails - wait 40s, then retry
        - Attempt 3 fails - give up immediately (nothing left to wait for)

    Only ``httpx.HTTPError`` (and its subclasses) triggers a retry.  Any other
    exception, including one that merely wraps an HTTP failure in a different
    exception type, propagates to the caller on the first occurrence.

    :param method_or_callable: the callable to invoke
    :param args: positional arguments unpacked into the callable
    :param skip_on_503: when True, and every recorded HTTP status error over
        all failed attempts was a 503, the running test is skipped through
        ``pytest.skip`` instead of returning a failure
    :return: ``(succeeded, result)``; ``result`` is the callable's return
        value, or ``None`` when every attempt failed
    """
    result = None
    succeeded = False
    retry_time = 20.0
    max_retry_count = 3
    status_codes = []

    for attempt in range(1, max_retry_count + 1):
        try:
            result = method_or_callable(*args)
            succeeded = True
            break
        except httpx.HTTPError as exc:
            # httpx raises RuntimeError from the .request property when no
            # request was attached to the exception; do not let the act of
            # reporting hide the actual HTTP failure.
            try:
                url = exc.request.url
            except RuntimeError:
                url = "<unknown URL>"
            warnings.warn(f"HTTP Exception for {url} - {exc}", stacklevel=2)

            if isinstance(exc, httpx.HTTPStatusError):
                status_codes.append(exc.response.status_code)

            # Only back off when another attempt will actually follow
            if attempt < max_retry_count:
                time.sleep(retry_time)
                retry_time = retry_time * 2.0

    if (
        not succeeded
        and status_codes
        and skip_on_503
        and all(code == httpx.codes.SERVICE_UNAVAILABLE for code in status_codes)
    ):
        # The service never became available; that is an infrastructure
        # problem, not a failure of the code under test.
        pytest.skip("Repeated HTTP 503 for service")

    return succeeded, result
class DirectoriesMixin: class DirectoriesMixin:
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)

View File

@ -1,5 +1,4 @@
import os import os
import time
from unittest import mock from unittest import mock
import httpx import httpx
@ -10,6 +9,7 @@ from pdfminer.high_level import extract_text
from PIL import Image from PIL import Image
from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import util_call_with_backoff
from paperless_mail.tests.test_parsers import BaseMailParserTestCase from paperless_mail.tests.test_parsers import BaseMailParserTestCase
@ -79,51 +79,6 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
def imagehash(file, hash_size=18): def imagehash(file, hash_size=18):
return f"{average_hash(Image.open(file), hash_size)}" return f"{average_hash(Image.open(file), hash_size)}"
def util_call_with_backoff(self, method_or_callable, args):
    """
    Call ``method_or_callable(*args)``, retrying with backoff on HTTP errors.

    For whatever reason, the image started during the test pipeline likes to
    segfault sometimes, when run with the exact files that usually pass.

    So, this function will try the call up to 3 times, with larger backoff
    periods between each attempt, in hopes the issue resolves itself during
    one attempt.

    This will wait the following:
        - Attempt 1 fails - wait 20s, then retry
        - Attempt 2 fails - wait 40s, then retry
        - Attempt 3 fails - give up immediately (nothing left to wait for)

    Only ``httpx.HTTPError`` triggers a retry; any other exception propagates
    unchanged.  If no attempt succeeds, the test fails via ``assertTrue``.

    :param method_or_callable: the callable to invoke
    :param args: positional arguments unpacked into the callable
    :return: the return value of the first successful call
    """
    result = None
    succeeded = False
    retry_time = 20.0
    max_retry_count = 3

    for retry_count in range(max_retry_count):
        try:
            result = method_or_callable(*args)
        except httpx.HTTPError as e:
            # Retry on HTTP errors only
            print(f"{e} during try #{retry_count}", flush=True)
            # Only back off when another attempt will actually follow
            if retry_count + 1 < max_retry_count:
                time.sleep(retry_time)
                retry_time = retry_time * 2.0
        else:
            succeeded = True
            break

    self.assertTrue(
        succeeded,
        "Continued Tika server errors after multiple retries",
    )

    return result
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock): def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
""" """
@ -187,7 +142,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
self.SAMPLE_DIR / "html.eml", self.SAMPLE_DIR / "html.eml",
) )
pdf_path = self.util_call_with_backoff( _, pdf_path = util_call_with_backoff(
self.parser.generate_pdf, self.parser.generate_pdf,
[msg], [msg],
) )
@ -210,7 +165,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
- gotenberg is called and the resulting file is returned and look as expected. - gotenberg is called and the resulting file is returned and look as expected.
""" """
self.util_call_with_backoff( util_call_with_backoff(
self.parser.parse, self.parser.parse,
[self.SAMPLE_DIR / "html.eml", "message/rfc822"], [self.SAMPLE_DIR / "html.eml", "message/rfc822"],
) )

View File

@ -1,11 +1,11 @@
import os import os
import time
from pathlib import Path from pathlib import Path
from typing import Final from typing import Final
import pytest import pytest
from django.test import TestCase from django.test import TestCase
from documents.tests.utils import util_call_with_backoff
from paperless_tika.parsers import TikaDocumentParser from paperless_tika.parsers import TikaDocumentParser
@ -28,44 +28,6 @@ class TestTikaParserAgainstServer(TestCase):
def tearDown(self) -> None: def tearDown(self) -> None:
self.parser.cleanup() self.parser.cleanup()
def try_parse_with_wait(self, test_file: Path, mime_type: str):
    """
    Run ``self.parser.parse(test_file, mime_type)``, retrying with backoff.

    For whatever reason, the image started during the test pipeline likes to
    segfault sometimes, when run with the exact files that usually pass.

    So, this function will try the parsing up to 3 times, with larger backoff
    periods between each attempt, in hopes the issue resolves itself during
    one attempt to parse.

    This will wait the following:
        - Attempt 1 fails - wait 20s, then retry
        - Attempt 2 fails - wait 40s, then retry
        - Attempt 3 fails - give up immediately (nothing left to wait for)

    If no attempt succeeds, the test fails via ``assertTrue``.

    :param test_file: path of the document handed to the parser
    :param mime_type: MIME type handed to the parser
    """
    succeeded = False
    retry_time = 20.0
    max_retry_count = 3

    for retry_count in range(max_retry_count):
        try:
            self.parser.parse(test_file, mime_type)
        except Exception as e:
            # Deliberately broad: any failure of the parse is treated as a
            # possibly transient server problem and retried.
            print(f"{e} during try #{retry_count}", flush=True)
            # Only back off when another attempt will actually follow
            if retry_count + 1 < max_retry_count:
                time.sleep(retry_time)
                retry_time = retry_time * 2.0
        else:
            succeeded = True
            break

    self.assertTrue(
        succeeded,
        "Continued Tika server errors after multiple retries",
    )
def test_basic_parse_odt(self): def test_basic_parse_odt(self):
""" """
GIVEN: GIVEN:
@ -78,7 +40,10 @@ class TestTikaParserAgainstServer(TestCase):
""" """
test_file = self.SAMPLE_DIR / Path("sample.odt") test_file = self.SAMPLE_DIR / Path("sample.odt")
self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text") util_call_with_backoff(
self.parser.parse,
[test_file, "application/vnd.oasis.opendocument.text"],
)
self.assertEqual( self.assertEqual(
self.parser.text, self.parser.text,
@ -104,9 +69,12 @@ class TestTikaParserAgainstServer(TestCase):
""" """
test_file = self.SAMPLE_DIR / Path("sample.docx") test_file = self.SAMPLE_DIR / Path("sample.docx")
self.try_parse_with_wait( util_call_with_backoff(
test_file, self.parser.parse,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", [
test_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
],
) )
self.assertEqual( self.assertEqual(
@ -131,9 +99,9 @@ class TestTikaParserAgainstServer(TestCase):
""" """
test_file = self.SAMPLE_DIR / "sample.doc" test_file = self.SAMPLE_DIR / "sample.doc"
self.try_parse_with_wait( util_call_with_backoff(
test_file, self.parser.parse,
"application/msword", [test_file, "application/msword"],
) )
self.assertIn( self.assertIn(