mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Combine and extend the utility for calling the live services to be more robust against failures, reporting, etc
This commit is contained in:
parent
b9fdf68be3
commit
a1697ff21c
@ -1,14 +1,21 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from collections import namedtuple
|
||||
from contextlib import contextmanager
|
||||
from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Callable
|
||||
from typing import Iterator
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
from typing import Union
|
||||
from unittest import mock
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
from django.apps import apps
|
||||
from django.db import connection
|
||||
from django.db.migrations.executor import MigrationExecutor
|
||||
@ -78,6 +85,61 @@ def paperless_environment():
|
||||
remove_dirs(dirs)
|
||||
|
||||
|
||||
def util_call_with_backoff(
    method_or_callable: Callable,
    args: Union[List, Tuple],
    *,
    skip_on_503=True,
) -> Tuple[bool, Any]:
    """
    For whatever reason, the images started during the test pipeline like to
    segfault sometimes, crash and otherwise fail randomly, when run with the
    exact files that usually pass.

    So, this function will try the given method/function up to 3 times, with
    larger backoff periods between attempts, in hopes the issue resolves
    itself during one of them.

    This will wait the following:
        - Attempt 1 - 20s following failure
        - Attempt 2 - 40s following failure
        - Attempt 3 - no wait, it is the final attempt

    Only httpx.HTTPError (and its subclasses) triggers a retry; any other
    exception raised by the callable propagates to the caller immediately.

    :param method_or_callable: The callable to invoke
    :param args: Positional arguments, unpacked into the callable
    :param skip_on_503: If True and every attempt failed, and every HTTP
        status code recorded was 503, the running test is skipped via
        pytest.skip instead of returning a failure
    :return: Tuple of (succeeded, result). The result is whatever the callable
        returned, or None if no attempt succeeded
    """
    result = None
    succeeded = False
    retry_time = 20.0
    retry_count = 0
    status_codes = []
    max_retry_count = 3

    while retry_count < max_retry_count and not succeeded:
        try:
            result = method_or_callable(*args)

            succeeded = True
        except httpx.HTTPError as exc:
            # httpx raises RuntimeError from .request when no request was
            # attached to the exception. Reporting the failure must not be
            # what aborts the retry loop
            try:
                request_url = exc.request.url
            except RuntimeError:
                request_url = "<unknown request>"
            warnings.warn(f"HTTP Exception for {request_url} - {exc}")

            # Only status errors carry a response; remember the code so the
            # 503 skip check below can inspect it
            if isinstance(exc, httpx.HTTPStatusError):
                status_codes.append(exc.response.status_code)

            retry_count = retry_count + 1

            # Nothing follows the final attempt, so waiting again would only
            # delay reporting the failure
            if retry_count < max_retry_count:
                time.sleep(retry_time)
                retry_time = retry_time * 2.0

    if (
        not succeeded
        and status_codes
        and skip_on_503
        and all(element == httpx.codes.SERVICE_UNAVAILABLE for element in status_codes)
    ):
        pytest.skip("Repeated HTTP 503 for service")

    return succeeded, result
|
||||
|
||||
|
||||
class DirectoriesMixin:
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
@ -1,5 +1,4 @@
|
||||
import os
|
||||
import time
|
||||
from unittest import mock
|
||||
|
||||
import httpx
|
||||
@ -10,6 +9,7 @@ from pdfminer.high_level import extract_text
|
||||
from PIL import Image
|
||||
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from documents.tests.utils import util_call_with_backoff
|
||||
from paperless_mail.tests.test_parsers import BaseMailParserTestCase
|
||||
|
||||
|
||||
@ -79,51 +79,6 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
||||
def imagehash(file, hash_size=18):
|
||||
return f"{average_hash(Image.open(file), hash_size)}"
|
||||
|
||||
def util_call_with_backoff(self, method_or_callable, args):
    """
    For whatever reason, the image started during the test pipeline likes to
    segfault sometimes, when run with the exact files that usually pass.

    So, this function will try the call up to 3 times, with larger backoff
    periods between attempts, in hopes the issue resolves itself during
    one of them.

    This will wait the following:
        - Attempt 1 - 20s following failure
        - Attempt 2 - 40s following failure
        - Attempt 3 - no wait, it is the final attempt

    Only httpx.HTTPError triggers a retry; any other exception propagates
    unchanged. If every attempt fails, the test fails via assertTrue.

    :param method_or_callable: The callable to invoke
    :param args: Positional arguments, unpacked into the callable
    :return: Whatever the callable returned on its successful attempt
    """
    result = None
    succeeded = False
    retry_time = 20.0
    retry_count = 0
    max_retry_count = 3

    while retry_count < max_retry_count and not succeeded:
        try:
            result = method_or_callable(*args)

            succeeded = True
        except httpx.HTTPError as e:
            # Retry on HTTP errors only. Other exceptions are not caught
            # here, so they reach the caller as-is
            print(f"{e} during try #{retry_count}", flush=True)

            retry_count = retry_count + 1

            # No further attempt follows the last failure, so do not wait
            if retry_count < max_retry_count:
                time.sleep(retry_time)
                retry_time = retry_time * 2.0

    self.assertTrue(
        succeeded,
        "Continued Tika server errors after multiple retries",
    )

    return result
|
||||
|
||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
|
||||
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
|
||||
"""
|
||||
@ -187,7 +142,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
||||
self.SAMPLE_DIR / "html.eml",
|
||||
)
|
||||
|
||||
pdf_path = self.util_call_with_backoff(
|
||||
_, pdf_path = util_call_with_backoff(
|
||||
self.parser.generate_pdf,
|
||||
[msg],
|
||||
)
|
||||
@ -210,7 +165,7 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
|
||||
- gotenberg is called and the resulting file is returned and look as expected.
|
||||
"""
|
||||
|
||||
self.util_call_with_backoff(
|
||||
util_call_with_backoff(
|
||||
self.parser.parse,
|
||||
[self.SAMPLE_DIR / "html.eml", "message/rfc822"],
|
||||
)
|
||||
|
@ -1,11 +1,11 @@
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Final
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.tests.utils import util_call_with_backoff
|
||||
from paperless_tika.parsers import TikaDocumentParser
|
||||
|
||||
|
||||
@ -28,44 +28,6 @@ class TestTikaParserAgainstServer(TestCase):
|
||||
def tearDown(self) -> None:
|
||||
self.parser.cleanup()
|
||||
|
||||
def try_parse_with_wait(self, test_file: Path, mime_type: str):
    """
    For whatever reason, the image started during the test pipeline likes to
    segfault sometimes, when run with the exact files that usually pass.

    So, this function will try the parsing up to 3 times, with larger backoff
    periods between attempts, in hopes the issue resolves itself during
    one attempt to parse.

    This will wait the following:
        - Attempt 1 - 20s following failure
        - Attempt 2 - 40s following failure
        - Attempt 3 - no wait, it is the final attempt

    Every exception raised by the parser counts as a failed attempt. If all
    attempts fail, the test fails via assertTrue.

    :param test_file: The file handed to self.parser.parse
    :param mime_type: The MIME type handed to self.parser.parse
    """
    succeeded = False
    retry_time = 20.0
    retry_count = 0
    max_retry_count = 3

    while retry_count < max_retry_count and not succeeded:
        try:
            self.parser.parse(test_file, mime_type)

            succeeded = True
        except Exception as e:
            # Deliberately broad: any parser failure is retried
            print(f"{e} during try #{retry_count}", flush=True)

            retry_count = retry_count + 1

            # No further attempt follows the last failure, so do not wait
            if retry_count < max_retry_count:
                time.sleep(retry_time)
                retry_time = retry_time * 2.0

    self.assertTrue(
        succeeded,
        "Continued Tika server errors after multiple retries",
    )
|
||||
|
||||
def test_basic_parse_odt(self):
|
||||
"""
|
||||
GIVEN:
|
||||
@ -78,7 +40,10 @@ class TestTikaParserAgainstServer(TestCase):
|
||||
"""
|
||||
test_file = self.SAMPLE_DIR / Path("sample.odt")
|
||||
|
||||
self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text")
|
||||
util_call_with_backoff(
|
||||
self.parser.parse,
|
||||
[test_file, "application/vnd.oasis.opendocument.text"],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.parser.text,
|
||||
@ -104,9 +69,12 @@ class TestTikaParserAgainstServer(TestCase):
|
||||
"""
|
||||
test_file = self.SAMPLE_DIR / Path("sample.docx")
|
||||
|
||||
self.try_parse_with_wait(
|
||||
test_file,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
util_call_with_backoff(
|
||||
self.parser.parse,
|
||||
[
|
||||
test_file,
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
],
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@ -131,9 +99,9 @@ class TestTikaParserAgainstServer(TestCase):
|
||||
"""
|
||||
test_file = self.SAMPLE_DIR / "sample.doc"
|
||||
|
||||
self.try_parse_with_wait(
|
||||
test_file,
|
||||
"application/msword",
|
||||
util_call_with_backoff(
|
||||
self.parser.parse,
|
||||
[test_file, "application/msword"],
|
||||
)
|
||||
|
||||
self.assertIn(
|
||||
|
Loading…
x
Reference in New Issue
Block a user