Chore: Use a local http server instead of external to reduce flakiness (#11916)

This commit is contained in:
Trenton H
2026-01-27 19:57:12 -08:00
committed by GitHub
parent 56b5d838d7
commit 01b21377af
9 changed files with 145 additions and 53 deletions

View File

@@ -34,3 +34,13 @@ services:
ports:
- "3143:3143" # IMAP
restart: unless-stopped
nginx:
image: docker.io/nginx:1.29-alpine
hostname: nginx
container_name: nginx
ports:
- "8080:8080"
restart: unless-stopped
volumes:
- ../../docs/assets:/usr/share/nginx/html/assets:ro
- ./test-nginx.conf:/etc/nginx/conf.d/default.conf:ro

View File

@@ -0,0 +1,14 @@
server {
listen 8080;
server_name localhost;
root /usr/share/nginx/html;
# Enable CORS for test requests
add_header 'Access-Control-Allow-Origin' '*' always;
add_header 'Access-Control-Allow-Methods' 'GET, HEAD, OPTIONS' always;
location / {
try_files $uri $uri/ =404;
}
}

View File

@@ -300,6 +300,14 @@ norecursedirs = [ "src/locale/", ".venv/", "src-ui/" ]
DJANGO_SETTINGS_MODULE = "paperless.settings"
markers = [
"live: Integration tests requiring external services (Gotenberg, Tika, nginx, etc)",
"nginx: Tests that make HTTP requests to the local nginx service",
"gotenberg: Tests requiring Gotenberg service",
"tika: Tests requiring Tika service",
"greenmail: Tests requiring Greenmail service",
]
[tool.pytest_env]
PAPERLESS_DISABLE_DBHANDLER = "true"
PAPERLESS_CACHE_BACKEND = "django.core.cache.backends.locmem.LocMemCache"

View File

@@ -816,7 +816,7 @@ class TestCommandWatch:
f.flush()
sleep(0.05)
sleep(0.5)
sleep(0.8)
if thread.exception:
raise thread.exception
@@ -837,7 +837,7 @@ class TestCommandWatch:
(consumption_dir / "._document.pdf").write_bytes(b"test")
shutil.copy(sample_pdf, consumption_dir / "valid.pdf")
sleep(0.5)
sleep(0.8)
if thread.exception:
raise thread.exception

View File

@@ -89,3 +89,11 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]:
@pytest.fixture()
def mail_account_handler() -> MailAccountHandler:
return MailAccountHandler()
@pytest.fixture(scope="session")
def nginx_base_url() -> Generator[str, None, None]:
"""
The base URL for the nginx HTTP server we expect to be alive
"""
yield "http://localhost:8080"

View File

@@ -55,7 +55,7 @@ Content-Transfer-Encoding: 7bit
<p>Some Text</p>
<p>
<img src="cid:part1.pNdUSz0s.D3NqVtPg@example.de" alt="Has to be rewritten to work..">
<img src="https://docs.paperless-ngx.com/assets/logo_full_white.svg" alt="This image should not be shown.">
<img src="http://localhost:8080/assets/logo_full_white.svg" alt="This image should not be shown.">
</p>
<p>and an embedded image.<br>

View File

@@ -6,7 +6,7 @@
<p>Some Text</p>
<p>
<img src="cid:part1.pNdUSz0s.D3NqVtPg@example.de" alt="Has to be rewritten to work..">
<img src="https://docs.paperless-ngx.com/assets/logo_full_white.svg" alt="This image should not be shown.">
<img src="http://localhost:8080/assets/logo_full_white.svg" alt="This image should not be shown.">
</p>
<p>and an embedded image.<br>

View File

@@ -6,6 +6,8 @@ from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@pytest.mark.live
@pytest.mark.greenmail
@pytest.mark.django_db
class TestMailGreenmail:
"""

View File

@@ -17,7 +17,7 @@ from paperless_mail.parsers import MailDocumentParser
def extract_text(pdf_path: Path) -> str:
"""
Using pdftotext from poppler, extracts the text of a PDF into a file,
then reads the file contents and returns it
then reads the file contents and returns it.
"""
with tempfile.NamedTemporaryFile(
mode="w+",
@@ -38,71 +38,107 @@ def extract_text(pdf_path: Path) -> str:
class MailAttachmentMock:
def __init__(self, payload, content_id):
def __init__(self, payload: bytes, content_id: str) -> None:
self.payload = payload
self.content_id = content_id
self.content_type = "image/png"
@pytest.mark.live
@pytest.mark.nginx
@pytest.mark.skipif(
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestUrlCanary:
class TestNginxService:
"""
Verify certain URLs are still available so testing is valid still
Verify the local nginx server is responding correctly.
These tests validate that the test infrastructure is working properly
before running the actual parser tests that depend on HTTP resources.
"""
def test_online_image_exception_on_not_available(self):
def test_non_existent_resource_returns_404(
self,
nginx_base_url: str,
) -> None:
"""
GIVEN:
- Fresh start
- Local nginx server is running
WHEN:
- nonexistent image is requested
- A non-existent resource is requested
THEN:
- An exception shall be thrown
"""
"""
A public image is used in the html sample file. We have no control
whether this image stays online forever, so here we check if we can detect if is not
available anymore.
- An HTTP 404 status code shall be returned
"""
resp = httpx.get(
"https://docs.paperless-ngx.com/assets/non-existent.png",
f"{nginx_base_url}/assets/non-existent.png",
timeout=5.0,
)
with pytest.raises(httpx.HTTPStatusError) as exec_info:
resp.raise_for_status()
assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
def test_is_online_image_still_available(self):
def test_valid_resource_is_available(
self,
nginx_base_url: str,
) -> None:
"""
GIVEN:
- Fresh start
- Local nginx server is running
WHEN:
- A public image used in the html sample file is requested
- A valid test fixture resource is requested
THEN:
- No exception shall be thrown
- The resource shall be returned with HTTP 200 status code
- The response shall contain the expected content type
"""
"""
A public image is used in the html sample file. We have no control
whether this image stays online forever, so here we check if it is still there
"""
# Now check the URL used in samples/sample.html
resp = httpx.get(
"https://docs.paperless-ngx.com/assets/logo_full_white.svg",
f"{nginx_base_url}/assets/logo_full_white.svg",
timeout=5.0,
)
resp.raise_for_status()
assert resp.status_code == httpx.codes.OK
assert "svg" in resp.headers.get("content-type", "").lower()
def test_server_connectivity(
self,
nginx_base_url: str,
) -> None:
"""
GIVEN:
- Local test fixtures server should be running
WHEN:
- A request is made to the server root
THEN:
- The server shall respond without connection errors
"""
try:
resp = httpx.get(
nginx_base_url,
timeout=5.0,
follow_redirects=True,
)
# We don't care about the status code, just that we can connect
assert resp.status_code in {200, 404, 403}
except httpx.ConnectError as e:
pytest.fail(
f"Cannot connect to nginx server at {nginx_base_url}. "
f"Ensure the nginx container is running via docker-compose.ci-test.yml. "
f"Error: {e}",
)
@pytest.mark.live
@pytest.mark.gotenberg
@pytest.mark.tika
@pytest.mark.nginx
@pytest.mark.skipif(
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
class TestParserLive:
@staticmethod
def imagehash(file, hash_size=18):
def imagehash(file: Path, hash_size: int = 18) -> str:
return f"{average_hash(Image.open(file), hash_size)}"
def test_get_thumbnail(
@@ -112,14 +148,15 @@ class TestParserLive:
simple_txt_email_file: Path,
simple_txt_email_pdf_file: Path,
simple_txt_email_thumbnail_file: Path,
):
) -> None:
"""
GIVEN:
- Fresh start
- A simple text email file
- Mocked PDF generation returning a known PDF
WHEN:
- The Thumbnail is requested
- The thumbnail is requested
THEN:
- The returned thumbnail image file is as expected
- The returned thumbnail image file shall match the expected hash
"""
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
@@ -134,22 +171,28 @@ class TestParserLive:
assert self.imagehash(thumb) == self.imagehash(
simple_txt_email_thumbnail_file,
), (
f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}"
f"Created thumbnail {thumb} differs from expected file "
f"{simple_txt_email_thumbnail_file}"
)
def test_tika_parse_successful(self, mail_parser: MailDocumentParser):
def test_tika_parse_successful(self, mail_parser: MailDocumentParser) -> None:
"""
GIVEN:
- Fresh start
- HTML content to parse
- Tika server is running
WHEN:
- tika parsing is called
- Tika parsing is called
THEN:
- a web request to tika shall be done and the reply es returned
- A web request to Tika shall be made
- The parsed text content shall be returned
"""
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
html = (
'<html><head><meta http-equiv="content-type" '
'content="text/html; charset=UTF-8"></head>'
"<body><p>Some Text</p></body></html>"
)
expected_text = "Some Text"
# Check successful parsing
parsed = mail_parser.tika_parse(html)
assert expected_text == parsed.strip()
@@ -160,14 +203,17 @@ class TestParserLive:
html_email_file: Path,
merged_pdf_first: Path,
merged_pdf_second: Path,
):
) -> None:
"""
GIVEN:
- Intermediary pdfs to be merged
- Intermediary PDFs to be merged
- An HTML email file
WHEN:
- pdf generation is requested with html file requiring merging of pdfs
- PDF generation is requested with HTML file requiring merging
THEN:
- gotenberg is called to merge files and the resulting file is returned
- Gotenberg shall be called to merge files
- The resulting merged PDF shall be returned
- The merged PDF shall contain text from both source PDFs
"""
mock_generate_pdf_from_html = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
@@ -200,16 +246,17 @@ class TestParserLive:
html_email_file: Path,
html_email_pdf_file: Path,
html_email_thumbnail_file: Path,
):
) -> None:
"""
GIVEN:
- Fresh start
- An HTML email file
WHEN:
- pdf generation from simple eml file is requested
- PDF generation from the email file is requested
THEN:
- Gotenberg is called and the resulting file is returned and look as expected.
- Gotenberg shall be called to generate the PDF
- The archive PDF shall contain the expected content
- The generated thumbnail shall match the expected image hash
"""
util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"])
# Check the archive PDF
@@ -217,7 +264,7 @@ class TestParserLive:
archive_text = extract_text(archive_path)
expected_archive_text = extract_text(html_email_pdf_file)
# Archive includes the HTML content, so use in
# Archive includes the HTML content
assert expected_archive_text in archive_text
# Check the thumbnail
@@ -227,9 +274,12 @@ class TestParserLive:
)
generated_thumbnail_hash = self.imagehash(generated_thumbnail)
# The created pdf is not reproducible. But the converted image should always look the same.
# The created PDF is not reproducible, but the converted image
# should always look the same
expected_hash = self.imagehash(html_email_thumbnail_file)
assert generated_thumbnail_hash == expected_hash, (
f"PDF looks different. Check if {generated_thumbnail} looks weird."
f"PDF thumbnail differs from expected. "
f"Generated: {generated_thumbnail}, "
f"Hash: {generated_thumbnail_hash} vs {expected_hash}"
)