From 01b21377afb4b042864b955f1bc87a69b22d2ed5 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 27 Jan 2026 19:57:12 -0800 Subject: [PATCH] Chore: Use a local http server instead of external to reduce flakiness (#11916) --- docker/compose/docker-compose.ci-test.yml | 10 ++ docker/compose/test-nginx.conf | 14 ++ pyproject.toml | 8 + .../tests/test_management_consumer.py | 4 +- src/paperless_mail/tests/conftest.py | 8 + src/paperless_mail/tests/samples/html.eml | 2 +- src/paperless_mail/tests/samples/sample.html | 2 +- src/paperless_mail/tests/test_live_mail.py | 2 + src/paperless_mail/tests/test_parsers_live.py | 148 ++++++++++++------ 9 files changed, 145 insertions(+), 53 deletions(-) create mode 100644 docker/compose/test-nginx.conf diff --git a/docker/compose/docker-compose.ci-test.yml b/docker/compose/docker-compose.ci-test.yml index f07f7fadb..d227ac038 100644 --- a/docker/compose/docker-compose.ci-test.yml +++ b/docker/compose/docker-compose.ci-test.yml @@ -34,3 +34,13 @@ services: ports: - "3143:3143" # IMAP restart: unless-stopped + nginx: + image: docker.io/nginx:1.29-alpine + hostname: nginx + container_name: nginx + ports: + - "8080:8080" + restart: unless-stopped + volumes: + - ../../docs/assets:/usr/share/nginx/html/assets:ro + - ./test-nginx.conf:/etc/nginx/conf.d/default.conf:ro diff --git a/docker/compose/test-nginx.conf b/docker/compose/test-nginx.conf new file mode 100644 index 000000000..e90f3fad3 --- /dev/null +++ b/docker/compose/test-nginx.conf @@ -0,0 +1,14 @@ +server { + listen 8080; + server_name localhost; + + root /usr/share/nginx/html; + + # Enable CORS for test requests + add_header 'Access-Control-Allow-Origin' '*' always; + add_header 'Access-Control-Allow-Methods' 'GET, HEAD, OPTIONS' always; + + location / { + try_files $uri $uri/ =404; + } +} diff --git a/pyproject.toml b/pyproject.toml index ac6c39b2b..34474feda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -300,6 +300,14 @@ norecursedirs = [ "src/locale/", ".venv/", "src-ui/" ] DJANGO_SETTINGS_MODULE = "paperless.settings" +markers = [ + "live: Integration tests requiring external services (Gotenberg, Tika, nginx, etc)", + "nginx: Tests that make HTTP requests to the local nginx service", + "gotenberg: Tests requiring Gotenberg service", + "tika: Tests requiring Tika service", + "greenmail: Tests requiring Greenmail service", +] + [tool.pytest_env] PAPERLESS_DISABLE_DBHANDLER = "true" PAPERLESS_CACHE_BACKEND = "django.core.cache.backends.locmem.LocMemCache" diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index 46aa3d374..314f29d89 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -816,7 +816,7 @@ class TestCommandWatch: f.flush() sleep(0.05) - sleep(0.5) + sleep(0.8) if thread.exception: raise thread.exception @@ -837,7 +837,7 @@ class TestCommandWatch: (consumption_dir / "._document.pdf").write_bytes(b"test") shutil.copy(sample_pdf, consumption_dir / "valid.pdf") - sleep(0.5) + sleep(0.8) if thread.exception: raise thread.exception diff --git a/src/paperless_mail/tests/conftest.py b/src/paperless_mail/tests/conftest.py index d6b74dfbf..0742edfa3 100644 --- a/src/paperless_mail/tests/conftest.py +++ b/src/paperless_mail/tests/conftest.py @@ -89,3 +89,11 @@ def greenmail_mail_account(db: None) -> Generator[MailAccount, None, None]: @pytest.fixture() def mail_account_handler() -> MailAccountHandler: return MailAccountHandler() + + +@pytest.fixture(scope="session") +def nginx_base_url() -> Generator[str, None, None]: + """ + The base URL for the nginx HTTP server we expect to be alive + """ + yield "http://localhost:8080" diff --git a/src/paperless_mail/tests/samples/html.eml b/src/paperless_mail/tests/samples/html.eml index aaac68cc4..c912acc44 100644 --- a/src/paperless_mail/tests/samples/html.eml +++ b/src/paperless_mail/tests/samples/html.eml @@ -55,7 +55,7 @@ Content-Transfer-Encoding: 7bit

Some Text

Has to be rewritten to work.. - This image should not be shown. + This image should not be shown.

and an embedded image.
diff --git a/src/paperless_mail/tests/samples/sample.html b/src/paperless_mail/tests/samples/sample.html index c1fd52d43..9d535dba6 100644 --- a/src/paperless_mail/tests/samples/sample.html +++ b/src/paperless_mail/tests/samples/sample.html @@ -6,7 +6,7 @@

Some Text

Has to be rewritten to work.. - This image should not be shown. + This image should not be shown.

and an embedded image.
diff --git a/src/paperless_mail/tests/test_live_mail.py b/src/paperless_mail/tests/test_live_mail.py index c7dcffadd..cfd7f88d0 100644 --- a/src/paperless_mail/tests/test_live_mail.py +++ b/src/paperless_mail/tests/test_live_mail.py @@ -6,6 +6,8 @@ from paperless_mail.models import MailAccount from paperless_mail.models import MailRule +@pytest.mark.live +@pytest.mark.greenmail @pytest.mark.django_db class TestMailGreenmail: """ diff --git a/src/paperless_mail/tests/test_parsers_live.py b/src/paperless_mail/tests/test_parsers_live.py index fd052cc26..8a9487c16 100644 --- a/src/paperless_mail/tests/test_parsers_live.py +++ b/src/paperless_mail/tests/test_parsers_live.py @@ -17,7 +17,7 @@ from paperless_mail.parsers import MailDocumentParser def extract_text(pdf_path: Path) -> str: """ Using pdftotext from poppler, extracts the text of a PDF into a file, - then reads the file contents and returns it + then reads the file contents and returns it. """ with tempfile.NamedTemporaryFile( mode="w+", @@ -38,71 +38,107 @@ def extract_text(pdf_path: Path) -> str: class MailAttachmentMock: - def __init__(self, payload, content_id): + def __init__(self, payload: bytes, content_id: str) -> None: self.payload = payload self.content_id = content_id self.content_type = "image/png" +@pytest.mark.live +@pytest.mark.nginx @pytest.mark.skipif( "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) -class TestUrlCanary: +class TestNginxService: """ - Verify certain URLs are still available so testing is valid still + Verify the local nginx server is responding correctly. + These tests validate that the test infrastructure is working properly + before running the actual parser tests that depend on HTTP resources. """ - def test_online_image_exception_on_not_available(self): + def test_non_existent_resource_returns_404( + self, + nginx_base_url: str, + ) -> None: """ GIVEN: - - Fresh start + - Local nginx server is running WHEN: - - nonexistent image is requested + - A non-existent resource is requested THEN: - - An exception shall be thrown - """ - """ - A public image is used in the html sample file. We have no control - whether this image stays online forever, so here we check if we can detect if is not - available anymore. + - An HTTP 404 status code shall be returned """ resp = httpx.get( - "https://docs.paperless-ngx.com/assets/non-existent.png", + f"{nginx_base_url}/assets/non-existent.png", + timeout=5.0, ) with pytest.raises(httpx.HTTPStatusError) as exec_info: resp.raise_for_status() assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND - def test_is_online_image_still_available(self): + def test_valid_resource_is_available( + self, + nginx_base_url: str, + ) -> None: """ GIVEN: - - Fresh start + - Local nginx server is running WHEN: - - A public image used in the html sample file is requested + - A valid test fixture resource is requested THEN: - - No exception shall be thrown + - The resource shall be returned with HTTP 200 status code + - The response shall contain the expected content type """ - """ - A public image is used in the html sample file. We have no control - whether this image stays online forever, so here we check if it is still there - """ - - # Now check the URL used in samples/sample.html resp = httpx.get( - "https://docs.paperless-ngx.com/assets/logo_full_white.svg", + f"{nginx_base_url}/assets/logo_full_white.svg", + timeout=5.0, ) resp.raise_for_status() + assert resp.status_code == httpx.codes.OK + assert "svg" in resp.headers.get("content-type", "").lower() + def test_server_connectivity( + self, + nginx_base_url: str, + ) -> None: + """ + GIVEN: + - Local test fixtures server should be running + WHEN: + - A request is made to the server root + THEN: + - The server shall respond without connection errors + """ + try: + resp = httpx.get( + nginx_base_url, + timeout=5.0, + follow_redirects=True, + ) + # We don't care about the status code, just that we can connect + assert resp.status_code in {200, 404, 403} + except httpx.ConnectError as e: + pytest.fail( + f"Cannot connect to nginx server at {nginx_base_url}. " + f"Ensure the nginx container is running via docker-compose.ci-test.yml. " + f"Error: {e}", + ) + + +@pytest.mark.live +@pytest.mark.gotenberg +@pytest.mark.tika +@pytest.mark.nginx @pytest.mark.skipif( "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) class TestParserLive: @staticmethod - def imagehash(file, hash_size=18): + def imagehash(file: Path, hash_size: int = 18) -> str: return f"{average_hash(Image.open(file), hash_size)}" def test_get_thumbnail( @@ -112,14 +148,15 @@ class TestParserLive: simple_txt_email_file: Path, simple_txt_email_pdf_file: Path, simple_txt_email_thumbnail_file: Path, - ): + ) -> None: """ GIVEN: - - Fresh start + - A simple text email file + - Mocked PDF generation returning a known PDF WHEN: - - The Thumbnail is requested + - The thumbnail is requested THEN: - - The returned thumbnail image file is as expected + - The returned thumbnail image file shall match the expected hash """ mock_generate_pdf = mocker.patch( "paperless_mail.parsers.MailDocumentParser.generate_pdf", @@ -134,22 +171,28 @@ class TestParserLive: assert self.imagehash(thumb) == self.imagehash( simple_txt_email_thumbnail_file, ), ( - f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}" + f"Created thumbnail {thumb} differs from expected file " + f"{simple_txt_email_thumbnail_file}" ) - def test_tika_parse_successful(self, mail_parser: MailDocumentParser): + def test_tika_parse_successful(self, mail_parser: MailDocumentParser) -> None: """ GIVEN: - - Fresh start + - HTML content to parse + - Tika server is running WHEN: - - tika parsing is called + - Tika parsing is called THEN: - - a web request to tika shall be done and the reply es returned + - A web request to Tika shall be made + - The parsed text content shall be returned """ - html = '

Some Text

' + html = ( + '' + "

Some Text

" + ) expected_text = "Some Text" - # Check successful parsing parsed = mail_parser.tika_parse(html) assert expected_text == parsed.strip() @@ -160,14 +203,17 @@ class TestParserLive: html_email_file: Path, merged_pdf_first: Path, merged_pdf_second: Path, - ): + ) -> None: """ GIVEN: - - Intermediary pdfs to be merged + - Intermediary PDFs to be merged + - An HTML email file WHEN: - - pdf generation is requested with html file requiring merging of pdfs + - PDF generation is requested with HTML file requiring merging THEN: - - gotenberg is called to merge files and the resulting file is returned + - Gotenberg shall be called to merge files + - The resulting merged PDF shall be returned + - The merged PDF shall contain text from both source PDFs """ mock_generate_pdf_from_html = mocker.patch( "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html", @@ -200,16 +246,17 @@ class TestParserLive: html_email_file: Path, html_email_pdf_file: Path, html_email_thumbnail_file: Path, - ): + ) -> None: """ GIVEN: - - Fresh start + - An HTML email file WHEN: - - pdf generation from simple eml file is requested + - PDF generation from the email file is requested THEN: - - Gotenberg is called and the resulting file is returned and look as expected. + - Gotenberg shall be called to generate the PDF + - The archive PDF shall contain the expected content + - The generated thumbnail shall match the expected image hash """ - util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"]) # Check the archive PDF @@ -217,7 +264,7 @@ class TestParserLive: archive_text = extract_text(archive_path) expected_archive_text = extract_text(html_email_pdf_file) - # Archive includes the HTML content, so use in + # Archive includes the HTML content assert expected_archive_text in archive_text # Check the thumbnail @@ -227,9 +274,12 @@ class TestParserLive: ) generated_thumbnail_hash = self.imagehash(generated_thumbnail) - # The created pdf is not reproducible. But the converted image should always look the same. + # The created PDF is not reproducible, but the converted image + # should always look the same expected_hash = self.imagehash(html_email_thumbnail_file) assert generated_thumbnail_hash == expected_hash, ( - f"PDF looks different. Check if {generated_thumbnail} looks weird." + f"PDF thumbnail differs from expected. " + f"Generated: {generated_thumbnail}, " + f"Hash: {generated_thumbnail_hash} vs {expected_hash}" )