From 3cf73a77ac504d1be836f128d5000553fe3c3a7d Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Mon, 8 Jul 2024 07:46:20 -0700 Subject: [PATCH] Chore: Initial conversion to pytest fixtures (#7110) --- Pipfile | 3 +- Pipfile.lock | 11 +- src/documents/parsers.py | 6 +- src/documents/tests/conftest.py | 9 + src/documents/tests/test_date_parsing.py | 503 ++++++++------ src/paperless_mail/parsers.py | 7 +- src/paperless_mail/tests/conftest.py | 89 +++ src/paperless_mail/tests/test_live_mail.py | 53 +- src/paperless_mail/tests/test_parsers.py | 611 +++++++++--------- src/paperless_mail/tests/test_parsers_live.py | 116 ++-- src/paperless_text/parsers.py | 6 +- src/paperless_text/tests/conftest.py | 30 + src/paperless_text/tests/test_parser.py | 49 +- src/paperless_tika/tests/conftest.py | 40 ++ src/paperless_tika/tests/test_live_tika.py | 107 ++- src/paperless_tika/tests/test_tika_parser.py | 153 ++--- src/paperless_tika/tests/utils.py | 11 - 17 files changed, 1051 insertions(+), 753 deletions(-) create mode 100644 src/documents/tests/conftest.py create mode 100644 src/paperless_mail/tests/conftest.py create mode 100644 src/paperless_text/tests/conftest.py create mode 100644 src/paperless_tika/tests/conftest.py delete mode 100644 src/paperless_tika/tests/utils.py diff --git a/Pipfile b/Pipfile index 69c3084ec..77bb99bcf 100644 --- a/Pipfile +++ b/Pipfile @@ -55,7 +55,7 @@ tqdm = "*" uvicorn = {extras = ["standard"], version = "==0.25.0"} watchdog = "~=4.0" whitenoise = "~=6.7" -whoosh="~=2.7" +whoosh = "~=2.7" zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} [dev-packages] @@ -71,6 +71,7 @@ pytest-httpx = "*" pytest-env = "*" pytest-sugar = "*" pytest-xdist = "*" +pytest-mock = "*" pytest-rerunfailures = "*" imagehash = "*" daphne = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 6c8a8c724..cda0f7681 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": 
"37d8a84e16b6f6785d0daa79b249beab7fbef0c177a13eccfce79816bf61ccd0" + "sha256": "272a69e9011a60f2d326b77d99d261425b66ebcc8ae929372213700ae47de0f5" }, "pipfile-spec": 6, "requires": {}, @@ -3359,6 +3359,15 @@ "markers": "python_version >= '3.9'", "version": "==0.30.0" }, + "pytest-mock": { + "hashes": [ + "sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f", + "sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==3.14.0" + }, "pytest-rerunfailures": { "hashes": [ "sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32", diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 09b1442c0..1297162e2 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -225,11 +225,11 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) - return default_thumbnail_path -def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: +def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path: """ The thumbnail of a PDF is just a 500px wide image of the first page. 
""" - out_path = os.path.join(temp_dir, "convert.webp") + out_path = temp_dir / "convert.webp" # Run convert to get a decent thumbnail try: @@ -242,7 +242,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: auto_orient=True, use_cropbox=True, input_file=f"{in_path}[0]", - output_file=out_path, + output_file=str(out_path), logging_group=logging_group, ) except ParseError as e: diff --git a/src/documents/tests/conftest.py b/src/documents/tests/conftest.py new file mode 100644 index 000000000..aa86f6e63 --- /dev/null +++ b/src/documents/tests/conftest.py @@ -0,0 +1,9 @@ +import zoneinfo + +import pytest +from pytest_django.fixtures import SettingsWrapper + + +@pytest.fixture() +def settings_timezone(settings: SettingsWrapper) -> zoneinfo.ZoneInfo: + return zoneinfo.ZoneInfo(settings.TIME_ZONE) diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index 253095275..f0afae543 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -1,42 +1,34 @@ import datetime +from zoneinfo import ZoneInfo -from dateutil import tz -from django.conf import settings -from django.test import TestCase -from django.test import override_settings +from pytest_django.fixtures import SettingsWrapper from documents.parsers import parse_date from documents.parsers import parse_date_generator -class TestDate(TestCase): +class TestDate: def test_date_format_1(self): text = "lorem ipsum 130218 lorem ipsum" - self.assertEqual(parse_date("", text), None) + assert parse_date("", text) is None def test_date_format_2(self): text = "lorem ipsum 2018 lorem ipsum" - self.assertEqual(parse_date("", text), None) + assert parse_date("", text) is None def test_date_format_3(self): text = "lorem ipsum 20180213 lorem ipsum" - self.assertEqual(parse_date("", text), None) + assert parse_date("", text) is None - def test_date_format_4(self): + def test_date_format_4(self, settings_timezone: 
ZoneInfo): text = "lorem ipsum 13.02.2018 lorem ipsum" date = parse_date("", text) - self.assertEqual( - date, - datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) + assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone) - def test_date_format_5(self): + def test_date_format_5(self, settings_timezone: ZoneInfo): text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum" date = parse_date("", text) - self.assertEqual( - date, - datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) + assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone) def test_date_format_6(self): text = ( @@ -50,17 +42,14 @@ class TestDate(TestCase): "BIC\n" "lorem ipsum" ) - self.assertEqual(parse_date("", text), None) + assert parse_date("", text) is None - def test_date_format_7(self): + def test_date_format_7(self, settings_timezone: ZoneInfo): text = "lorem ipsum\nMärz 2019\nlorem ipsum" date = parse_date("", text) - self.assertEqual( - date, - datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) + assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone) - def test_date_format_8(self): + def test_date_format_8(self, settings_timezone: ZoneInfo): text = ( "lorem ipsum\n" "Wohnort\n" @@ -73,209 +62,331 @@ class TestDate(TestCase): "lorem ipsum\n" "März 2020" ) - self.assertEqual( - parse_date("", text), - datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2020, + 3, + 1, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_9(self): + def test_date_format_9(self, settings_timezone: ZoneInfo): text = "lorem ipsum\n27. 
Nullmonth 2020\nMärz 2020\nlorem ipsum" - self.assertEqual( - parse_date("", text), - datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2020, + 3, + 1, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_10(self): + def test_date_format_10(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 22, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_11(self): + def test_date_format_11(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 22, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_12(self): + def test_date_format_12(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 22, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_13(self): + def test_date_format_13(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 22, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_14(self): + def test_date_format_14(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 22.MAR 2022 
Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 22, + 0, + 0, + tzinfo=settings_timezone, ) def test_date_format_15(self): text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304" - self.assertIsNone(parse_date("", text), None) + assert parse_date("", text) is None def test_date_format_16(self): text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304" - self.assertIsNone(parse_date("", text), None) + assert parse_date("", text) is None def test_date_format_17(self): text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304" - self.assertIsNone(parse_date("", text), None) + assert parse_date("", text) is None def test_date_format_18(self): text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304" - self.assertIsNone(parse_date("", text), None) + assert parse_date("", text) is None - def test_date_format_19(self): + def test_date_format_19(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 21, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_20(self): + def test_date_format_20(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 22, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_21(self): + def test_date_format_21(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", 
text), - datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 2, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_22(self): + def test_date_format_22(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 23, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_23(self): + def test_date_format_23(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 24, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_24(self): + def test_date_format_24(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 21, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_25(self): + def test_date_format_25(self, settings_timezone: ZoneInfo): text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2022, + 3, + 25, + 0, + 0, + tzinfo=settings_timezone, ) - def test_date_format_26(self): + def test_date_format_26(self, settings_timezone: ZoneInfo): text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. 
P0 Box 182051" - self.assertEqual( - parse_date("", text), - datetime.datetime(2019, 9, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2019, + 9, + 25, + 0, + 0, + tzinfo=settings_timezone, ) def test_crazy_date_past(self): - self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) + assert parse_date("", "01-07-0590 00:00:00") is None def test_crazy_date_future(self): - self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) + assert parse_date("", "01-07-2350 00:00:00") is None def test_crazy_date_with_spaces(self): - self.assertIsNone(parse_date("", "20 408000l 2475")) + assert parse_date("", "20 408000l 2475") is None - def test_utf_month_names(self): - self.assertEqual( - parse_date("", "13 décembre 2023"), - datetime.datetime(2023, 12, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + def test_utf_month_names(self, settings_timezone: ZoneInfo): + assert parse_date("", "13 décembre 2023") == datetime.datetime( + 2023, + 12, + 13, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "13 août 2022"), - datetime.datetime(2022, 8, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "13 août 2022") == datetime.datetime( + 2022, + 8, + 13, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "11 März 2020"), - datetime.datetime(2020, 3, 11, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "11 März 2020") == datetime.datetime( + 2020, + 3, + 11, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "17. ožujka 2018."), - datetime.datetime(2018, 3, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "17. ožujka 2018.") == datetime.datetime( + 2018, + 3, + 17, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "1. veljače 2016."), - datetime.datetime(2016, 2, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "1. 
veljače 2016.") == datetime.datetime( + 2016, + 2, + 1, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "15. února 1985"), - datetime.datetime(1985, 2, 15, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "15. února 1985") == datetime.datetime( + 1985, + 2, + 15, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "30. září 2011"), - datetime.datetime(2011, 9, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "30. září 2011") == datetime.datetime( + 2011, + 9, + 30, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "28. května 1990"), - datetime.datetime(1990, 5, 28, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "28. května 1990") == datetime.datetime( + 1990, + 5, + 28, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "1. grudzień 1997"), - datetime.datetime(1997, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "1. grudzień 1997") == datetime.datetime( + 1997, + 12, + 1, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "17 Şubat 2024"), - datetime.datetime(2024, 2, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "17 Şubat 2024") == datetime.datetime( + 2024, + 2, + 17, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "30 Ağustos 2012"), - datetime.datetime(2012, 8, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "30 Ağustos 2012") == datetime.datetime( + 2012, + 8, + 30, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "17 Eylül 2000"), - datetime.datetime(2000, 9, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "17 Eylül 2000") == datetime.datetime( + 2000, + 9, + 17, + 0, + 0, + tzinfo=settings_timezone, ) - self.assertEqual( - parse_date("", "5. 
október 1992"), - datetime.datetime(1992, 10, 5, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", "5. október 1992") == datetime.datetime( + 1992, + 10, + 5, + 0, + 0, + tzinfo=settings_timezone, ) - def test_multiple_dates(self): + def test_multiple_dates(self, settings_timezone: ZoneInfo): text = """This text has multiple dates. For example 02.02.2018, 22 July 2022 and December 2021. But not 24-12-9999 because it's in the future...""" dates = list(parse_date_generator("", text)) - self.assertEqual(len(dates), 3) - self.assertEqual( - dates[0], - datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) - self.assertEqual( - dates[1], - datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) - self.assertEqual( - dates[2], - datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) - @override_settings(FILENAME_DATE_ORDER="YMD") - def test_filename_date_parse_valid_ymd(self, *args): + assert dates == [ + datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone), + datetime.datetime( + 2022, + 7, + 22, + 0, + 0, + tzinfo=settings_timezone, + ), + datetime.datetime( + 2021, + 12, + 1, + 0, + 0, + tzinfo=settings_timezone, + ), + ] + + def test_filename_date_parse_valid_ymd( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): """ GIVEN: - Date parsing from the filename is enabled @@ -285,13 +396,18 @@ class TestDate(TestCase): THEN: - Should parse the date from the filename """ - self.assertEqual( - parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"), - datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) + settings.FILENAME_DATE_ORDER = "YMD" - @override_settings(FILENAME_DATE_ORDER="DMY") - def test_filename_date_parse_valid_dmy(self, *args): + assert parse_date( + "/tmp/Scan-2022-04-01.pdf", + "No date in here", + ) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone) + + def test_filename_date_parse_valid_dmy( + 
self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): """ GIVEN: - Date parsing from the filename is enabled @@ -301,13 +417,13 @@ class TestDate(TestCase): THEN: - Should parse the date from the filename """ - self.assertEqual( - parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"), - datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) + settings.FILENAME_DATE_ORDER = "DMY" + assert parse_date( + "/tmp/Scan-10.01.2021.pdf", + "No date in here", + ) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone) - @override_settings(FILENAME_DATE_ORDER="YMD") - def test_filename_date_parse_invalid(self, *args): + def test_filename_date_parse_invalid(self, settings: SettingsWrapper): """ GIVEN: - Date parsing from the filename is enabled @@ -317,15 +433,14 @@ class TestDate(TestCase): THEN: - No date is parsed """ - self.assertIsNone( - parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"), - ) + settings.FILENAME_DATE_ORDER = "YMD" + assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None - @override_settings( - FILENAME_DATE_ORDER="YMD", - IGNORE_DATES=(datetime.date(2022, 4, 1),), - ) - def test_filename_date_ignored_use_content(self, *args): + def test_filename_date_ignored_use_content( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): """ GIVEN: - Date parsing from the filename is enabled @@ -338,15 +453,18 @@ class TestDate(TestCase): THEN: - Should parse the date from the content not filename """ - self.assertEqual( - parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"), - datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) + settings.FILENAME_DATE_ORDER = "YMD" + settings.IGNORE_DATES = (datetime.date(2022, 4, 1),) + assert parse_date( + "/tmp/Scan-2022-04-01.pdf", + "The matching date is 24.03.2022", + ) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone) - @override_settings( - 
IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), - ) - def test_ignored_dates_default_order(self, *args): + def test_ignored_dates_default_order( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): """ GIVEN: - Ignore dates have been set @@ -356,17 +474,22 @@ class TestDate(TestCase): THEN: - Should parse the date non-ignored date from content """ + settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)) text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum" - self.assertEqual( - parse_date("", text), - datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2018, + 2, + 13, + 0, + 0, + tzinfo=settings_timezone, ) - @override_settings( - IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), - DATE_ORDER="YMD", - ) - def test_ignored_dates_order_ymd(self, *args): + def test_ignored_dates_order_ymd( + self, + settings: SettingsWrapper, + settings_timezone: ZoneInfo, + ): """ GIVEN: - Ignore dates have been set @@ -377,9 +500,17 @@ class TestDate(TestCase): THEN: - Should parse the date non-ignored date from content """ + + settings.DATE_ORDER = "YMD" + settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)) + text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum" - self.assertEqual( - parse_date("", text), - datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + assert parse_date("", text) == datetime.datetime( + 2018, + 2, + 13, + 0, + 0, + tzinfo=settings_timezone, ) diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 9047b5f90..4e83844e2 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -52,7 +52,12 @@ class MailDocumentParser(DocumentParser): return PdfAFormat.A3b return None - def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): + def 
get_thumbnail( + self, + document_path: Path, + mime_type: str, + file_name=None, + ) -> Path: if not self.archive_path: self.archive_path = self.generate_pdf( self.parse_file_to_message(document_path), diff --git a/src/paperless_mail/tests/conftest.py b/src/paperless_mail/tests/conftest.py new file mode 100644 index 000000000..01a98d57d --- /dev/null +++ b/src/paperless_mail/tests/conftest.py @@ -0,0 +1,89 @@ +import os +from collections.abc import Generator +from pathlib import Path + +import pytest + +from paperless_mail.mail import MailAccountHandler +from paperless_mail.models import MailAccount +from paperless_mail.parsers import MailDocumentParser + + +@pytest.fixture(scope="session") +def sample_dir() -> Path: + return (Path(__file__).parent / Path("samples")).resolve() + + +@pytest.fixture(scope="session") +def broken_email_file(sample_dir: Path) -> Path: + return sample_dir / "broken.eml" + + +@pytest.fixture(scope="session") +def simple_txt_email_file(sample_dir: Path) -> Path: + return sample_dir / "simple_text.eml" + + +@pytest.fixture(scope="session") +def simple_txt_email_pdf_file(sample_dir: Path) -> Path: + return sample_dir / "simple_text.eml.pdf" + + +@pytest.fixture(scope="session") +def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path: + return sample_dir / "simple_text.eml.pdf.webp" + + +@pytest.fixture(scope="session") +def html_email_file(sample_dir: Path) -> Path: + return sample_dir / "html.eml" + + +@pytest.fixture(scope="session") +def html_email_pdf_file(sample_dir: Path) -> Path: + return sample_dir / "html.eml.pdf" + + +@pytest.fixture(scope="session") +def html_email_thumbnail_file(sample_dir: Path) -> Path: + return sample_dir / "html.eml.pdf.webp" + + +@pytest.fixture(scope="session") +def html_email_html_file(sample_dir: Path) -> Path: + return sample_dir / "html.eml.html" + + +@pytest.fixture(scope="session") +def merged_pdf_first(sample_dir: Path) -> Path: + return sample_dir / "first.pdf" + + 
+@pytest.fixture(scope="session") +def merged_pdf_second(sample_dir: Path) -> Path: + return sample_dir / "second.pdf" + + +@pytest.fixture() +def mail_parser() -> MailDocumentParser: + return MailDocumentParser(logging_group=None) + + +@pytest.fixture() +def live_mail_account() -> Generator[MailAccount, None, None]: + account = MailAccount.objects.create( + name="test", + imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"], + username=os.environ["PAPERLESS_MAIL_TEST_USER"], + password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"], + imap_port=993, + ) + try: + yield account + finally: + account.delete() + + +@pytest.fixture() +def mail_account_handler() -> MailAccountHandler: + return MailAccountHandler() diff --git a/src/paperless_mail/tests/test_live_mail.py b/src/paperless_mail/tests/test_live_mail.py index 6de2a6770..ecf9f73b6 100644 --- a/src/paperless_mail/tests/test_live_mail.py +++ b/src/paperless_mail/tests/test_live_mail.py @@ -1,7 +1,7 @@ import os +import warnings import pytest -from django.test import TestCase from paperless_mail.mail import MailAccountHandler from paperless_mail.mail import MailError @@ -16,53 +16,46 @@ from paperless_mail.models import MailRule or not len(os.environ["PAPERLESS_MAIL_TEST_HOST"]), reason="Live server testing not enabled", ) -class TestMailLiveServer(TestCase): - def setUp(self) -> None: - self.mail_account_handler = MailAccountHandler() - self.account = MailAccount.objects.create( - name="test", - imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"], - username=os.environ["PAPERLESS_MAIL_TEST_USER"], - password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"], - imap_port=993, - ) - - return super().setUp() - - def tearDown(self) -> None: - self.account.delete() - return super().tearDown() - - def test_process_non_gmail_server_flag(self): +@pytest.mark.django_db() +class TestMailLiveServer: + def test_process_non_gmail_server_flag( + self, + mail_account_handler: MailAccountHandler, + live_mail_account: MailAccount, + ): try: rule1 = 
MailRule.objects.create( name="testrule", - account=self.account, + account=live_mail_account, action=MailRule.MailAction.FLAG, ) - self.mail_account_handler.handle_mail_account(self.account) + mail_account_handler.handle_mail_account(live_mail_account) rule1.delete() except MailError as e: - self.fail(f"Failure: {e}") - except Exception: - pass + pytest.fail(f"Failure: {e}") + except Exception as e: + warnings.warn(f"Unhandled exception: {e}") - def test_process_non_gmail_server_tag(self): + def test_process_non_gmail_server_tag( + self, + mail_account_handler: MailAccountHandler, + live_mail_account: MailAccount, + ): try: rule2 = MailRule.objects.create( name="testrule", - account=self.account, + account=live_mail_account, action=MailRule.MailAction.TAG, ) - self.mail_account_handler.handle_mail_account(self.account) + mail_account_handler.handle_mail_account(live_mail_account) rule2.delete() except MailError as e: - self.fail(f"Failure: {e}") - except Exception: - pass + pytest.fail(f"Failure: {e}") + except Exception as e: + warnings.warn(f"Unhandled exception: {e}") diff --git a/src/paperless_mail/tests/test_parsers.py b/src/paperless_mail/tests/test_parsers.py index 5bcff19f6..a0baa4821 100644 --- a/src/paperless_mail/tests/test_parsers.py +++ b/src/paperless_mail/tests/test_parsers.py @@ -1,39 +1,29 @@ import datetime +import logging from pathlib import Path -from unittest import mock import httpx -from django.test import TestCase +import pytest +from django.test.html import parse_html +from pytest_django.fixtures import SettingsWrapper +from pytest_httpx import HTTPXMock +from pytest_mock import MockerFixture from documents.parsers import ParseError -from documents.tests.utils import FileSystemAssertsMixin from paperless_mail.parsers import MailDocumentParser -from paperless_tika.tests.utils import HttpxMockMixin -class BaseMailParserTestCase(TestCase): - """ - Basic setup for the below test cases - """ - - SAMPLE_DIR = Path(__file__).parent / "samples" - 
- def setUp(self) -> None: - super().setUp() - self.parser = MailDocumentParser(logging_group=None) - - def tearDown(self) -> None: - super().tearDown() - self.parser.cleanup() - - -class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase): +class TestEmailFileParsing: """ Tests around reading a file and parsing it into a MailMessage """ - def test_parse_error_missing_file(self): + def test_parse_error_missing_file( + self, + mail_parser: MailDocumentParser, + sample_dir: Path, + ): """ GIVEN: - Fresh parser @@ -43,17 +33,18 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase): - An Exception is thrown """ # Check if exception is raised when parsing fails. - test_file = self.SAMPLE_DIR / "doesntexist.eml" + test_file = sample_dir / "doesntexist.eml" - self.assertIsNotFile(test_file) - self.assertRaises( - ParseError, - self.parser.parse, - test_file, - "messages/rfc822", - ) + assert not test_file.exists() - def test_parse_error_invalid_email(self): + with pytest.raises(ParseError): + mail_parser.parse(test_file, "messages/rfc822") + + def test_parse_error_invalid_email( + self, + mail_parser: MailDocumentParser, + broken_email_file: Path, + ): """ GIVEN: - Fresh parser @@ -63,14 +54,15 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase): - An Exception is thrown """ # Check if exception is raised when the mail is faulty. - self.assertRaises( - ParseError, - self.parser.parse, - self.SAMPLE_DIR / "broken.eml", - "messages/rfc822", - ) - def test_parse_simple_text_email_file(self): + with pytest.raises(ParseError): + mail_parser.parse(broken_email_file, "messages/rfc822") + + def test_parse_simple_text_email_file( + self, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, + ): """ GIVEN: - Fresh parser @@ -80,29 +72,31 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase): - The content of the mail should be available in the parse result. 
""" # Parse Test file and check relevant content - parsed1 = self.parser.parse_file_to_message( - self.SAMPLE_DIR / "simple_text.eml", - ) + parsed_msg = mail_parser.parse_file_to_message(simple_txt_email_file) - self.assertEqual(parsed1.date.year, 2022) - self.assertEqual(parsed1.date.month, 10) - self.assertEqual(parsed1.date.day, 12) - self.assertEqual(parsed1.date.hour, 21) - self.assertEqual(parsed1.date.minute, 40) - self.assertEqual(parsed1.date.second, 43) - self.assertEqual(parsed1.date.tzname(), "UTC+02:00") - self.assertEqual(parsed1.from_, "mail@someserver.de") - self.assertEqual(parsed1.subject, "Simple Text Mail") - self.assertEqual(parsed1.text, "This is just a simple Text Mail.\n") - self.assertEqual(parsed1.to, ("some@one.de",)) + assert parsed_msg.date.year == 2022 + assert parsed_msg.date.month == 10 + assert parsed_msg.date.day == 12 + assert parsed_msg.date.hour == 21 + assert parsed_msg.date.minute == 40 + assert parsed_msg.date.second == 43 + assert parsed_msg.date.tzname() == "UTC+02:00" + assert parsed_msg.from_ == "mail@someserver.de" + assert parsed_msg.subject == "Simple Text Mail" + assert parsed_msg.text == "This is just a simple Text Mail.\n" + assert parsed_msg.to == ("some@one.de",) -class TestEmailMetadataExtraction(BaseMailParserTestCase): +class TestEmailMetadataExtraction: """ Tests extraction of metadata from an email """ - def test_extract_metadata_fail(self): + def test_extract_metadata_fail( + self, + caplog: pytest.LogCaptureFixture, + mail_parser: MailDocumentParser, + ): """ GIVEN: - Fresh start @@ -112,14 +106,20 @@ class TestEmailMetadataExtraction(BaseMailParserTestCase): - A log warning should be generated """ # Validate if warning is logged when parsing fails - with self.assertLogs("paperless.parsing.mail", level="WARNING") as cm: - self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822")) - self.assertIn( - "WARNING:paperless.parsing.mail:Error while fetching document metadata for na", - 
cm.output[0], - ) + assert mail_parser.extract_metadata("na", "message/rfc822") == [] - def test_extract_metadata(self): + assert len(caplog.records) == 1 + record = caplog.records[0] + + assert record.levelno == logging.WARNING + assert record.name == "paperless.parsing.mail" + assert "Error while fetching document metadata for na" in record.message + + def test_extract_metadata( + self, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, + ): """ GIVEN: - Fresh start @@ -129,149 +129,110 @@ class TestEmailMetadataExtraction(BaseMailParserTestCase): - metadata is returned """ # Validate Metadata parsing returns the expected results - metadata = self.parser.extract_metadata( - self.SAMPLE_DIR / "simple_text.eml", - "message/rfc822", - ) + metadata = mail_parser.extract_metadata(simple_txt_email_file, "message/rfc822") - self.assertIn( - {"namespace": "", "prefix": "", "key": "attachments", "value": ""}, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "", - "key": "date", - "value": "2022-10-12 21:40:43 UTC+02:00", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "content-language", - "value": "en-US", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "content-type", - "value": "text/plain; charset=UTF-8; format=flowed", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "date", - "value": "Wed, 12 Oct 2022 21:40:43 +0200", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "delivered-to", - "value": "mail@someserver.de", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "from", - "value": "Some One ", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "message-id", - "value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>", - }, - metadata, - ) - self.assertIn( - { - "namespace": 
"", - "prefix": "header", - "key": "mime-version", - "value": "1.0", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "received", - "value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from )\n\tfor ; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor ; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "return-path", - "value": "", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "subject", - "value": "Simple Text Mail", - }, - metadata, - ) - self.assertIn( - {"namespace": "", "prefix": "header", "key": "to", "value": "some@one.de"}, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "user-agent", - "value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1", - }, - metadata, - ) - self.assertIn( - { - "namespace": "", - "prefix": "header", - "key": "x-last-tls-session-version", - "value": "TLSv1.3", - }, - metadata, - ) + assert { + "namespace": "", + "prefix": "", + "key": "attachments", + "value": "", + } in metadata + assert { + "namespace": "", + "prefix": "", + "key": "date", + "value": "2022-10-12 21:40:43 UTC+02:00", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "content-language", + "value": "en-US", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "content-type", + "value": "text/plain; charset=UTF-8; format=flowed", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "date", + "value": "Wed, 12 Oct 2022 21:40:43 +0200", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "delivered-to", + "value": "mail@someserver.de", + } in metadata + assert { + "namespace": "", + 
"prefix": "header", + "key": "from", + "value": "Some One ", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "message-id", + "value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "mime-version", + "value": "1.0", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "received", + "value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from )\n\tfor ; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor ; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "return-path", + "value": "", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "subject", + "value": "Simple Text Mail", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "to", + "value": "some@one.de", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "user-agent", + "value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1", + } in metadata + assert { + "namespace": "", + "prefix": "header", + "key": "x-last-tls-session-version", + "value": "TLSv1.3", + } in metadata -class TestEmailThumbnailGenerate(BaseMailParserTestCase): +class TestEmailThumbnailGenerate: """ Tests the correct generation of an thumbnail for an email """ - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") - @mock.patch("paperless_mail.parsers.make_thumbnail_from_pdf") def test_get_thumbnail( self, - mock_make_thumbnail_from_pdf: mock.MagicMock, - mock_generate_pdf: mock.MagicMock, + mocker: MockerFixture, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, ): """ GIVEN: @@ -282,29 +243,34 @@ class TestEmailThumbnailGenerate(BaseMailParserTestCase): - The 
parser should call the functions which generate the thumbnail """ mocked_return = "Passing the return value through.." + mock_make_thumbnail_from_pdf = mocker.patch( + "paperless_mail.parsers.make_thumbnail_from_pdf", + ) mock_make_thumbnail_from_pdf.return_value = mocked_return + mock_generate_pdf = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf", + ) mock_generate_pdf.return_value = "Mocked return value.." - test_file = self.SAMPLE_DIR / "simple_text.eml" - - thumb = self.parser.get_thumbnail( - test_file, - "message/rfc822", - ) + thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822") mock_generate_pdf.assert_called_once() mock_make_thumbnail_from_pdf.assert_called_once_with( "Mocked return value..", - self.parser.tempdir, + mail_parser.tempdir, None, ) - self.assertEqual(mocked_return, thumb) + assert mocked_return == thumb -class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): - def test_tika_parse_unsuccessful(self): +class TestTikaHtmlParse: + def test_tika_parse_unsuccessful( + self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + ): """ GIVEN: - Fresh start @@ -314,13 +280,13 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): - the parser should return an empty string """ # Check unsuccessful parsing - self.httpx_mock.add_response( + httpx_mock.add_response( json={"Content-Type": "text/html", "X-TIKA:Parsed-By": []}, ) - parsed = self.parser.tika_parse("None") - self.assertEqual("", parsed) + parsed = mail_parser.tika_parse("None") + assert parsed == "" - def test_tika_parse(self): + def test_tika_parse(self, httpx_mock: HTTPXMock, mail_parser: MailDocumentParser): """ GIVEN: - Fresh start @@ -332,18 +298,22 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): html = '

Some Text

' expected_text = "Some Text" - self.httpx_mock.add_response( + httpx_mock.add_response( json={ "Content-Type": "text/html", "X-TIKA:Parsed-By": [], "X-TIKA:content": expected_text, }, ) - parsed = self.parser.tika_parse(html) - self.assertEqual(expected_text, parsed.strip()) - self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url)) + parsed = mail_parser.tika_parse(html) + assert expected_text == parsed.strip() + assert "http://localhost:9998" in str(httpx_mock.get_request().url) - def test_tika_parse_exception(self): + def test_tika_parse_exception( + self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + ): """ GIVEN: - Fresh start @@ -354,11 +324,16 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): """ html = '

Some Text

' - self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR) + httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR) - self.assertRaises(ParseError, self.parser.tika_parse, html) + with pytest.raises(ParseError): + mail_parser.tika_parse(html) - def test_tika_parse_unreachable(self): + def test_tika_parse_unreachable( + self, + settings: SettingsWrapper, + mail_parser: MailDocumentParser, + ): """ GIVEN: - Fresh start @@ -370,30 +345,18 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): html = '

Some Text

' # Check if exception is raised when Tika cannot be reached. - self.parser.tika_server = "" - self.assertRaises(ParseError, self.parser.tika_parse, html) + with pytest.raises(ParseError): + settings.TIKA_ENDPOINT = "http://does-not-exist:9998" + mail_parser.tika_parse(html) -class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase): - def test_parse_no_file(self): - """ - GIVEN: - - Fresh start - WHEN: - - parsing is attempted with nonexistent file - THEN: - - Exception is thrown - """ - # Check if exception is raised when parsing fails. - self.assertRaises( - ParseError, - self.parser.parse, - self.SAMPLE_DIR / "na.eml", - "message/rfc822", - ) - - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") - def test_parse_eml_simple(self, mock_generate_pdf: mock.MagicMock): +class TestParser: + def test_parse_eml_simple( + self, + mocker: MockerFixture, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, + ): """ GIVEN: - Fresh start @@ -403,11 +366,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) - parsed information is available """ # Validate parsing returns the expected results - - self.parser.parse( - self.SAMPLE_DIR / "simple_text.eml", - "message/rfc822", + mock_generate_pdf = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf", ) + + mail_parser.parse(simple_txt_email_file, "message/rfc822") text_expected = ( "Subject: Simple Text Mail\n\n" "From: Some One \n\n" @@ -416,8 +379,8 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) "BCC: fdf@fvf.de\n\n" "\n\nThis is just a simple Text Mail." 
) - self.assertEqual(text_expected, self.parser.text) - self.assertEqual( + assert text_expected == mail_parser.text + assert ( datetime.datetime( 2022, 10, @@ -426,15 +389,20 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) 40, 43, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)), - ), - self.parser.date, + ) + == mail_parser.date ) # Just check if tried to generate archive, the unittest for generate_pdf() goes deeper. mock_generate_pdf.assert_called() - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") - def test_parse_eml_html(self, mock_generate_pdf: mock.MagicMock): + def test_parse_eml_html( + self, + mocker: MockerFixture, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + html_email_file: Path, + ): """ GIVEN: - Fresh start @@ -443,6 +411,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) THEN: - Tika is called, parsed information from non html parts is available """ + + mock_generate_pdf = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf", + ) + # Validate parsing returns the expected results text_expected = ( "Subject: HTML Message\n\n" @@ -453,7 +426,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) "Some Text and an embedded image." 
) - self.httpx_mock.add_response( + httpx_mock.add_response( json={ "Content-Type": "text/html", "X-TIKA:Parsed-By": [], @@ -461,11 +434,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) }, ) - self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") + mail_parser.parse(html_email_file, "message/rfc822") mock_generate_pdf.assert_called_once() - self.assertEqual(text_expected, self.parser.text) - self.assertEqual( + assert text_expected == mail_parser.text + assert ( datetime.datetime( 2022, 10, @@ -474,11 +447,16 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) 23, 19, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)), - ), - self.parser.date, + ) + == mail_parser.date ) - def test_generate_pdf_parse_error(self): + def test_generate_pdf_parse_error( + self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, + ): """ GIVEN: - Fresh start @@ -487,16 +465,18 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) THEN: - a ParseError Exception is thrown """ - self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR) + httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR) - self.assertRaises( - ParseError, - self.parser.parse, - self.SAMPLE_DIR / "simple_text.eml", - "message/rfc822", - ) + with pytest.raises(ParseError): + mail_parser.parse(simple_txt_email_file, "message/rfc822") - def test_generate_pdf_simple_email(self): + def test_generate_pdf_simple_email( + self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, + simple_txt_email_pdf_file: Path, + ): """ GIVEN: - Simple text email with no HTML content @@ -507,17 +487,23 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) - Archive file is generated """ - self.httpx_mock.add_response( + httpx_mock.add_response( 
url="http://localhost:3000/forms/chromium/convert/html", method="POST", - content=(self.SAMPLE_DIR / "simple_text.eml.pdf").read_bytes(), + content=simple_txt_email_pdf_file.read_bytes(), ) - self.parser.parse(self.SAMPLE_DIR / "simple_text.eml", "message/rfc822") + mail_parser.parse(simple_txt_email_file, "message/rfc822") - self.assertIsNotNone(self.parser.archive_path) + assert mail_parser.archive_path is not None - def test_generate_pdf_html_email(self): + def test_generate_pdf_html_email( + self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + html_email_file: Path, + html_email_pdf_file: Path, + ): """ GIVEN: - email with HTML content @@ -528,7 +514,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) - Gotenberg is used to merge the two PDFs - Archive file is generated """ - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:9998/tika/text", method="PUT", json={ @@ -537,21 +523,27 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) "X-TIKA:content": "This is some Tika HTML text", }, ) - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:3000/forms/chromium/convert/html", method="POST", - content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(), + content=html_email_pdf_file.read_bytes(), ) - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:3000/forms/pdfengines/merge", method="POST", content=b"Pretend merged PDF content", ) - self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") + mail_parser.parse(html_email_file, "message/rfc822") - self.assertIsNotNone(self.parser.archive_path) + assert mail_parser.archive_path is not None - def test_generate_pdf_html_email_html_to_pdf_failure(self): + def test_generate_pdf_html_email_html_to_pdf_failure( + self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + html_email_file: Path, + html_email_pdf_file: Path, + ): """ GIVEN: - email 
with HTML content @@ -561,7 +553,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) THEN: - ParseError is raised """ - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:9998/tika/text", method="PUT", json={ @@ -570,20 +562,26 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) "X-TIKA:content": "This is some Tika HTML text", }, ) - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:3000/forms/chromium/convert/html", method="POST", - content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(), + content=html_email_pdf_file.read_bytes(), ) - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:3000/forms/chromium/convert/html", method="POST", status_code=httpx.codes.INTERNAL_SERVER_ERROR, ) - with self.assertRaises(ParseError): - self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") + with pytest.raises(ParseError): + mail_parser.parse(html_email_file, "message/rfc822") - def test_generate_pdf_html_email_merge_failure(self): + def test_generate_pdf_html_email_merge_failure( + self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + html_email_file: Path, + html_email_pdf_file: Path, + ): """ GIVEN: - email with HTML content @@ -593,7 +591,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) THEN: - ParseError is raised """ - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:9998/tika/text", method="PUT", json={ @@ -602,20 +600,25 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) "X-TIKA:content": "This is some Tika HTML text", }, ) - self.httpx_mock.add_response( + httpx_mock.add_response( url="http://localhost:3000/forms/chromium/convert/html", method="POST", - content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(), + content=html_email_pdf_file.read_bytes(), ) - self.httpx_mock.add_response( + 
httpx_mock.add_response( url="http://localhost:3000/forms/pdfengines/merge", method="POST", status_code=httpx.codes.INTERNAL_SERVER_ERROR, ) - with self.assertRaises(ParseError): - self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") + with pytest.raises(ParseError): + mail_parser.parse(html_email_file, "message/rfc822") - def test_mail_to_html(self): + def test_mail_to_html( + self, + mail_parser: MailDocumentParser, + html_email_file: Path, + html_email_html_file: Path, + ): """ GIVEN: - Email message with HTML content @@ -624,14 +627,19 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) THEN: - Resulting HTML is as expected """ - mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml") - html_file = self.parser.mail_to_html(mail) - expected_html_file = self.SAMPLE_DIR / "html.eml.html" + mail = mail_parser.parse_file_to_message(html_email_file) + html_file = mail_parser.mail_to_html(mail) - self.assertHTMLEqual(expected_html_file.read_text(), html_file.read_text()) + expected_html = parse_html(html_email_html_file.read_text()) + actual_html = parse_html(html_file.read_text()) + + assert expected_html == actual_html def test_generate_pdf_from_mail( self, + httpx_mock: HTTPXMock, + mail_parser: MailDocumentParser, + html_email_file: Path, ): """ GIVEN: @@ -642,16 +650,13 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase) - Gotenberg is used to convert HTML to PDF """ - self.httpx_mock.add_response(content=b"Content") + httpx_mock.add_response(content=b"Content") - mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml") + mail = mail_parser.parse_file_to_message(html_email_file) - retval = self.parser.generate_pdf_from_mail(mail) - self.assertEqual(b"Content", retval.read_bytes()) + retval = mail_parser.generate_pdf_from_mail(mail) + assert retval.read_bytes() == b"Content" - request = self.httpx_mock.get_request() + request = httpx_mock.get_request() - 
self.assertEqual( - str(request.url), - "http://localhost:3000/forms/chromium/convert/html", - ) + assert str(request.url) == "http://localhost:3000/forms/chromium/convert/html" diff --git a/src/paperless_mail/tests/test_parsers_live.py b/src/paperless_mail/tests/test_parsers_live.py index 3260725a5..9e13ad25e 100644 --- a/src/paperless_mail/tests/test_parsers_live.py +++ b/src/paperless_mail/tests/test_parsers_live.py @@ -3,17 +3,15 @@ import shutil import subprocess import tempfile from pathlib import Path -from unittest import mock import httpx import pytest -from django.test import TestCase from imagehash import average_hash from PIL import Image +from pytest_mock import MockerFixture -from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import util_call_with_backoff -from paperless_mail.tests.test_parsers import BaseMailParserTestCase +from paperless_mail.parsers import MailDocumentParser def extract_text(pdf_path: Path) -> str: @@ -50,7 +48,7 @@ class MailAttachmentMock: "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) -class TestUrlCanary(TestCase): +class TestUrlCanary: """ Verify certain URLs are still available so testing is valid still """ @@ -69,13 +67,13 @@ class TestUrlCanary(TestCase): whether this image stays online forever, so here we check if we can detect if is not available anymore. 
""" - with self.assertRaises(httpx.HTTPStatusError) as cm: + with pytest.raises(httpx.HTTPStatusError) as exec_info: resp = httpx.get( "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png", ) resp.raise_for_status() - self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND) + assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND def test_is_online_image_still_available(self): """ @@ -100,13 +98,19 @@ class TestUrlCanary(TestCase): "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) -class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): +class TestParserLive: @staticmethod def imagehash(file, hash_size=18): return f"{average_hash(Image.open(file), hash_size)}" - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") - def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock): + def test_get_thumbnail( + self, + mocker: MockerFixture, + mail_parser: MailDocumentParser, + simple_txt_email_file: Path, + simple_txt_email_pdf_file: Path, + simple_txt_email_thumbnail_file: Path, + ): """ GIVEN: - Fresh start @@ -115,22 +119,21 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): THEN: - The returned thumbnail image file is as expected """ - mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf" - thumb = self.parser.get_thumbnail( - self.SAMPLE_DIR / "simple_text.eml", - "message/rfc822", + mock_generate_pdf = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf", ) - self.assertIsFile(thumb) + mock_generate_pdf.return_value = simple_txt_email_pdf_file - expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp" + thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822") - self.assertEqual( - self.imagehash(thumb), - self.imagehash(expected), - f"Created Thumbnail {thumb} differs from expected file {expected}", - ) + assert thumb.exists() + assert thumb.is_file() - def 
test_tika_parse_successful(self): + assert ( + self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file) + ), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}" + + def test_tika_parse_successful(self, mail_parser: MailDocumentParser): """ GIVEN: - Fresh start @@ -143,15 +146,16 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): expected_text = "Some Text" # Check successful parsing - parsed = self.parser.tika_parse(html) - self.assertEqual(expected_text, parsed.strip()) + parsed = mail_parser.tika_parse(html) + assert expected_text == parsed.strip() - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail") - @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html") def test_generate_pdf_gotenberg_merging( self, - mock_generate_pdf_from_html: mock.MagicMock, - mock_generate_pdf_from_mail: mock.MagicMock, + mocker: MockerFixture, + mail_parser: MailDocumentParser, + html_email_file: Path, + merged_pdf_first: Path, + merged_pdf_second: Path, ): """ GIVEN: @@ -161,61 +165,67 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): THEN: - gotenberg is called to merge files and the resulting file is returned """ - mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf" - mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf" - - msg = self.parser.parse_file_to_message( - self.SAMPLE_DIR / "html.eml", + mock_generate_pdf_from_html = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html", ) + mock_generate_pdf_from_mail = mocker.patch( + "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail", + ) + mock_generate_pdf_from_mail.return_value = merged_pdf_first + mock_generate_pdf_from_html.return_value = merged_pdf_second + + msg = mail_parser.parse_file_to_message(html_email_file) _, pdf_path = util_call_with_backoff( - self.parser.generate_pdf, + 
mail_parser.generate_pdf, [msg], ) - self.assertIsFile(pdf_path) + assert pdf_path.exists() + assert pdf_path.is_file() extracted = extract_text(pdf_path) expected = ( "first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c" ) - self.assertEqual(expected, extracted) + assert expected == extracted - def test_generate_pdf_from_mail(self): + def test_generate_pdf_from_mail( + self, + mail_parser: MailDocumentParser, + html_email_file: Path, + html_email_pdf_file: Path, + html_email_thumbnail_file: Path, + ): """ GIVEN: - Fresh start WHEN: - pdf generation from simple eml file is requested THEN: - - gotenberg is called and the resulting file is returned and look as expected. + - Gotenberg is called and the resulting file is returned and look as expected. """ - util_call_with_backoff( - self.parser.parse, - [self.SAMPLE_DIR / "html.eml", "message/rfc822"], - ) + util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"]) # Check the archive PDF - archive_path = self.parser.get_archive_path() + archive_path = mail_parser.get_archive_path() archive_text = extract_text(archive_path) - expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf") + expected_archive_text = extract_text(html_email_pdf_file) # Archive includes the HTML content, so use in - self.assertIn(expected_archive_text, archive_text) + assert expected_archive_text in archive_text # Check the thumbnail - generated_thumbnail = self.parser.get_thumbnail( - self.SAMPLE_DIR / "html.eml", + generated_thumbnail = mail_parser.get_thumbnail( + html_email_file, "message/rfc822", ) generated_thumbnail_hash = self.imagehash(generated_thumbnail) # The created pdf is not reproducible. But the converted image should always look the same. - expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp") + expected_hash = self.imagehash(html_email_thumbnail_file) - self.assertEqual( - generated_thumbnail_hash, - expected_hash, - f"PDF looks different. 
Check if {generated_thumbnail} looks weird.", - ) + assert ( + generated_thumbnail_hash == expected_hash + ), f"PDF looks different. Check if {generated_thumbnail} looks weird." diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index b6481adc9..58df11d7a 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -1,4 +1,4 @@ -import os +from pathlib import Path from django.conf import settings from PIL import Image @@ -15,7 +15,7 @@ class TextDocumentParser(DocumentParser): logging_name = "paperless.parsing.text" - def get_thumbnail(self, document_path, mime_type, file_name=None): + def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path: text = self.read_file_handle_unicode_errors(document_path) img = Image.new("RGB", (500, 700), color="white") @@ -27,7 +27,7 @@ class TextDocumentParser(DocumentParser): ) draw.text((5, 5), text, font=font, fill="black") - out_path = os.path.join(self.tempdir, "thumb.webp") + out_path = self.tempdir / "thumb.webp" img.save(out_path, format="WEBP") return out_path diff --git a/src/paperless_text/tests/conftest.py b/src/paperless_text/tests/conftest.py new file mode 100644 index 000000000..1d9e4fc2f --- /dev/null +++ b/src/paperless_text/tests/conftest.py @@ -0,0 +1,30 @@ +from collections.abc import Generator +from pathlib import Path + +import pytest + +from paperless_text.parsers import TextDocumentParser + + +@pytest.fixture(scope="session") +def sample_dir() -> Path: + return (Path(__file__).parent / Path("samples")).resolve() + + +@pytest.fixture() +def text_parser() -> Generator[TextDocumentParser, None, None]: + try: + parser = TextDocumentParser(logging_group=None) + yield parser + finally: + parser.cleanup() + + +@pytest.fixture(scope="session") +def sample_txt_file(sample_dir: Path) -> Path: + return sample_dir / "test.txt" + + +@pytest.fixture(scope="session") +def malformed_txt_file(sample_dir: Path) -> Path: + return sample_dir / "decode_error.txt" 
diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py index cc5ce76fe..0f8cc19ba 100644 --- a/src/paperless_text/tests/test_parser.py +++ b/src/paperless_text/tests/test_parser.py @@ -1,37 +1,26 @@ from pathlib import Path -from django.test import TestCase - -from documents.tests.utils import DirectoriesMixin -from documents.tests.utils import FileSystemAssertsMixin from paperless_text.parsers import TextDocumentParser -class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - SAMPLE_DIR = Path(__file__).resolve().parent / "samples" - - def test_thumbnail(self): - parser = TextDocumentParser(None) - +class TestTextParser: + def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path): # just make sure that it does not crash - f = parser.get_thumbnail( - self.SAMPLE_DIR / "test.txt", - "text/plain", - ) - self.assertIsFile(f) + f = text_parser.get_thumbnail(sample_txt_file, "text/plain") + assert f.exists() + assert f.is_file() - def test_parse(self): - parser = TextDocumentParser(None) + def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path): + text_parser.parse(sample_txt_file, "text/plain") - parser.parse( - self.SAMPLE_DIR / "test.txt", - "text/plain", - ) + assert text_parser.get_text() == "This is a test file.\n" + assert text_parser.get_archive_path() is None - self.assertEqual(parser.get_text(), "This is a test file.\n") - self.assertIsNone(parser.get_archive_path()) - - def test_parse_invalid_bytes(self): + def test_parse_invalid_bytes( + self, + text_parser: TextDocumentParser, + malformed_txt_file: Path, + ): """ GIVEN: - Text file which contains invalid UTF bytes @@ -41,12 +30,8 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - Parsing continues - Invalid bytes are removed """ - parser = TextDocumentParser(None) - parser.parse( - self.SAMPLE_DIR / "decode_error.txt", - "text/plain", - ) + text_parser.parse(malformed_txt_file, 
"text/plain") - self.assertEqual(parser.get_text(), "Pantothens�ure\n") - self.assertIsNone(parser.get_archive_path()) + assert text_parser.get_text() == "Pantothens�ure\n" + assert text_parser.get_archive_path() is None diff --git a/src/paperless_tika/tests/conftest.py b/src/paperless_tika/tests/conftest.py new file mode 100644 index 000000000..657192e4e --- /dev/null +++ b/src/paperless_tika/tests/conftest.py @@ -0,0 +1,40 @@ +from collections.abc import Generator +from pathlib import Path + +import pytest + +from paperless_tika.parsers import TikaDocumentParser + + +@pytest.fixture() +def tika_parser() -> Generator[TikaDocumentParser, None, None]: + try: + parser = TikaDocumentParser(logging_group=None) + yield parser + finally: + parser.cleanup() + + +@pytest.fixture(scope="session") +def sample_dir() -> Path: + return (Path(__file__).parent / Path("samples")).resolve() + + +@pytest.fixture(scope="session") +def sample_odt_file(sample_dir: Path) -> Path: + return sample_dir / "sample.odt" + + +@pytest.fixture(scope="session") +def sample_docx_file(sample_dir: Path) -> Path: + return sample_dir / "sample.docx" + + +@pytest.fixture(scope="session") +def sample_doc_file(sample_dir: Path) -> Path: + return sample_dir / "sample.doc" + + +@pytest.fixture(scope="session") +def sample_broken_odt(sample_dir: Path) -> Path: + return sample_dir / "multi-part-broken.odt" diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py index 1c6225bdc..7d8cffffd 100644 --- a/src/paperless_tika/tests/test_live_tika.py +++ b/src/paperless_tika/tests/test_live_tika.py @@ -1,9 +1,7 @@ import os from pathlib import Path -from typing import Final import pytest -from django.test import TestCase from documents.tests.utils import util_call_with_backoff from paperless_tika.parsers import TikaDocumentParser @@ -13,22 +11,19 @@ from paperless_tika.parsers import TikaDocumentParser "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika 
servers to test with", ) -class TestTikaParserAgainstServer(TestCase): +@pytest.mark.django_db() +class TestTikaParserAgainstServer: """ This test case tests the Tika parsing against a live tika server, if the environment contains the correct value indicating such a server is available. """ - SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve() - - def setUp(self) -> None: - self.parser = TikaDocumentParser(logging_group=None) - - def tearDown(self) -> None: - self.parser.cleanup() - - def test_basic_parse_odt(self): + def test_basic_parse_odt( + self, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): """ GIVEN: - An input ODT format document @@ -38,26 +33,26 @@ class TestTikaParserAgainstServer(TestCase): - Document content is correct - Document date is correct """ - test_file = self.SAMPLE_DIR / Path("sample.odt") - util_call_with_backoff( - self.parser.parse, - [test_file, "application/vnd.oasis.opendocument.text"], + tika_parser.parse, + [sample_odt_file, "application/vnd.oasis.opendocument.text"], ) - self.assertEqual( - self.parser.text, - "This is an ODT test document, created September 14, 2022", + assert ( + tika_parser.text + == "This is an ODT test document, created September 14, 2022" ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - # PDFs begin with the bytes PDF-x.y - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10] # TODO: Unsure what can set the Creation-Date field in a document, enable when possible - # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) + # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14)) - def test_basic_parse_docx(self): + def test_basic_parse_docx( + self, + tika_parser: TikaDocumentParser, + sample_docx_file: Path, + ): """ GIVEN: - An input DOCX format document @@ -67,27 +62,29 @@ class 
TestTikaParserAgainstServer(TestCase): - Document content is correct - Document date is correct """ - test_file = self.SAMPLE_DIR / Path("sample.docx") - util_call_with_backoff( - self.parser.parse, + tika_parser.parse, [ - test_file, + sample_docx_file, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ], ) - self.assertEqual( - self.parser.text, - "This is an DOCX test document, also made September 14, 2022", + assert ( + tika_parser.text + == "This is an DOCX test document, also made September 14, 2022" ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert b"PDF-" in f.read()[:10] - # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) + # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14)) - def test_basic_parse_doc(self): + def test_basic_parse_doc( + self, + tika_parser: TikaDocumentParser, + sample_doc_file: Path, + ): """ GIVEN: - An input DOC format document @@ -97,22 +94,24 @@ class TestTikaParserAgainstServer(TestCase): - Document content is correct - Document date is correct """ - test_file = self.SAMPLE_DIR / "sample.doc" - util_call_with_backoff( - self.parser.parse, - [test_file, "application/msword"], + tika_parser.parse, + [sample_doc_file, "application/msword"], ) - self.assertIn( - "his is a test document, saved in the older .doc format", - self.parser.text, + assert ( + "This is a test document, saved in the older .doc format" + in tika_parser.text ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert b"PDF-" in f.read()[:10] - def test_tika_fails_multi_part(self): + def test_tika_fails_multi_part( + self, 
+ tika_parser: TikaDocumentParser, + sample_broken_odt: Path, + ): """ GIVEN: - An input ODT format document @@ -125,13 +124,11 @@ class TestTikaParserAgainstServer(TestCase): See also: - https://issues.apache.org/jira/browse/TIKA-4110 """ - test_file = self.SAMPLE_DIR / "multi-part-broken.odt" - util_call_with_backoff( - self.parser.parse, - [test_file, "application/vnd.oasis.opendocument.text"], + tika_parser.parse, + [sample_broken_odt, "application/vnd.oasis.opendocument.text"], ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert b"PDF-" in f.read()[:10] diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index ee010eb49..6b048f252 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -1,30 +1,30 @@ import datetime -import os import zoneinfo +from http import HTTPStatus from pathlib import Path -from django.test import TestCase -from django.test import override_settings +import pytest from httpx import codes from httpx._multipart import DataField -from rest_framework import status +from pytest_django.fixtures import SettingsWrapper +from pytest_httpx import HTTPXMock from documents.parsers import ParseError from paperless_tika.parsers import TikaDocumentParser -from paperless_tika.tests.utils import HttpxMockMixin -class TestTikaParser(HttpxMockMixin, TestCase): - def setUp(self) -> None: - self.parser = TikaDocumentParser(logging_group=None) - - def tearDown(self) -> None: - self.parser.cleanup() - - @override_settings(TIME_ZONE="America/Chicago") - def test_parse(self): +@pytest.mark.django_db() +class TestTikaParser: + def test_parse( + self, + httpx_mock: HTTPXMock, + settings: SettingsWrapper, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, 
+ ): + settings.TIME_ZONE = "America/Chicago" # Pretend parse response - self.httpx_mock.add_response( + httpx_mock.add_response( json={ "Content-Type": "application/vnd.oasis.opendocument.text", "X-TIKA:Parsed-By": [], @@ -33,30 +33,29 @@ class TestTikaParser(HttpxMockMixin, TestCase): }, ) # Pretend convert to PDF response - self.httpx_mock.add_response(content=b"PDF document") + httpx_mock.add_response(content=b"PDF document") - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() + tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text") - self.parser.parse(file, "application/vnd.oasis.opendocument.text") + assert tika_parser.text == "the content" + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert f.read() == b"PDF document" - self.assertEqual(self.parser.text, "the content") - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertEqual(f.read(), b"PDF document") - - self.assertEqual( - self.parser.date, - datetime.datetime( - 2020, - 11, - 21, - tzinfo=zoneinfo.ZoneInfo("America/Chicago"), - ), + assert tika_parser.date == datetime.datetime( + 2020, + 11, + 21, + tzinfo=zoneinfo.ZoneInfo("America/Chicago"), ) - def test_metadata(self): - self.httpx_mock.add_response( + def test_metadata( + self, + httpx_mock: HTTPXMock, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): + httpx_mock.add_response( json={ "Content-Type": "application/vnd.oasis.opendocument.text", "X-TIKA:Parsed-By": [], @@ -65,18 +64,20 @@ class TestTikaParser(HttpxMockMixin, TestCase): }, ) - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() - - metadata = self.parser.extract_metadata( - file, + metadata = tika_parser.extract_metadata( + sample_odt_file, "application/vnd.oasis.opendocument.text", ) - self.assertTrue("dcterms:created" in [m["key"] for m in metadata]) - self.assertTrue("Some-key" in 
[m["key"] for m in metadata]) + assert "dcterms:created" in [m["key"] for m in metadata] + assert "Some-key" in [m["key"] for m in metadata] - def test_convert_failure(self): + def test_convert_failure( + self, + httpx_mock: HTTPXMock, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): """ GIVEN: - Document needs to be converted to PDF @@ -86,15 +87,29 @@ class TestTikaParser(HttpxMockMixin, TestCase): - Parse error is raised """ # Pretend convert to PDF response - self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() + with pytest.raises(ParseError): + tika_parser.convert_to_pdf(sample_odt_file, None) - with self.assertRaises(ParseError): - self.parser.convert_to_pdf(file, None) - - def test_request_pdf_a_format(self): + @pytest.mark.parametrize( + ("setting_value", "expected_form_value"), + [ + ("pdfa", "PDF/A-2b"), + ("pdfa-1", "PDF/A-2b"), + ("pdfa-2", "PDF/A-2b"), + ("pdfa-3", "PDF/A-3b"), + ], + ) + def test_request_pdf_a_format( + self, + setting_value: str, + expected_form_value: str, + httpx_mock: HTTPXMock, + settings: SettingsWrapper, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): """ GIVEN: - Document needs to be converted to PDF @@ -103,31 +118,21 @@ class TestTikaParser(HttpxMockMixin, TestCase): THEN: - Request to Gotenberg contains the expected PDF/A format string """ - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() + settings.OCR_OUTPUT_TYPE = setting_value + httpx_mock.add_response( + status_code=codes.OK, + content=b"PDF document", + method="POST", + ) - for setting, expected_key in [ - ("pdfa", "PDF/A-2b"), - ("pdfa-2", "PDF/A-2b"), - ("pdfa-1", "PDF/A-2b"), - ("pdfa-3", "PDF/A-3b"), - ]: - with override_settings(OCR_OUTPUT_TYPE=setting): - self.httpx_mock.add_response( - status_code=codes.OK, - content=b"PDF 
document", - method="POST", - ) + tika_parser.convert_to_pdf(sample_odt_file, None) - self.parser.convert_to_pdf(file, None) + request = httpx_mock.get_request() + found = False + for field in request.stream.fields: + if isinstance(field, DataField) and field.name == "pdfa": + assert field.value == expected_form_value + found = True + assert found, "pdfFormat was not found" - request = self.httpx_mock.get_request() - found = False - for field in request.stream.fields: - if isinstance(field, DataField) and field.name == "pdfa": - self.assertEqual(field.value, expected_key) - found = True - break - self.assertTrue(found) - - self.httpx_mock.reset(assert_all_responses_were_requested=False) + httpx_mock.reset(assert_all_responses_were_requested=False) diff --git a/src/paperless_tika/tests/utils.py b/src/paperless_tika/tests/utils.py deleted file mode 100644 index b26f79ec6..000000000 --- a/src/paperless_tika/tests/utils.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest -from pytest_httpx import HTTPXMock - - -class HttpxMockMixin: - @pytest.fixture(autouse=True) - def httpx_mock_auto(self, httpx_mock: HTTPXMock): - """ - Workaround for allowing use of a fixture with unittest style testing - """ - self.httpx_mock = httpx_mock