Chore: Initial conversion to pytest fixtures (#7110)

This commit is contained in:
Trenton H 2024-07-08 07:46:20 -07:00 committed by GitHub
parent 1b9cf5121b
commit 3cf73a77ac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 1051 additions and 753 deletions

View File

@ -71,6 +71,7 @@ pytest-httpx = "*"
pytest-env = "*" pytest-env = "*"
pytest-sugar = "*" pytest-sugar = "*"
pytest-xdist = "*" pytest-xdist = "*"
pytest-mock = "*"
pytest-rerunfailures = "*" pytest-rerunfailures = "*"
imagehash = "*" imagehash = "*"
daphne = "*" daphne = "*"

11
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "37d8a84e16b6f6785d0daa79b249beab7fbef0c177a13eccfce79816bf61ccd0" "sha256": "272a69e9011a60f2d326b77d99d261425b66ebcc8ae929372213700ae47de0f5"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": {}, "requires": {},
@ -3359,6 +3359,15 @@
"markers": "python_version >= '3.9'", "markers": "python_version >= '3.9'",
"version": "==0.30.0" "version": "==0.30.0"
}, },
"pytest-mock": {
"hashes": [
"sha256:0b72c38033392a5f4621342fe11e9219ac11ec9d375f8e2a0c164539e0d70f6f",
"sha256:2719255a1efeceadbc056d6bf3df3d1c5015530fb40cf347c0f9afac88410bd0"
],
"index": "pypi",
"markers": "python_version >= '3.8'",
"version": "==3.14.0"
},
"pytest-rerunfailures": { "pytest-rerunfailures": {
"hashes": [ "hashes": [
"sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32", "sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32",

View File

@ -225,11 +225,11 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
return default_thumbnail_path return default_thumbnail_path
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
""" """
The thumbnail of a PDF is just a 500px wide image of the first page. The thumbnail of a PDF is just a 500px wide image of the first page.
""" """
out_path = os.path.join(temp_dir, "convert.webp") out_path = temp_dir / "convert.webp"
# Run convert to get a decent thumbnail # Run convert to get a decent thumbnail
try: try:
@ -242,7 +242,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
auto_orient=True, auto_orient=True,
use_cropbox=True, use_cropbox=True,
input_file=f"{in_path}[0]", input_file=f"{in_path}[0]",
output_file=out_path, output_file=str(out_path),
logging_group=logging_group, logging_group=logging_group,
) )
except ParseError as e: except ParseError as e:

View File

@ -0,0 +1,9 @@
import zoneinfo
import pytest
from pytest_django.fixtures import SettingsWrapper
@pytest.fixture()
def settings_timezone(settings: SettingsWrapper) -> zoneinfo.ZoneInfo:
    """
    Build a ZoneInfo object from the TIME_ZONE value of the settings fixture.
    """
    # Look up the configured zone name first, then resolve it to a tzinfo
    tz_name = settings.TIME_ZONE
    return zoneinfo.ZoneInfo(tz_name)

View File

@ -1,42 +1,34 @@
import datetime import datetime
from zoneinfo import ZoneInfo
from dateutil import tz from pytest_django.fixtures import SettingsWrapper
from django.conf import settings
from django.test import TestCase
from django.test import override_settings
from documents.parsers import parse_date from documents.parsers import parse_date
from documents.parsers import parse_date_generator from documents.parsers import parse_date_generator
class TestDate(TestCase): class TestDate:
def test_date_format_1(self): def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum" text = "lorem ipsum 130218 lorem ipsum"
self.assertEqual(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_2(self): def test_date_format_2(self):
text = "lorem ipsum 2018 lorem ipsum" text = "lorem ipsum 2018 lorem ipsum"
self.assertEqual(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_3(self): def test_date_format_3(self):
text = "lorem ipsum 20180213 lorem ipsum" text = "lorem ipsum 20180213 lorem ipsum"
self.assertEqual(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_4(self): def test_date_format_4(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 13.02.2018 lorem ipsum" text = "lorem ipsum 13.02.2018 lorem ipsum"
date = parse_date("", text) date = parse_date("", text)
self.assertEqual( assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
date,
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_5(self): def test_date_format_5(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum" text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
date = parse_date("", text) date = parse_date("", text)
self.assertEqual( assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
date,
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_6(self): def test_date_format_6(self):
text = ( text = (
@ -50,17 +42,14 @@ class TestDate(TestCase):
"BIC\n" "BIC\n"
"lorem ipsum" "lorem ipsum"
) )
self.assertEqual(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_7(self): def test_date_format_7(self, settings_timezone: ZoneInfo):
text = "lorem ipsum\nMärz 2019\nlorem ipsum" text = "lorem ipsum\nMärz 2019\nlorem ipsum"
date = parse_date("", text) date = parse_date("", text)
self.assertEqual( assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
date,
datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_8(self): def test_date_format_8(self, settings_timezone: ZoneInfo):
text = ( text = (
"lorem ipsum\n" "lorem ipsum\n"
"Wohnort\n" "Wohnort\n"
@ -73,209 +62,331 @@ class TestDate(TestCase):
"lorem ipsum\n" "lorem ipsum\n"
"März 2020" "März 2020"
) )
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2020,
datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
1,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_9(self): def test_date_format_9(self, settings_timezone: ZoneInfo):
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum" text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2020,
datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
1,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_10(self): def test_date_format_10(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304" text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
22,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_11(self): def test_date_format_11(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304" text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
22,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_12(self): def test_date_format_12(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304" text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
22,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_13(self): def test_date_format_13(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304" text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
22,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_14(self): def test_date_format_14(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304" text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
22,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_15(self): def test_date_format_15(self):
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304" text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
self.assertIsNone(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_16(self): def test_date_format_16(self):
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304" text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
self.assertIsNone(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_17(self): def test_date_format_17(self):
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304" text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
self.assertIsNone(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_18(self): def test_date_format_18(self):
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304" text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
self.assertIsNone(parse_date("", text), None) assert parse_date("", text) is None
def test_date_format_19(self): def test_date_format_19(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304" text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
21,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_20(self): def test_date_format_20(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304" text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
22,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_21(self): def test_date_format_21(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304" text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
2,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_22(self): def test_date_format_22(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304" text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
23,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_23(self): def test_date_format_23(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304" text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
24,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_24(self): def test_date_format_24(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304" text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
21,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_25(self): def test_date_format_25(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304" text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2022,
datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
25,
0,
0,
tzinfo=settings_timezone,
) )
def test_date_format_26(self): def test_date_format_26(self, settings_timezone: ZoneInfo):
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051" text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2019,
datetime.datetime(2019, 9, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 9,
25,
0,
0,
tzinfo=settings_timezone,
) )
def test_crazy_date_past(self): def test_crazy_date_past(self):
self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) assert parse_date("", "01-07-0590 00:00:00") is None
def test_crazy_date_future(self): def test_crazy_date_future(self):
self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) assert parse_date("", "01-07-2350 00:00:00") is None
def test_crazy_date_with_spaces(self): def test_crazy_date_with_spaces(self):
self.assertIsNone(parse_date("", "20 408000l 2475")) assert parse_date("", "20 408000l 2475") is None
def test_utf_month_names(self): def test_utf_month_names(self, settings_timezone: ZoneInfo):
self.assertEqual( assert parse_date("", "13 décembre 2023") == datetime.datetime(
parse_date("", "13 décembre 2023"), 2023,
datetime.datetime(2023, 12, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 12,
13,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "13 août 2022") == datetime.datetime(
parse_date("", "13 août 2022"), 2022,
datetime.datetime(2022, 8, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 8,
13,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "11 März 2020") == datetime.datetime(
parse_date("", "11 März 2020"), 2020,
datetime.datetime(2020, 3, 11, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
11,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
parse_date("", "17. ožujka 2018."), 2018,
datetime.datetime(2018, 3, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 3,
17,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "1. veljače 2016.") == datetime.datetime(
parse_date("", "1. veljače 2016."), 2016,
datetime.datetime(2016, 2, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 2,
1,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "15. února 1985") == datetime.datetime(
parse_date("", "15. února 1985"), 1985,
datetime.datetime(1985, 2, 15, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 2,
15,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "30. září 2011") == datetime.datetime(
parse_date("", "30. září 2011"), 2011,
datetime.datetime(2011, 9, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 9,
30,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "28. května 1990") == datetime.datetime(
parse_date("", "28. května 1990"), 1990,
datetime.datetime(1990, 5, 28, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 5,
28,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "1. grudzień 1997") == datetime.datetime(
parse_date("", "1. grudzień 1997"), 1997,
datetime.datetime(1997, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 12,
1,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "17 Şubat 2024") == datetime.datetime(
parse_date("", "17 Şubat 2024"), 2024,
datetime.datetime(2024, 2, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 2,
17,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
parse_date("", "30 Ağustos 2012"), 2012,
datetime.datetime(2012, 8, 30, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 8,
30,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "17 Eylül 2000") == datetime.datetime(
parse_date("", "17 Eylül 2000"), 2000,
datetime.datetime(2000, 9, 17, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 9,
17,
0,
0,
tzinfo=settings_timezone,
) )
self.assertEqual( assert parse_date("", "5. október 1992") == datetime.datetime(
parse_date("", "5. október 1992"), 1992,
datetime.datetime(1992, 10, 5, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 10,
5,
0,
0,
tzinfo=settings_timezone,
) )
def test_multiple_dates(self): def test_multiple_dates(self, settings_timezone: ZoneInfo):
text = """This text has multiple dates. text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and December 2021. For example 02.02.2018, 22 July 2022 and December 2021.
But not 24-12-9999 because it's in the future...""" But not 24-12-9999 because it's in the future..."""
dates = list(parse_date_generator("", text)) dates = list(parse_date_generator("", text))
self.assertEqual(len(dates), 3)
self.assertEqual(
dates[0],
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
self.assertEqual(
dates[1],
datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
self.assertEqual(
dates[2],
datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="YMD") assert dates == [
def test_filename_date_parse_valid_ymd(self, *args): datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
datetime.datetime(
2022,
7,
22,
0,
0,
tzinfo=settings_timezone,
),
datetime.datetime(
2021,
12,
1,
0,
0,
tzinfo=settings_timezone,
),
]
def test_filename_date_parse_valid_ymd(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
""" """
GIVEN: GIVEN:
- Date parsing from the filename is enabled - Date parsing from the filename is enabled
@ -285,13 +396,18 @@ class TestDate(TestCase):
THEN: THEN:
- Should parse the date from the filename - Should parse the date from the filename
""" """
self.assertEqual( settings.FILENAME_DATE_ORDER = "YMD"
parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"),
datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="DMY") assert parse_date(
def test_filename_date_parse_valid_dmy(self, *args): "/tmp/Scan-2022-04-01.pdf",
"No date in here",
) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
def test_filename_date_parse_valid_dmy(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
""" """
GIVEN: GIVEN:
- Date parsing from the filename is enabled - Date parsing from the filename is enabled
@ -301,13 +417,13 @@ class TestDate(TestCase):
THEN: THEN:
- Should parse the date from the filename - Should parse the date from the filename
""" """
self.assertEqual( settings.FILENAME_DATE_ORDER = "DMY"
parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"), assert parse_date(
datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), "/tmp/Scan-10.01.2021.pdf",
) "No date in here",
) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
@override_settings(FILENAME_DATE_ORDER="YMD") def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
def test_filename_date_parse_invalid(self, *args):
""" """
GIVEN: GIVEN:
- Date parsing from the filename is enabled - Date parsing from the filename is enabled
@ -317,15 +433,14 @@ class TestDate(TestCase):
THEN: THEN:
- No date is parsed - No date is parsed
""" """
self.assertIsNone( settings.FILENAME_DATE_ORDER = "YMD"
parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"), assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
)
@override_settings( def test_filename_date_ignored_use_content(
FILENAME_DATE_ORDER="YMD", self,
IGNORE_DATES=(datetime.date(2022, 4, 1),), settings: SettingsWrapper,
) settings_timezone: ZoneInfo,
def test_filename_date_ignored_use_content(self, *args): ):
""" """
GIVEN: GIVEN:
- Date parsing from the filename is enabled - Date parsing from the filename is enabled
@ -338,15 +453,18 @@ class TestDate(TestCase):
THEN: THEN:
- Should parse the date from the content not filename - Should parse the date from the content not filename
""" """
self.assertEqual( settings.FILENAME_DATE_ORDER = "YMD"
parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"), settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), assert parse_date(
) "/tmp/Scan-2022-04-01.pdf",
"The matching date is 24.03.2022",
) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
@override_settings( def test_ignored_dates_default_order(
IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), self,
) settings: SettingsWrapper,
def test_ignored_dates_default_order(self, *args): settings_timezone: ZoneInfo,
):
""" """
GIVEN: GIVEN:
- Ignore dates have been set - Ignore dates have been set
@ -356,17 +474,22 @@ class TestDate(TestCase):
THEN: THEN:
- Should parse the date non-ignored date from content - Should parse the date non-ignored date from content
""" """
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum" text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2018,
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 2,
13,
0,
0,
tzinfo=settings_timezone,
) )
@override_settings( def test_ignored_dates_order_ymd(
IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), self,
DATE_ORDER="YMD", settings: SettingsWrapper,
) settings_timezone: ZoneInfo,
def test_ignored_dates_order_ymd(self, *args): ):
""" """
GIVEN: GIVEN:
- Ignore dates have been set - Ignore dates have been set
@ -377,9 +500,17 @@ class TestDate(TestCase):
THEN: THEN:
- Should parse the date non-ignored date from content - Should parse the date non-ignored date from content
""" """
settings.FILENAME_DATE_ORDER = "YMD"
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum" text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
self.assertEqual( assert parse_date("", text) == datetime.datetime(
parse_date("", text), 2018,
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), 2,
13,
0,
0,
tzinfo=settings_timezone,
) )

View File

@ -52,7 +52,12 @@ class MailDocumentParser(DocumentParser):
return PdfAFormat.A3b return PdfAFormat.A3b
return None return None
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None): def get_thumbnail(
self,
document_path: Path,
mime_type: str,
file_name=None,
) -> Path:
if not self.archive_path: if not self.archive_path:
self.archive_path = self.generate_pdf( self.archive_path = self.generate_pdf(
self.parse_file_to_message(document_path), self.parse_file_to_message(document_path),

View File

@ -0,0 +1,89 @@
import os
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount
from paperless_mail.parsers import MailDocumentParser
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """
    Resolved path of the "samples" directory located next to this file.
    """
    here = Path(__file__).parent
    return here.joinpath("samples").resolve()
@pytest.fixture(scope="session")
def broken_email_file(sample_dir: Path) -> Path:
    """
    Path of the "broken.eml" sample inside the sample directory.
    """
    return sample_dir.joinpath("broken.eml")
@pytest.fixture(scope="session")
def simple_txt_email_file(sample_dir: Path) -> Path:
    """
    Path of the "simple_text.eml" sample inside the sample directory.
    """
    return sample_dir.joinpath("simple_text.eml")
@pytest.fixture(scope="session")
def simple_txt_email_pdf_file(sample_dir: Path) -> Path:
    """
    Path of the "simple_text.eml.pdf" sample inside the sample directory.
    """
    return sample_dir.joinpath("simple_text.eml.pdf")
@pytest.fixture(scope="session")
def simple_txt_email_thumbnail_file(sample_dir: Path) -> Path:
    """
    Path of the "simple_text.eml.pdf.webp" sample inside the sample directory.
    """
    return sample_dir.joinpath("simple_text.eml.pdf.webp")
@pytest.fixture(scope="session")
def html_email_file(sample_dir: Path) -> Path:
    """
    Path of the "html.eml" sample inside the sample directory.
    """
    return sample_dir.joinpath("html.eml")
@pytest.fixture(scope="session")
def html_email_pdf_file(sample_dir: Path) -> Path:
    """
    Path of the "html.eml.pdf" sample inside the sample directory.
    """
    return sample_dir.joinpath("html.eml.pdf")
@pytest.fixture(scope="session")
def html_email_thumbnail_file(sample_dir: Path) -> Path:
    """
    Path of the "html.eml.pdf.webp" sample inside the sample directory.
    """
    return sample_dir.joinpath("html.eml.pdf.webp")
@pytest.fixture(scope="session")
def html_email_html_file(sample_dir: Path) -> Path:
    """
    Path of the "html.eml.html" sample inside the sample directory.
    """
    return sample_dir.joinpath("html.eml.html")
@pytest.fixture(scope="session")
def merged_pdf_first(sample_dir: Path) -> Path:
    """
    Path of the "first.pdf" sample inside the sample directory.
    """
    return sample_dir.joinpath("first.pdf")
@pytest.fixture(scope="session")
def merged_pdf_second(sample_dir: Path) -> Path:
    """
    Path of the "second.pdf" sample inside the sample directory.
    """
    return sample_dir.joinpath("second.pdf")
@pytest.fixture()
def mail_parser() -> Generator[MailDocumentParser, None, None]:
    """
    Provide a fresh MailDocumentParser and clean it up once the test is done.

    The parser is yielded instead of returned so that its cleanup() method is
    always invoked after the test body, whether the test passed or failed.
    The unittest-style setup this fixture replaces called cleanup() in its
    tearDown; presumably that releases the parser's temporary working files
    (confirm against DocumentParser.cleanup).
    """
    parser = MailDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture()
def live_mail_account() -> Generator[MailAccount, None, None]:
    """
    Create a MailAccount for the live test server and delete it afterwards.

    The connection details are read from the PAPERLESS_MAIL_TEST_HOST,
    PAPERLESS_MAIL_TEST_USER and PAPERLESS_MAIL_TEST_PASSWD environment
    variables; a missing variable raises KeyError. The port is fixed at 993.
    """
    # Create the account before entering the try block: if creation fails
    # there is nothing to delete, and the real error must not be masked by
    # an UnboundLocalError raised from the finally clause.
    account = MailAccount.objects.create(
        name="test",
        imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"],
        username=os.environ["PAPERLESS_MAIL_TEST_USER"],
        password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
        imap_port=993,
    )
    try:
        yield account
    finally:
        account.delete()
@pytest.fixture()
def mail_account_handler() -> MailAccountHandler:
    """
    Provide a newly constructed MailAccountHandler for each test.
    """
    handler = MailAccountHandler()
    return handler

View File

@ -1,7 +1,7 @@
import os import os
import warnings
import pytest import pytest
from django.test import TestCase
from paperless_mail.mail import MailAccountHandler from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError from paperless_mail.mail import MailError
@ -16,53 +16,46 @@ from paperless_mail.models import MailRule
or not len(os.environ["PAPERLESS_MAIL_TEST_HOST"]), or not len(os.environ["PAPERLESS_MAIL_TEST_HOST"]),
reason="Live server testing not enabled", reason="Live server testing not enabled",
) )
class TestMailLiveServer(TestCase): @pytest.mark.django_db()
def setUp(self) -> None: class TestMailLiveServer:
self.mail_account_handler = MailAccountHandler() def test_process_non_gmail_server_flag(
self.account = MailAccount.objects.create( self,
name="test", mail_account_handler: MailAccountHandler,
imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"], live_mail_account: MailAccount,
username=os.environ["PAPERLESS_MAIL_TEST_USER"], ):
password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
imap_port=993,
)
return super().setUp()
def tearDown(self) -> None:
self.account.delete()
return super().tearDown()
def test_process_non_gmail_server_flag(self):
try: try:
rule1 = MailRule.objects.create( rule1 = MailRule.objects.create(
name="testrule", name="testrule",
account=self.account, account=live_mail_account,
action=MailRule.MailAction.FLAG, action=MailRule.MailAction.FLAG,
) )
self.mail_account_handler.handle_mail_account(self.account) mail_account_handler.handle_mail_account(live_mail_account)
rule1.delete() rule1.delete()
except MailError as e: except MailError as e:
self.fail(f"Failure: {e}") pytest.fail(f"Failure: {e}")
except Exception: except Exception as e:
pass warnings.warn(f"Unhandled exception: {e}")
def test_process_non_gmail_server_tag(self): def test_process_non_gmail_server_tag(
self,
mail_account_handler: MailAccountHandler,
live_mail_account: MailAccount,
):
try: try:
rule2 = MailRule.objects.create( rule2 = MailRule.objects.create(
name="testrule", name="testrule",
account=self.account, account=live_mail_account,
action=MailRule.MailAction.TAG, action=MailRule.MailAction.TAG,
) )
self.mail_account_handler.handle_mail_account(self.account) mail_account_handler.handle_mail_account(live_mail_account)
rule2.delete() rule2.delete()
except MailError as e: except MailError as e:
self.fail(f"Failure: {e}") pytest.fail(f"Failure: {e}")
except Exception: except Exception as e:
pass warnings.warn(f"Unhandled exception: {e}")

View File

@ -1,39 +1,29 @@
import datetime import datetime
import logging
from pathlib import Path from pathlib import Path
from unittest import mock
import httpx import httpx
from django.test import TestCase import pytest
from django.test.html import parse_html
from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock
from pytest_mock import MockerFixture
from documents.parsers import ParseError from documents.parsers import ParseError
from documents.tests.utils import FileSystemAssertsMixin
from paperless_mail.parsers import MailDocumentParser from paperless_mail.parsers import MailDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class BaseMailParserTestCase(TestCase): class TestEmailFileParsing:
"""
Basic setup for the below test cases
"""
SAMPLE_DIR = Path(__file__).parent / "samples"
def setUp(self) -> None:
super().setUp()
self.parser = MailDocumentParser(logging_group=None)
def tearDown(self) -> None:
super().tearDown()
self.parser.cleanup()
class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
""" """
Tests around reading a file and parsing it into a Tests around reading a file and parsing it into a
MailMessage MailMessage
""" """
def test_parse_error_missing_file(self): def test_parse_error_missing_file(
self,
mail_parser: MailDocumentParser,
sample_dir: Path,
):
""" """
GIVEN: GIVEN:
- Fresh parser - Fresh parser
@ -43,17 +33,18 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
- An Exception is thrown - An Exception is thrown
""" """
# Check if exception is raised when parsing fails. # Check if exception is raised when parsing fails.
test_file = self.SAMPLE_DIR / "doesntexist.eml" test_file = sample_dir / "doesntexist.eml"
self.assertIsNotFile(test_file) assert not test_file.exists()
self.assertRaises(
ParseError,
self.parser.parse,
test_file,
"messages/rfc822",
)
def test_parse_error_invalid_email(self): with pytest.raises(ParseError):
mail_parser.parse(test_file, "messages/rfc822")
def test_parse_error_invalid_email(
self,
mail_parser: MailDocumentParser,
broken_email_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh parser - Fresh parser
@ -63,14 +54,15 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
- An Exception is thrown - An Exception is thrown
""" """
# Check if exception is raised when the mail is faulty. # Check if exception is raised when the mail is faulty.
self.assertRaises(
ParseError,
self.parser.parse,
self.SAMPLE_DIR / "broken.eml",
"messages/rfc822",
)
def test_parse_simple_text_email_file(self): with pytest.raises(ParseError):
mail_parser.parse(broken_email_file, "messages/rfc822")
def test_parse_simple_text_email_file(
self,
mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh parser - Fresh parser
@ -80,29 +72,31 @@ class TestEmailFileParsing(FileSystemAssertsMixin, BaseMailParserTestCase):
- The content of the mail should be available in the parse result. - The content of the mail should be available in the parse result.
""" """
# Parse Test file and check relevant content # Parse Test file and check relevant content
parsed1 = self.parser.parse_file_to_message( parsed_msg = mail_parser.parse_file_to_message(simple_txt_email_file)
self.SAMPLE_DIR / "simple_text.eml",
)
self.assertEqual(parsed1.date.year, 2022) assert parsed_msg.date.year == 2022
self.assertEqual(parsed1.date.month, 10) assert parsed_msg.date.month == 10
self.assertEqual(parsed1.date.day, 12) assert parsed_msg.date.day == 12
self.assertEqual(parsed1.date.hour, 21) assert parsed_msg.date.hour == 21
self.assertEqual(parsed1.date.minute, 40) assert parsed_msg.date.minute == 40
self.assertEqual(parsed1.date.second, 43) assert parsed_msg.date.second == 43
self.assertEqual(parsed1.date.tzname(), "UTC+02:00") assert parsed_msg.date.tzname() == "UTC+02:00"
self.assertEqual(parsed1.from_, "mail@someserver.de") assert parsed_msg.from_ == "mail@someserver.de"
self.assertEqual(parsed1.subject, "Simple Text Mail") assert parsed_msg.subject == "Simple Text Mail"
self.assertEqual(parsed1.text, "This is just a simple Text Mail.\n") assert parsed_msg.text == "This is just a simple Text Mail.\n"
self.assertEqual(parsed1.to, ("some@one.de",)) assert parsed_msg.to == ("some@one.de",)
class TestEmailMetadataExtraction(BaseMailParserTestCase): class TestEmailMetadataExtraction:
""" """
Tests extraction of metadata from an email Tests extraction of metadata from an email
""" """
def test_extract_metadata_fail(self): def test_extract_metadata_fail(
self,
caplog: pytest.LogCaptureFixture,
mail_parser: MailDocumentParser,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -112,14 +106,20 @@ class TestEmailMetadataExtraction(BaseMailParserTestCase):
- A log warning should be generated - A log warning should be generated
""" """
# Validate if warning is logged when parsing fails # Validate if warning is logged when parsing fails
with self.assertLogs("paperless.parsing.mail", level="WARNING") as cm: assert mail_parser.extract_metadata("na", "message/rfc822") == []
self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822"))
self.assertIn(
"WARNING:paperless.parsing.mail:Error while fetching document metadata for na",
cm.output[0],
)
def test_extract_metadata(self): assert len(caplog.records) == 1
record = caplog.records[0]
assert record.levelno == logging.WARNING
assert record.name == "paperless.parsing.mail"
assert "Error while fetching document metadata for na" in record.message
def test_extract_metadata(
self,
mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -129,149 +129,110 @@ class TestEmailMetadataExtraction(BaseMailParserTestCase):
- metadata is returned - metadata is returned
""" """
# Validate Metadata parsing returns the expected results # Validate Metadata parsing returns the expected results
metadata = self.parser.extract_metadata( metadata = mail_parser.extract_metadata(simple_txt_email_file, "message/rfc822")
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
)
self.assertIn( assert {
{"namespace": "", "prefix": "", "key": "attachments", "value": ""}, "namespace": "",
metadata, "prefix": "",
) "key": "attachments",
self.assertIn( "value": "",
{ } in metadata
assert {
"namespace": "", "namespace": "",
"prefix": "", "prefix": "",
"key": "date", "key": "date",
"value": "2022-10-12 21:40:43 UTC+02:00", "value": "2022-10-12 21:40:43 UTC+02:00",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "content-language", "key": "content-language",
"value": "en-US", "value": "en-US",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "content-type", "key": "content-type",
"value": "text/plain; charset=UTF-8; format=flowed", "value": "text/plain; charset=UTF-8; format=flowed",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "date", "key": "date",
"value": "Wed, 12 Oct 2022 21:40:43 +0200", "value": "Wed, 12 Oct 2022 21:40:43 +0200",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "delivered-to", "key": "delivered-to",
"value": "mail@someserver.de", "value": "mail@someserver.de",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "from", "key": "from",
"value": "Some One <mail@someserver.de>", "value": "Some One <mail@someserver.de>",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "message-id", "key": "message-id",
"value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>", "value": "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "mime-version", "key": "mime-version",
"value": "1.0", "value": "1.0",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "received", "key": "received",
"value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from <mail@someserver.de>)\n\tfor <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)", "value": "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from <mail@someserver.de>)\n\tfor <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "return-path", "key": "return-path",
"value": "<mail@someserver.de>", "value": "<mail@someserver.de>",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "subject", "key": "subject",
"value": "Simple Text Mail", "value": "Simple Text Mail",
}, } in metadata
metadata, assert {
) "namespace": "",
self.assertIn( "prefix": "header",
{"namespace": "", "prefix": "header", "key": "to", "value": "some@one.de"}, "key": "to",
metadata, "value": "some@one.de",
) } in metadata
self.assertIn( assert {
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "user-agent", "key": "user-agent",
"value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1", "value": "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1",
}, } in metadata
metadata, assert {
)
self.assertIn(
{
"namespace": "", "namespace": "",
"prefix": "header", "prefix": "header",
"key": "x-last-tls-session-version", "key": "x-last-tls-session-version",
"value": "TLSv1.3", "value": "TLSv1.3",
}, } in metadata
metadata,
)
class TestEmailThumbnailGenerate(BaseMailParserTestCase): class TestEmailThumbnailGenerate:
""" """
Tests the correct generation of a thumbnail for an email Tests the correct generation of a thumbnail for an email
""" """
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
@mock.patch("paperless_mail.parsers.make_thumbnail_from_pdf")
def test_get_thumbnail( def test_get_thumbnail(
self, self,
mock_make_thumbnail_from_pdf: mock.MagicMock, mocker: MockerFixture,
mock_generate_pdf: mock.MagicMock, mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
): ):
""" """
GIVEN: GIVEN:
@ -282,29 +243,34 @@ class TestEmailThumbnailGenerate(BaseMailParserTestCase):
- The parser should call the functions which generate the thumbnail - The parser should call the functions which generate the thumbnail
""" """
mocked_return = "Passing the return value through.." mocked_return = "Passing the return value through.."
mock_make_thumbnail_from_pdf = mocker.patch(
"paperless_mail.parsers.make_thumbnail_from_pdf",
)
mock_make_thumbnail_from_pdf.return_value = mocked_return mock_make_thumbnail_from_pdf.return_value = mocked_return
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
mock_generate_pdf.return_value = "Mocked return value.." mock_generate_pdf.return_value = "Mocked return value.."
test_file = self.SAMPLE_DIR / "simple_text.eml" thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
thumb = self.parser.get_thumbnail(
test_file,
"message/rfc822",
)
mock_generate_pdf.assert_called_once() mock_generate_pdf.assert_called_once()
mock_make_thumbnail_from_pdf.assert_called_once_with( mock_make_thumbnail_from_pdf.assert_called_once_with(
"Mocked return value..", "Mocked return value..",
self.parser.tempdir, mail_parser.tempdir,
None, None,
) )
self.assertEqual(mocked_return, thumb) assert mocked_return == thumb
class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase): class TestTikaHtmlParse:
def test_tika_parse_unsuccessful(self): def test_tika_parse_unsuccessful(
self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -314,13 +280,13 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
- the parser should return an empty string - the parser should return an empty string
""" """
# Check unsuccessful parsing # Check unsuccessful parsing
self.httpx_mock.add_response( httpx_mock.add_response(
json={"Content-Type": "text/html", "X-TIKA:Parsed-By": []}, json={"Content-Type": "text/html", "X-TIKA:Parsed-By": []},
) )
parsed = self.parser.tika_parse("None") parsed = mail_parser.tika_parse("None")
self.assertEqual("", parsed) assert parsed == ""
def test_tika_parse(self): def test_tika_parse(self, httpx_mock: HTTPXMock, mail_parser: MailDocumentParser):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -332,18 +298,22 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>' html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
expected_text = "Some Text" expected_text = "Some Text"
self.httpx_mock.add_response( httpx_mock.add_response(
json={ json={
"Content-Type": "text/html", "Content-Type": "text/html",
"X-TIKA:Parsed-By": [], "X-TIKA:Parsed-By": [],
"X-TIKA:content": expected_text, "X-TIKA:content": expected_text,
}, },
) )
parsed = self.parser.tika_parse(html) parsed = mail_parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip()) assert expected_text == parsed.strip()
self.assertIn("http://localhost:9998", str(self.httpx_mock.get_request().url)) assert "http://localhost:9998" in str(httpx_mock.get_request().url)
def test_tika_parse_exception(self): def test_tika_parse_exception(
self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -354,11 +324,16 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
""" """
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>' html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR) httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
self.assertRaises(ParseError, self.parser.tika_parse, html) with pytest.raises(ParseError):
mail_parser.tika_parse(html)
def test_tika_parse_unreachable(self): def test_tika_parse_unreachable(
self,
settings: SettingsWrapper,
mail_parser: MailDocumentParser,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -370,30 +345,18 @@ class TestTikaHtmlParse(HttpxMockMixin, BaseMailParserTestCase):
html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>' html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'
# Check if exception is raised when Tika cannot be reached. # Check if exception is raised when Tika cannot be reached.
self.parser.tika_server = "" with pytest.raises(ParseError):
self.assertRaises(ParseError, self.parser.tika_parse, html) settings.TIKA_ENDPOINT = "http://does-not-exist:9998"
mail_parser.tika_parse(html)
class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase): class TestParser:
def test_parse_no_file(self): def test_parse_eml_simple(
""" self,
GIVEN: mocker: MockerFixture,
- Fresh start mail_parser: MailDocumentParser,
WHEN: simple_txt_email_file: Path,
- parsing is attempted with nonexistent file ):
THEN:
- Exception is thrown
"""
# Check if exception is raised when parsing fails.
self.assertRaises(
ParseError,
self.parser.parse,
self.SAMPLE_DIR / "na.eml",
"message/rfc822",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_eml_simple(self, mock_generate_pdf: mock.MagicMock):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -403,11 +366,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
- parsed information is available - parsed information is available
""" """
# Validate parsing returns the expected results # Validate parsing returns the expected results
mock_generate_pdf = mocker.patch(
self.parser.parse( "paperless_mail.parsers.MailDocumentParser.generate_pdf",
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
) )
mail_parser.parse(simple_txt_email_file, "message/rfc822")
text_expected = ( text_expected = (
"Subject: Simple Text Mail\n\n" "Subject: Simple Text Mail\n\n"
"From: Some One <mail@someserver.de>\n\n" "From: Some One <mail@someserver.de>\n\n"
@ -416,8 +379,8 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
"BCC: fdf@fvf.de\n\n" "BCC: fdf@fvf.de\n\n"
"\n\nThis is just a simple Text Mail." "\n\nThis is just a simple Text Mail."
) )
self.assertEqual(text_expected, self.parser.text) assert text_expected == mail_parser.text
self.assertEqual( assert (
datetime.datetime( datetime.datetime(
2022, 2022,
10, 10,
@ -426,15 +389,20 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
40, 40,
43, 43,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)), tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
), )
self.parser.date, == mail_parser.date
) )
# Just check if tried to generate archive, the unittest for generate_pdf() goes deeper. # Just check if tried to generate archive, the unittest for generate_pdf() goes deeper.
mock_generate_pdf.assert_called() mock_generate_pdf.assert_called()
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") def test_parse_eml_html(
def test_parse_eml_html(self, mock_generate_pdf: mock.MagicMock): self,
mocker: MockerFixture,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
html_email_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -443,6 +411,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
THEN: THEN:
- Tika is called, parsed information from non html parts is available - Tika is called, parsed information from non html parts is available
""" """
mock_generate_pdf = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf",
)
# Validate parsing returns the expected results # Validate parsing returns the expected results
text_expected = ( text_expected = (
"Subject: HTML Message\n\n" "Subject: HTML Message\n\n"
@ -453,7 +426,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
"Some Text and an embedded image." "Some Text and an embedded image."
) )
self.httpx_mock.add_response( httpx_mock.add_response(
json={ json={
"Content-Type": "text/html", "Content-Type": "text/html",
"X-TIKA:Parsed-By": [], "X-TIKA:Parsed-By": [],
@ -461,11 +434,11 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
}, },
) )
self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") mail_parser.parse(html_email_file, "message/rfc822")
mock_generate_pdf.assert_called_once() mock_generate_pdf.assert_called_once()
self.assertEqual(text_expected, self.parser.text) assert text_expected == mail_parser.text
self.assertEqual( assert (
datetime.datetime( datetime.datetime(
2022, 2022,
10, 10,
@ -474,11 +447,16 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
23, 23,
19, 19,
tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)), tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
), )
self.parser.date, == mail_parser.date
) )
def test_generate_pdf_parse_error(self): def test_generate_pdf_parse_error(
self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -487,16 +465,18 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
THEN: THEN:
- a ParseError Exception is thrown - a ParseError Exception is thrown
""" """
self.httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR) httpx_mock.add_response(status_code=httpx.codes.INTERNAL_SERVER_ERROR)
self.assertRaises( with pytest.raises(ParseError):
ParseError, mail_parser.parse(simple_txt_email_file, "message/rfc822")
self.parser.parse,
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
)
def test_generate_pdf_simple_email(self): def test_generate_pdf_simple_email(
self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
simple_txt_email_pdf_file: Path,
):
""" """
GIVEN: GIVEN:
- Simple text email with no HTML content - Simple text email with no HTML content
@ -507,17 +487,23 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
- Archive file is generated - Archive file is generated
""" """
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html", url="http://localhost:3000/forms/chromium/convert/html",
method="POST", method="POST",
content=(self.SAMPLE_DIR / "simple_text.eml.pdf").read_bytes(), content=simple_txt_email_pdf_file.read_bytes(),
) )
self.parser.parse(self.SAMPLE_DIR / "simple_text.eml", "message/rfc822") mail_parser.parse(simple_txt_email_file, "message/rfc822")
self.assertIsNotNone(self.parser.archive_path) assert mail_parser.archive_path is not None
def test_generate_pdf_html_email(self): def test_generate_pdf_html_email(
self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
html_email_file: Path,
html_email_pdf_file: Path,
):
""" """
GIVEN: GIVEN:
- email with HTML content - email with HTML content
@ -528,7 +514,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
- Gotenberg is used to merge the two PDFs - Gotenberg is used to merge the two PDFs
- Archive file is generated - Archive file is generated
""" """
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:9998/tika/text", url="http://localhost:9998/tika/text",
method="PUT", method="PUT",
json={ json={
@ -537,21 +523,27 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
"X-TIKA:content": "This is some Tika HTML text", "X-TIKA:content": "This is some Tika HTML text",
}, },
) )
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html", url="http://localhost:3000/forms/chromium/convert/html",
method="POST", method="POST",
content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(), content=html_email_pdf_file.read_bytes(),
) )
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:3000/forms/pdfengines/merge", url="http://localhost:3000/forms/pdfengines/merge",
method="POST", method="POST",
content=b"Pretend merged PDF content", content=b"Pretend merged PDF content",
) )
self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") mail_parser.parse(html_email_file, "message/rfc822")
self.assertIsNotNone(self.parser.archive_path) assert mail_parser.archive_path is not None
def test_generate_pdf_html_email_html_to_pdf_failure(self): def test_generate_pdf_html_email_html_to_pdf_failure(
self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
html_email_file: Path,
html_email_pdf_file: Path,
):
""" """
GIVEN: GIVEN:
- email with HTML content - email with HTML content
@ -561,7 +553,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
THEN: THEN:
- ParseError is raised - ParseError is raised
""" """
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:9998/tika/text", url="http://localhost:9998/tika/text",
method="PUT", method="PUT",
json={ json={
@ -570,20 +562,26 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
"X-TIKA:content": "This is some Tika HTML text", "X-TIKA:content": "This is some Tika HTML text",
}, },
) )
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html", url="http://localhost:3000/forms/chromium/convert/html",
method="POST", method="POST",
content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(), content=html_email_pdf_file.read_bytes(),
) )
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html", url="http://localhost:3000/forms/chromium/convert/html",
method="POST", method="POST",
status_code=httpx.codes.INTERNAL_SERVER_ERROR, status_code=httpx.codes.INTERNAL_SERVER_ERROR,
) )
with self.assertRaises(ParseError): with pytest.raises(ParseError):
self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") mail_parser.parse(html_email_file, "message/rfc822")
def test_generate_pdf_html_email_merge_failure(self): def test_generate_pdf_html_email_merge_failure(
self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
html_email_file: Path,
html_email_pdf_file: Path,
):
""" """
GIVEN: GIVEN:
- email with HTML content - email with HTML content
@ -593,7 +591,7 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
THEN: THEN:
- ParseError is raised - ParseError is raised
""" """
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:9998/tika/text", url="http://localhost:9998/tika/text",
method="PUT", method="PUT",
json={ json={
@ -602,20 +600,25 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
"X-TIKA:content": "This is some Tika HTML text", "X-TIKA:content": "This is some Tika HTML text",
}, },
) )
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:3000/forms/chromium/convert/html", url="http://localhost:3000/forms/chromium/convert/html",
method="POST", method="POST",
content=(self.SAMPLE_DIR / "html.eml.pdf").read_bytes(), content=html_email_pdf_file.read_bytes(),
) )
self.httpx_mock.add_response( httpx_mock.add_response(
url="http://localhost:3000/forms/pdfengines/merge", url="http://localhost:3000/forms/pdfengines/merge",
method="POST", method="POST",
status_code=httpx.codes.INTERNAL_SERVER_ERROR, status_code=httpx.codes.INTERNAL_SERVER_ERROR,
) )
with self.assertRaises(ParseError): with pytest.raises(ParseError):
self.parser.parse(self.SAMPLE_DIR / "html.eml", "message/rfc822") mail_parser.parse(html_email_file, "message/rfc822")
def test_mail_to_html(self): def test_mail_to_html(
self,
mail_parser: MailDocumentParser,
html_email_file: Path,
html_email_html_file: Path,
):
""" """
GIVEN: GIVEN:
- Email message with HTML content - Email message with HTML content
@ -624,14 +627,19 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
THEN: THEN:
- Resulting HTML is as expected - Resulting HTML is as expected
""" """
mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml") mail = mail_parser.parse_file_to_message(html_email_file)
html_file = self.parser.mail_to_html(mail) html_file = mail_parser.mail_to_html(mail)
expected_html_file = self.SAMPLE_DIR / "html.eml.html"
self.assertHTMLEqual(expected_html_file.read_text(), html_file.read_text()) expected_html = parse_html(html_email_html_file.read_text())
actual_html = parse_html(html_file.read_text())
assert expected_html == actual_html
def test_generate_pdf_from_mail( def test_generate_pdf_from_mail(
self, self,
httpx_mock: HTTPXMock,
mail_parser: MailDocumentParser,
html_email_file: Path,
): ):
""" """
GIVEN: GIVEN:
@ -642,16 +650,13 @@ class TestParser(FileSystemAssertsMixin, HttpxMockMixin, BaseMailParserTestCase)
- Gotenberg is used to convert HTML to PDF - Gotenberg is used to convert HTML to PDF
""" """
self.httpx_mock.add_response(content=b"Content") httpx_mock.add_response(content=b"Content")
mail = self.parser.parse_file_to_message(self.SAMPLE_DIR / "html.eml") mail = mail_parser.parse_file_to_message(html_email_file)
retval = self.parser.generate_pdf_from_mail(mail) retval = mail_parser.generate_pdf_from_mail(mail)
self.assertEqual(b"Content", retval.read_bytes()) assert retval.read_bytes() == b"Content"
request = self.httpx_mock.get_request() request = httpx_mock.get_request()
self.assertEqual( assert str(request.url) == "http://localhost:3000/forms/chromium/convert/html"
str(request.url),
"http://localhost:3000/forms/chromium/convert/html",
)

View File

@ -3,17 +3,15 @@ import shutil
import subprocess import subprocess
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from unittest import mock
import httpx import httpx
import pytest import pytest
from django.test import TestCase
from imagehash import average_hash from imagehash import average_hash
from PIL import Image from PIL import Image
from pytest_mock import MockerFixture
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import util_call_with_backoff from documents.tests.utils import util_call_with_backoff
from paperless_mail.tests.test_parsers import BaseMailParserTestCase from paperless_mail.parsers import MailDocumentParser
def extract_text(pdf_path: Path) -> str: def extract_text(pdf_path: Path) -> str:
@ -50,7 +48,7 @@ class MailAttachmentMock:
"PAPERLESS_CI_TEST" not in os.environ, "PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with", reason="No Gotenberg/Tika servers to test with",
) )
class TestUrlCanary(TestCase): class TestUrlCanary:
""" """
Verify certain URLs are still available so testing is valid still Verify certain URLs are still available so testing is valid still
""" """
@ -69,13 +67,13 @@ class TestUrlCanary(TestCase):
whether this image stays online forever, so here we check if we can detect if it is not whether this image stays online forever, so here we check if we can detect if it is not
available anymore. available anymore.
""" """
with self.assertRaises(httpx.HTTPStatusError) as cm: with pytest.raises(httpx.HTTPStatusError) as exec_info:
resp = httpx.get( resp = httpx.get(
"https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png", "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
) )
resp.raise_for_status() resp.raise_for_status()
self.assertEqual(cm.exception.response.status_code, httpx.codes.NOT_FOUND) assert exec_info.value.response.status_code == httpx.codes.NOT_FOUND
def test_is_online_image_still_available(self): def test_is_online_image_still_available(self):
""" """
@ -100,13 +98,19 @@ class TestUrlCanary(TestCase):
"PAPERLESS_CI_TEST" not in os.environ, "PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with", reason="No Gotenberg/Tika servers to test with",
) )
class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase): class TestParserLive:
@staticmethod @staticmethod
def imagehash(file, hash_size=18): def imagehash(file, hash_size=18):
return f"{average_hash(Image.open(file), hash_size)}" return f"{average_hash(Image.open(file), hash_size)}"
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") def test_get_thumbnail(
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock): self,
mocker: MockerFixture,
mail_parser: MailDocumentParser,
simple_txt_email_file: Path,
simple_txt_email_pdf_file: Path,
simple_txt_email_thumbnail_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -115,22 +119,21 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
THEN: THEN:
- The returned thumbnail image file is as expected - The returned thumbnail image file is as expected
""" """
mock_generate_pdf.return_value = self.SAMPLE_DIR / "simple_text.eml.pdf" mock_generate_pdf = mocker.patch(
thumb = self.parser.get_thumbnail( "paperless_mail.parsers.MailDocumentParser.generate_pdf",
self.SAMPLE_DIR / "simple_text.eml",
"message/rfc822",
) )
self.assertIsFile(thumb) mock_generate_pdf.return_value = simple_txt_email_pdf_file
expected = self.SAMPLE_DIR / "simple_text.eml.pdf.webp" thumb = mail_parser.get_thumbnail(simple_txt_email_file, "message/rfc822")
self.assertEqual( assert thumb.exists()
self.imagehash(thumb), assert thumb.is_file()
self.imagehash(expected),
f"Created Thumbnail {thumb} differs from expected file {expected}",
)
def test_tika_parse_successful(self): assert (
self.imagehash(thumb) == self.imagehash(simple_txt_email_thumbnail_file)
), f"Created Thumbnail {thumb} differs from expected file {simple_txt_email_thumbnail_file}"
def test_tika_parse_successful(self, mail_parser: MailDocumentParser):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -143,15 +146,16 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
expected_text = "Some Text" expected_text = "Some Text"
# Check successful parsing # Check successful parsing
parsed = self.parser.tika_parse(html) parsed = mail_parser.tika_parse(html)
self.assertEqual(expected_text, parsed.strip()) assert expected_text == parsed.strip()
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_gotenberg_merging( def test_generate_pdf_gotenberg_merging(
self, self,
mock_generate_pdf_from_html: mock.MagicMock, mocker: MockerFixture,
mock_generate_pdf_from_mail: mock.MagicMock, mail_parser: MailDocumentParser,
html_email_file: Path,
merged_pdf_first: Path,
merged_pdf_second: Path,
): ):
""" """
GIVEN: GIVEN:
@ -161,61 +165,67 @@ class TestParserLive(FileSystemAssertsMixin, BaseMailParserTestCase):
THEN: THEN:
- gotenberg is called to merge files and the resulting file is returned - gotenberg is called to merge files and the resulting file is returned
""" """
mock_generate_pdf_from_mail.return_value = self.SAMPLE_DIR / "first.pdf" mock_generate_pdf_from_html = mocker.patch(
mock_generate_pdf_from_html.return_value = self.SAMPLE_DIR / "second.pdf" "paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html",
msg = self.parser.parse_file_to_message(
self.SAMPLE_DIR / "html.eml",
) )
mock_generate_pdf_from_mail = mocker.patch(
"paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail",
)
mock_generate_pdf_from_mail.return_value = merged_pdf_first
mock_generate_pdf_from_html.return_value = merged_pdf_second
msg = mail_parser.parse_file_to_message(html_email_file)
_, pdf_path = util_call_with_backoff( _, pdf_path = util_call_with_backoff(
self.parser.generate_pdf, mail_parser.generate_pdf,
[msg], [msg],
) )
self.assertIsFile(pdf_path) assert pdf_path.exists()
assert pdf_path.is_file()
extracted = extract_text(pdf_path) extracted = extract_text(pdf_path)
expected = ( expected = (
"first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c" "first PDF to be merged.\n\x0csecond PDF to be merged.\n\x0c"
) )
self.assertEqual(expected, extracted) assert expected == extracted
def test_generate_pdf_from_mail(self): def test_generate_pdf_from_mail(
self,
mail_parser: MailDocumentParser,
html_email_file: Path,
html_email_pdf_file: Path,
html_email_thumbnail_file: Path,
):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
WHEN: WHEN:
- pdf generation from simple eml file is requested - pdf generation from simple eml file is requested
THEN: THEN:
- gotenberg is called and the resulting file is returned and look as expected. - Gotenberg is called and the resulting file is returned and look as expected.
""" """
util_call_with_backoff( util_call_with_backoff(mail_parser.parse, [html_email_file, "message/rfc822"])
self.parser.parse,
[self.SAMPLE_DIR / "html.eml", "message/rfc822"],
)
# Check the archive PDF # Check the archive PDF
archive_path = self.parser.get_archive_path() archive_path = mail_parser.get_archive_path()
archive_text = extract_text(archive_path) archive_text = extract_text(archive_path)
expected_archive_text = extract_text(self.SAMPLE_DIR / "html.eml.pdf") expected_archive_text = extract_text(html_email_pdf_file)
# Archive includes the HTML content, so use in # Archive includes the HTML content, so use in
self.assertIn(expected_archive_text, archive_text) assert expected_archive_text in archive_text
# Check the thumbnail # Check the thumbnail
generated_thumbnail = self.parser.get_thumbnail( generated_thumbnail = mail_parser.get_thumbnail(
self.SAMPLE_DIR / "html.eml", html_email_file,
"message/rfc822", "message/rfc822",
) )
generated_thumbnail_hash = self.imagehash(generated_thumbnail) generated_thumbnail_hash = self.imagehash(generated_thumbnail)
# The created pdf is not reproducible. But the converted image should always look the same. # The created pdf is not reproducible. But the converted image should always look the same.
expected_hash = self.imagehash(self.SAMPLE_DIR / "html.eml.pdf.webp") expected_hash = self.imagehash(html_email_thumbnail_file)
self.assertEqual( assert (
generated_thumbnail_hash, generated_thumbnail_hash == expected_hash
expected_hash, ), f"PDF looks different. Check if {generated_thumbnail} looks weird."
f"PDF looks different. Check if {generated_thumbnail} looks weird.",
)

View File

@ -1,4 +1,4 @@
import os from pathlib import Path
from django.conf import settings from django.conf import settings
from PIL import Image from PIL import Image
@ -15,7 +15,7 @@ class TextDocumentParser(DocumentParser):
logging_name = "paperless.parsing.text" logging_name = "paperless.parsing.text"
def get_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
text = self.read_file_handle_unicode_errors(document_path) text = self.read_file_handle_unicode_errors(document_path)
img = Image.new("RGB", (500, 700), color="white") img = Image.new("RGB", (500, 700), color="white")
@ -27,7 +27,7 @@ class TextDocumentParser(DocumentParser):
) )
draw.text((5, 5), text, font=font, fill="black") draw.text((5, 5), text, font=font, fill="black")
out_path = os.path.join(self.tempdir, "thumb.webp") out_path = self.tempdir / "thumb.webp"
img.save(out_path, format="WEBP") img.save(out_path, format="WEBP")
return out_path return out_path

View File

@ -0,0 +1,30 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_text.parsers import TextDocumentParser
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """
    Absolute path of the "samples" directory located next to this file.
    """
    here = Path(__file__).parent
    return here.joinpath("samples").resolve()
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
    """
    Yield a fresh TextDocumentParser and call its cleanup() once the test
    using it has finished, whether the test passed or failed.
    """
    # Construct outside the try block: if the constructor raises there is
    # nothing to clean up, and touching the still-unbound name in `finally`
    # would replace the real error with an UnboundLocalError.
    parser = TextDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_txt_file(sample_dir: Path) -> Path:
    """
    Path of the "test.txt" sample inside the samples directory.
    """
    return sample_dir.joinpath("test.txt")
@pytest.fixture(scope="session")
def malformed_txt_file(sample_dir: Path) -> Path:
    """
    Path of the "decode_error.txt" sample inside the samples directory.
    """
    return sample_dir.joinpath("decode_error.txt")

View File

@ -1,37 +1,26 @@
from pathlib import Path from pathlib import Path
from django.test import TestCase
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_text.parsers import TextDocumentParser from paperless_text.parsers import TextDocumentParser
class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): class TestTextParser:
SAMPLE_DIR = Path(__file__).resolve().parent / "samples" def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
def test_thumbnail(self):
parser = TextDocumentParser(None)
# just make sure that it does not crash # just make sure that it does not crash
f = parser.get_thumbnail( f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
self.SAMPLE_DIR / "test.txt", assert f.exists()
"text/plain", assert f.is_file()
)
self.assertIsFile(f)
def test_parse(self): def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
parser = TextDocumentParser(None) text_parser.parse(sample_txt_file, "text/plain")
parser.parse( assert text_parser.get_text() == "This is a test file.\n"
self.SAMPLE_DIR / "test.txt", assert text_parser.get_archive_path() is None
"text/plain",
)
self.assertEqual(parser.get_text(), "This is a test file.\n") def test_parse_invalid_bytes(
self.assertIsNone(parser.get_archive_path()) self,
text_parser: TextDocumentParser,
def test_parse_invalid_bytes(self): malformed_txt_file: Path,
):
""" """
GIVEN: GIVEN:
- Text file which contains invalid UTF bytes - Text file which contains invalid UTF bytes
@ -41,12 +30,8 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Parsing continues - Parsing continues
- Invalid bytes are removed - Invalid bytes are removed
""" """
parser = TextDocumentParser(None)
parser.parse( text_parser.parse(malformed_txt_file, "text/plain")
self.SAMPLE_DIR / "decode_error.txt",
"text/plain",
)
self.assertEqual(parser.get_text(), "Pantothens<EFBFBD>ure\n") assert text_parser.get_text() == "Pantothens<EFBFBD>ure\n"
self.assertIsNone(parser.get_archive_path()) assert text_parser.get_archive_path() is None

View File

@ -0,0 +1,40 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_tika.parsers import TikaDocumentParser
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """
    Yield a fresh TikaDocumentParser and call its cleanup() once the test
    using it has finished, whether the test passed or failed.
    """
    # Construct outside the try block: if the constructor raises there is
    # nothing to clean up, and touching the still-unbound name in `finally`
    # would replace the real error with an UnboundLocalError.
    parser = TikaDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """
    Absolute path of the "samples" directory located next to this file.
    """
    here = Path(__file__).parent
    return here.joinpath("samples").resolve()
@pytest.fixture(scope="session")
def sample_odt_file(sample_dir: Path) -> Path:
    """
    Path of the "sample.odt" document inside the samples directory.
    """
    return sample_dir.joinpath("sample.odt")
@pytest.fixture(scope="session")
def sample_docx_file(sample_dir: Path) -> Path:
    """
    Path of the "sample.docx" document inside the samples directory.
    """
    return sample_dir.joinpath("sample.docx")
@pytest.fixture(scope="session")
def sample_doc_file(sample_dir: Path) -> Path:
    """
    Path of the "sample.doc" document inside the samples directory.
    """
    return sample_dir.joinpath("sample.doc")
@pytest.fixture(scope="session")
def sample_broken_odt(sample_dir: Path) -> Path:
    """
    Path of the "multi-part-broken.odt" document inside the samples directory.
    """
    return sample_dir.joinpath("multi-part-broken.odt")

View File

@ -1,9 +1,7 @@
import os import os
from pathlib import Path from pathlib import Path
from typing import Final
import pytest import pytest
from django.test import TestCase
from documents.tests.utils import util_call_with_backoff from documents.tests.utils import util_call_with_backoff
from paperless_tika.parsers import TikaDocumentParser from paperless_tika.parsers import TikaDocumentParser
@ -13,22 +11,19 @@ from paperless_tika.parsers import TikaDocumentParser
"PAPERLESS_CI_TEST" not in os.environ, "PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with", reason="No Gotenberg/Tika servers to test with",
) )
class TestTikaParserAgainstServer(TestCase): @pytest.mark.django_db()
class TestTikaParserAgainstServer:
""" """
This test case tests the Tika parsing against a live tika server, This test case tests the Tika parsing against a live tika server,
if the environment contains the correct value indicating such a server if the environment contains the correct value indicating such a server
is available. is available.
""" """
SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve() def test_basic_parse_odt(
self,
def setUp(self) -> None: tika_parser: TikaDocumentParser,
self.parser = TikaDocumentParser(logging_group=None) sample_odt_file: Path,
):
def tearDown(self) -> None:
self.parser.cleanup()
def test_basic_parse_odt(self):
""" """
GIVEN: GIVEN:
- An input ODT format document - An input ODT format document
@ -38,26 +33,26 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct - Document content is correct
- Document date is correct - Document date is correct
""" """
test_file = self.SAMPLE_DIR / Path("sample.odt")
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[test_file, "application/vnd.oasis.opendocument.text"], [sample_odt_file, "application/vnd.oasis.opendocument.text"],
) )
self.assertEqual( assert (
self.parser.text, tika_parser.text
"This is an ODT test document, created September 14, 2022", == "This is an ODT test document, created September 14, 2022"
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
# PDFs begin with the bytes PDF-x.y
self.assertTrue(b"PDF-" in f.read()[:10])
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(self): def test_basic_parse_docx(
self,
tika_parser: TikaDocumentParser,
sample_docx_file: Path,
):
""" """
GIVEN: GIVEN:
- An input DOCX format document - An input DOCX format document
@ -67,27 +62,29 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct - Document content is correct
- Document date is correct - Document date is correct
""" """
test_file = self.SAMPLE_DIR / Path("sample.docx")
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[ [
test_file, sample_docx_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
], ],
) )
self.assertEqual( assert (
self.parser.text, tika_parser.text
"This is an DOCX test document, also made September 14, 2022", == "This is an DOCX test document, also made September 14, 2022"
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: with open(tika_parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10]) assert b"PDF-" in f.read()[:10]
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_doc(self): def test_basic_parse_doc(
self,
tika_parser: TikaDocumentParser,
sample_doc_file: Path,
):
""" """
GIVEN: GIVEN:
- An input DOC format document - An input DOC format document
@ -97,22 +94,24 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct - Document content is correct
- Document date is correct - Document date is correct
""" """
test_file = self.SAMPLE_DIR / "sample.doc"
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[test_file, "application/msword"], [sample_doc_file, "application/msword"],
) )
self.assertIn( assert (
"his is a test document, saved in the older .doc format", "This is a test document, saved in the older .doc format"
self.parser.text, in tika_parser.text
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: with open(tika_parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10]) assert b"PDF-" in f.read()[:10]
def test_tika_fails_multi_part(self): def test_tika_fails_multi_part(
self,
tika_parser: TikaDocumentParser,
sample_broken_odt: Path,
):
""" """
GIVEN: GIVEN:
- An input ODT format document - An input ODT format document
@ -125,13 +124,11 @@ class TestTikaParserAgainstServer(TestCase):
See also: See also:
- https://issues.apache.org/jira/browse/TIKA-4110 - https://issues.apache.org/jira/browse/TIKA-4110
""" """
test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[test_file, "application/vnd.oasis.opendocument.text"], [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: with open(tika_parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10]) assert b"PDF-" in f.read()[:10]

View File

@ -1,30 +1,30 @@
import datetime import datetime
import os
import zoneinfo import zoneinfo
from http import HTTPStatus
from pathlib import Path from pathlib import Path
from django.test import TestCase import pytest
from django.test import override_settings
from httpx import codes from httpx import codes
from httpx._multipart import DataField from httpx._multipart import DataField
from rest_framework import status from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock
from documents.parsers import ParseError from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser from paperless_tika.parsers import TikaDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class TestTikaParser(HttpxMockMixin, TestCase): @pytest.mark.django_db()
def setUp(self) -> None: class TestTikaParser:
self.parser = TikaDocumentParser(logging_group=None) def test_parse(
self,
def tearDown(self) -> None: httpx_mock: HTTPXMock,
self.parser.cleanup() settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
@override_settings(TIME_ZONE="America/Chicago") sample_odt_file: Path,
def test_parse(self): ):
settings.TIME_ZONE = "America/Chicago"
# Pretend parse response # Pretend parse response
self.httpx_mock.add_response( httpx_mock.add_response(
json={ json={
"Content-Type": "application/vnd.oasis.opendocument.text", "Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [], "X-TIKA:Parsed-By": [],
@ -33,30 +33,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
}, },
) )
# Pretend convert to PDF response # Pretend convert to PDF response
self.httpx_mock.add_response(content=b"PDF document") httpx_mock.add_response(content=b"PDF document")
file = Path(os.path.join(self.parser.tempdir, "input.odt")) tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
file.touch()
self.parser.parse(file, "application/vnd.oasis.opendocument.text") assert tika_parser.text == "the content"
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert f.read() == b"PDF document"
self.assertEqual(self.parser.text, "the content") assert tika_parser.date == datetime.datetime(
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertEqual(f.read(), b"PDF document")
self.assertEqual(
self.parser.date,
datetime.datetime(
2020, 2020,
11, 11,
21, 21,
tzinfo=zoneinfo.ZoneInfo("America/Chicago"), tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
),
) )
def test_metadata(self): def test_metadata(
self.httpx_mock.add_response( self,
httpx_mock: HTTPXMock,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
httpx_mock.add_response(
json={ json={
"Content-Type": "application/vnd.oasis.opendocument.text", "Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [], "X-TIKA:Parsed-By": [],
@ -65,18 +64,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
}, },
) )
file = Path(os.path.join(self.parser.tempdir, "input.odt")) metadata = tika_parser.extract_metadata(
file.touch() sample_odt_file,
metadata = self.parser.extract_metadata(
file,
"application/vnd.oasis.opendocument.text", "application/vnd.oasis.opendocument.text",
) )
self.assertTrue("dcterms:created" in [m["key"] for m in metadata]) assert "dcterms:created" in [m["key"] for m in metadata]
self.assertTrue("Some-key" in [m["key"] for m in metadata]) assert "Some-key" in [m["key"] for m in metadata]
def test_convert_failure(self): def test_convert_failure(
self,
httpx_mock: HTTPXMock,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
""" """
GIVEN: GIVEN:
- Document needs to be converted to PDF - Document needs to be converted to PDF
@ -86,15 +87,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
- Parse error is raised - Parse error is raised
""" """
# Pretend convert to PDF response # Pretend convert to PDF response
self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
file = Path(os.path.join(self.parser.tempdir, "input.odt")) with pytest.raises(ParseError):
file.touch() tika_parser.convert_to_pdf(sample_odt_file, None)
with self.assertRaises(ParseError): @pytest.mark.parametrize(
self.parser.convert_to_pdf(file, None) ("setting_value", "expected_form_value"),
[
def test_request_pdf_a_format(self): ("pdfa", "PDF/A-2b"),
("pdfa-1", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"),
("pdfa-3", "PDF/A-3b"),
],
)
def test_request_pdf_a_format(
self,
setting_value: str,
expected_form_value: str,
httpx_mock: HTTPXMock,
settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
""" """
GIVEN: GIVEN:
- Document needs to be converted to PDF - Document needs to be converted to PDF
@ -103,31 +118,21 @@ class TestTikaParser(HttpxMockMixin, TestCase):
THEN: THEN:
- Request to Gotenberg contains the expected PDF/A format string - Request to Gotenberg contains the expected PDF/A format string
""" """
file = Path(os.path.join(self.parser.tempdir, "input.odt")) settings.OCR_OUTPUT_TYPE = setting_value
file.touch() httpx_mock.add_response(
for setting, expected_key in [
("pdfa", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"),
("pdfa-1", "PDF/A-2b"),
("pdfa-3", "PDF/A-3b"),
]:
with override_settings(OCR_OUTPUT_TYPE=setting):
self.httpx_mock.add_response(
status_code=codes.OK, status_code=codes.OK,
content=b"PDF document", content=b"PDF document",
method="POST", method="POST",
) )
self.parser.convert_to_pdf(file, None) tika_parser.convert_to_pdf(sample_odt_file, None)
request = self.httpx_mock.get_request() request = httpx_mock.get_request()
found = False found = False
for field in request.stream.fields: for field in request.stream.fields:
if isinstance(field, DataField) and field.name == "pdfa": if isinstance(field, DataField) and field.name == "pdfa":
self.assertEqual(field.value, expected_key) assert field.value == expected_form_value
found = True found = True
break assert found, "pdfFormat was not found"
self.assertTrue(found)
self.httpx_mock.reset(assert_all_responses_were_requested=False) httpx_mock.reset(assert_all_responses_were_requested=False)

View File

@ -1,11 +0,0 @@
import pytest
from pytest_httpx import HTTPXMock
class HttpxMockMixin:
    """
    Mixin that exposes the pytest-httpx mock as ``self.httpx_mock``.
    """

    @pytest.fixture(autouse=True)
    def httpx_mock_auto(self, httpx_mock: HTTPXMock) -> None:
        """
        Workaround for allowing use of a fixture with unittest style testing

        Being autouse, the fixture runs without being requested and stores
        the injected mock on the instance.
        """
        self.httpx_mock = httpx_mock