mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-23 12:58:18 -05:00
517 lines
15 KiB
Python
517 lines
15 KiB
Python
import datetime
|
|
from zoneinfo import ZoneInfo
|
|
|
|
from pytest_django.fixtures import SettingsWrapper
|
|
|
|
from documents.parsers import parse_date
|
|
from documents.parsers import parse_date_generator
|
|
|
|
|
|
class TestDate:
|
|
def test_date_format_1(self):
|
|
text = "lorem ipsum 130218 lorem ipsum"
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_2(self):
|
|
text = "lorem ipsum 2018 lorem ipsum"
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_3(self):
|
|
text = "lorem ipsum 20180213 lorem ipsum"
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_4(self, settings_timezone: ZoneInfo):
|
|
text = "lorem ipsum 13.02.2018 lorem ipsum"
|
|
date = parse_date("", text)
|
|
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
|
|
|
def test_date_format_5(self, settings_timezone: ZoneInfo):
|
|
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
|
|
date = parse_date("", text)
|
|
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
|
|
|
def test_date_format_6(self):
|
|
text = (
|
|
"lorem ipsum\n"
|
|
"Wohnort\n"
|
|
"3100\n"
|
|
"IBAN\n"
|
|
"AT87 4534\n"
|
|
"1234\n"
|
|
"1234 5678\n"
|
|
"BIC\n"
|
|
"lorem ipsum"
|
|
)
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_7(self, settings_timezone: ZoneInfo):
|
|
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
|
date = parse_date("", text)
|
|
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
|
|
|
def test_date_format_8(self, settings_timezone: ZoneInfo):
|
|
text = (
|
|
"lorem ipsum\n"
|
|
"Wohnort\n"
|
|
"3100\n"
|
|
"IBAN\n"
|
|
"AT87 4534\n"
|
|
"1234\n"
|
|
"1234 5678\n"
|
|
"BIC\n"
|
|
"lorem ipsum\n"
|
|
"März 2020"
|
|
)
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2020,
|
|
3,
|
|
1,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_9(self, settings_timezone: ZoneInfo):
|
|
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2020,
|
|
3,
|
|
1,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_10(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
22,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_11(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
22,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_12(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
22,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_13(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
22,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_14(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
22,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_15(self):
|
|
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_16(self):
|
|
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_17(self):
|
|
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_18(self):
|
|
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
|
|
assert parse_date("", text) is None
|
|
|
|
def test_date_format_19(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
21,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_20(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
22,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_21(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
2,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_22(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
23,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_23(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
24,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_24(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
21,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_25(self, settings_timezone: ZoneInfo):
|
|
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2022,
|
|
3,
|
|
25,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_date_format_26(self, settings_timezone: ZoneInfo):
|
|
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2019,
|
|
9,
|
|
25,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_crazy_date_past(self):
|
|
assert parse_date("", "01-07-0590 00:00:00") is None
|
|
|
|
def test_crazy_date_future(self):
|
|
assert parse_date("", "01-07-2350 00:00:00") is None
|
|
|
|
def test_crazy_date_with_spaces(self):
|
|
assert parse_date("", "20 408000l 2475") is None
|
|
|
|
def test_utf_month_names(self, settings_timezone: ZoneInfo):
|
|
assert parse_date("", "13 décembre 2023") == datetime.datetime(
|
|
2023,
|
|
12,
|
|
13,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "13 août 2022") == datetime.datetime(
|
|
2022,
|
|
8,
|
|
13,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "11 März 2020") == datetime.datetime(
|
|
2020,
|
|
3,
|
|
11,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
|
|
2018,
|
|
3,
|
|
17,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "1. veljače 2016.") == datetime.datetime(
|
|
2016,
|
|
2,
|
|
1,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "15. února 1985") == datetime.datetime(
|
|
1985,
|
|
2,
|
|
15,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "30. září 2011") == datetime.datetime(
|
|
2011,
|
|
9,
|
|
30,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "28. května 1990") == datetime.datetime(
|
|
1990,
|
|
5,
|
|
28,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "1. grudzień 1997") == datetime.datetime(
|
|
1997,
|
|
12,
|
|
1,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "17 Şubat 2024") == datetime.datetime(
|
|
2024,
|
|
2,
|
|
17,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
|
|
2012,
|
|
8,
|
|
30,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "17 Eylül 2000") == datetime.datetime(
|
|
2000,
|
|
9,
|
|
17,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
assert parse_date("", "5. október 1992") == datetime.datetime(
|
|
1992,
|
|
10,
|
|
5,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_multiple_dates(self, settings_timezone: ZoneInfo):
|
|
text = """This text has multiple dates.
|
|
For example 02.02.2018, 22 July 2022 and December 2021.
|
|
But not 24-12-9999 because it's in the future..."""
|
|
dates = list(parse_date_generator("", text))
|
|
|
|
assert dates == [
|
|
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
|
|
datetime.datetime(
|
|
2022,
|
|
7,
|
|
22,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
),
|
|
datetime.datetime(
|
|
2021,
|
|
12,
|
|
1,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
),
|
|
]
|
|
|
|
def test_filename_date_parse_valid_ymd(
|
|
self,
|
|
settings: SettingsWrapper,
|
|
settings_timezone: ZoneInfo,
|
|
):
|
|
"""
|
|
GIVEN:
|
|
- Date parsing from the filename is enabled
|
|
- Filename date format is with Year Month Day (YMD)
|
|
- Filename contains date matching the format
|
|
|
|
THEN:
|
|
- Should parse the date from the filename
|
|
"""
|
|
settings.FILENAME_DATE_ORDER = "YMD"
|
|
|
|
assert parse_date(
|
|
"/tmp/Scan-2022-04-01.pdf",
|
|
"No date in here",
|
|
) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
|
|
|
|
def test_filename_date_parse_valid_dmy(
|
|
self,
|
|
settings: SettingsWrapper,
|
|
settings_timezone: ZoneInfo,
|
|
):
|
|
"""
|
|
GIVEN:
|
|
- Date parsing from the filename is enabled
|
|
- Filename date format is with Day Month Year (DMY)
|
|
- Filename contains date matching the format
|
|
|
|
THEN:
|
|
- Should parse the date from the filename
|
|
"""
|
|
settings.FILENAME_DATE_ORDER = "DMY"
|
|
assert parse_date(
|
|
"/tmp/Scan-10.01.2021.pdf",
|
|
"No date in here",
|
|
) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
|
|
|
|
def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
|
|
"""
|
|
GIVEN:
|
|
- Date parsing from the filename is enabled
|
|
- Filename includes no date
|
|
- File content includes no date
|
|
|
|
THEN:
|
|
- No date is parsed
|
|
"""
|
|
settings.FILENAME_DATE_ORDER = "YMD"
|
|
assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
|
|
|
|
def test_filename_date_ignored_use_content(
|
|
self,
|
|
settings: SettingsWrapper,
|
|
settings_timezone: ZoneInfo,
|
|
):
|
|
"""
|
|
GIVEN:
|
|
- Date parsing from the filename is enabled
|
|
- Filename date format is with Day Month Year (YMD)
|
|
- Date order is Day Month Year (DMY, the default)
|
|
- Filename contains date matching the format
|
|
- Filename date is an ignored date
|
|
- File content includes a date
|
|
|
|
THEN:
|
|
- Should parse the date from the content not filename
|
|
"""
|
|
settings.FILENAME_DATE_ORDER = "YMD"
|
|
settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
|
|
assert parse_date(
|
|
"/tmp/Scan-2022-04-01.pdf",
|
|
"The matching date is 24.03.2022",
|
|
) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
|
|
|
|
def test_ignored_dates_default_order(
|
|
self,
|
|
settings: SettingsWrapper,
|
|
settings_timezone: ZoneInfo,
|
|
):
|
|
"""
|
|
GIVEN:
|
|
- Ignore dates have been set
|
|
- File content includes ignored dates
|
|
- File content includes 1 non-ignored date
|
|
|
|
THEN:
|
|
- Should parse the date non-ignored date from content
|
|
"""
|
|
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
|
|
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2018,
|
|
2,
|
|
13,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|
|
|
|
def test_ignored_dates_order_ymd(
|
|
self,
|
|
settings: SettingsWrapper,
|
|
settings_timezone: ZoneInfo,
|
|
):
|
|
"""
|
|
GIVEN:
|
|
- Ignore dates have been set
|
|
- Date order is Year Month Date (YMD)
|
|
- File content includes ignored dates
|
|
- File content includes 1 non-ignored date
|
|
|
|
THEN:
|
|
- Should parse the date non-ignored date from content
|
|
"""
|
|
|
|
settings.FILENAME_DATE_ORDER = "YMD"
|
|
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
|
|
|
|
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
|
|
|
|
assert parse_date("", text) == datetime.datetime(
|
|
2018,
|
|
2,
|
|
13,
|
|
0,
|
|
0,
|
|
tzinfo=settings_timezone,
|
|
)
|