paperless-ngx/src/documents/tests/test_date_parsing.py
2023-07-30 08:34:33 -07:00

341 lines
12 KiB
Python

import datetime
import os
import shutil
from uuid import uuid4
from dateutil import tz
from django.conf import settings
from django.test import TestCase
from django.test import override_settings
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
class TestDate(TestCase):
    """Tests for ``parse_date`` and ``parse_date_generator`` from ``documents.parsers``."""

    SAMPLE_FILES = os.path.join(
        os.path.dirname(__file__),
        "../../paperless_tesseract/tests/samples",
    )
    # Scratch directory name is randomised once per class definition.
    SCRATCH = f"/tmp/paperless-tests-{str(uuid4())[:8]}"

    @staticmethod
    def _local_midnight(year: int, month: int, day: int) -> datetime.datetime:
        """
        Build the expected value for a parsed date: midnight on the given day,
        aware in ``settings.TIME_ZONE``.

        The setting is read at call time so that ``override_settings`` applied
        to an individual test is honoured.
        """
        return datetime.datetime(
            year,
            month,
            day,
            0,
            0,
            tzinfo=tz.gettz(settings.TIME_ZONE),
        )

    def setUp(self):
        """Create the scratch directory before each test."""
        os.makedirs(self.SCRATCH, exist_ok=True)

    def tearDown(self):
        """Remove the scratch directory after each test."""
        shutil.rmtree(self.SCRATCH)

    def test_date_format_1(self):
        """A bare six-digit number yields no date."""
        text = "lorem ipsum 130218 lorem ipsum"
        self.assertIsNone(parse_date("", text))

    def test_date_format_2(self):
        """A bare four-digit number yields no date."""
        text = "lorem ipsum 2018 lorem ipsum"
        self.assertIsNone(parse_date("", text))

    def test_date_format_3(self):
        """A bare eight-digit number yields no date."""
        text = "lorem ipsum 20180213 lorem ipsum"
        self.assertIsNone(parse_date("", text))

    def test_date_format_4(self):
        """A dotted DD.MM.YYYY date is parsed."""
        text = "lorem ipsum 13.02.2018 lorem ipsum"
        self.assertEqual(parse_date("", text), self._local_midnight(2018, 2, 13))

    def test_date_format_5(self):
        """Bare digit runs are skipped and the dotted date is returned."""
        text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
        self.assertEqual(parse_date("", text), self._local_midnight(2018, 2, 13))

    def test_date_format_6(self):
        """Postal-code and IBAN-like digit groups yield no date."""
        text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum"
        )
        self.assertIsNone(parse_date("", text))

    def test_date_format_7(self):
        """A German month name followed by a year parses to the 1st of that month."""
        text = "lorem ipsum\nMärz 2019\nlorem ipsum"
        self.assertEqual(parse_date("", text), self._local_midnight(2019, 3, 1))

    def test_date_format_8(self):
        """The month-and-year date is found after IBAN-like digit groups."""
        text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum\n"
            "März 2020"
        )
        self.assertEqual(parse_date("", text), self._local_midnight(2020, 3, 1))

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_9(self):
        """A candidate with an unknown month name is skipped in favour of the next date."""
        text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
        self.assertEqual(parse_date("", text), self._local_midnight(2020, 3, 1))

    def test_date_format_10(self):
        """DD-MON-YYYY with dashes is parsed."""
        text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 22))

    def test_date_format_11(self):
        """DD MON YYYY with spaces is parsed."""
        text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 22))

    def test_date_format_12(self):
        """DD/MON/YYYY with slashes is parsed."""
        text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 22))

    def test_date_format_13(self):
        """DD.MON.YYYY with dots is parsed."""
        text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 22))

    def test_date_format_14(self):
        """DD.MON YYYY with mixed dot and space separators is parsed."""
        text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 22))

    def test_date_format_15(self):
        """DD.MON.YY with a two-digit year yields no date."""
        text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
        self.assertIsNone(parse_date("", text))

    def test_date_format_16(self):
        """DD.MON,YY with a comma and two-digit year yields no date."""
        text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
        self.assertIsNone(parse_date("", text))

    def test_date_format_17(self):
        """DD,MON,YYYY with comma separators yields no date."""
        text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
        self.assertIsNone(parse_date("", text))

    def test_date_format_18(self):
        """DD MON,YYYY with a comma before the year yields no date."""
        text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
        self.assertIsNone(parse_date("", text))

    def test_date_format_19(self):
        """An ordinal day with the "st" suffix is parsed."""
        text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 21))

    def test_date_format_20(self):
        """A two-digit ordinal day with the "nd" suffix is parsed."""
        text = "Customer Number Currency 22nd MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 22))

    def test_date_format_21(self):
        """A single-digit ordinal day with the "nd" suffix is parsed."""
        text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 2))

    def test_date_format_22(self):
        """An ordinal day with the "rd" suffix is parsed."""
        text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 23))

    def test_date_format_23(self):
        """An ordinal day with the "th" suffix is parsed."""
        text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 24))

    def test_date_format_24(self):
        """DD-MON-YYYY with dashes is parsed for the 21st."""
        text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 21))

    def test_date_format_25(self):
        """An ordinal day with an upper-case "TH" suffix is parsed."""
        text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
        self.assertEqual(parse_date("", text), self._local_midnight(2022, 3, 25))

    def test_crazy_date_past(self, *args):
        """A date in the year 590 yields no date."""
        self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))

    def test_crazy_date_future(self, *args):
        """A date in the year 2350 yields no date."""
        self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))

    def test_crazy_date_with_spaces(self, *args):
        """Space-separated digit groups containing a stray letter yield no date."""
        self.assertIsNone(parse_date("", "20 408000l 2475"))

    def test_multiple_dates(self):
        """The generator yields every accepted date in order and omits the year-9999 one."""
        text = (
            "This text has multiple dates.\n"
            "For example 02.02.2018, 22 July 2022 and Dezember 2021.\n"
            "But not 24-12-9999 because its in the future..."
        )
        dates = list(parse_date_generator("", text))
        # Comparing the whole list checks both the count and each element.
        self.assertEqual(
            dates,
            [
                self._local_midnight(2018, 2, 2),
                self._local_midnight(2022, 7, 22),
                self._local_midnight(2021, 12, 1),
            ],
        )

    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_filename_date_parse_valid_ymd(self, *args):
        """
        GIVEN:
            - Date parsing from the filename is enabled
            - Filename date format is with Year Month Day (YMD)
            - Filename contains date matching the format
        THEN:
            - Should parse the date from the filename
        """
        self.assertEqual(
            parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"),
            self._local_midnight(2022, 4, 1),
        )

    @override_settings(FILENAME_DATE_ORDER="DMY")
    def test_filename_date_parse_valid_dmy(self, *args):
        """
        GIVEN:
            - Date parsing from the filename is enabled
            - Filename date format is with Day Month Year (DMY)
            - Filename contains date matching the format
        THEN:
            - Should parse the date from the filename
        """
        self.assertEqual(
            parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"),
            self._local_midnight(2021, 1, 10),
        )

    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_filename_date_parse_invalid(self, *args):
        """
        GIVEN:
            - Date parsing from the filename is enabled
            - Filename includes no date
            - File content includes no date
        THEN:
            - No date is parsed
        """
        self.assertIsNone(
            parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"),
        )

    @override_settings(
        FILENAME_DATE_ORDER="YMD",
        IGNORE_DATES=(datetime.date(2022, 4, 1),),
    )
    def test_filename_date_ignored_use_content(self, *args):
        """
        GIVEN:
            - Date parsing from the filename is enabled
            - Filename date format is with Year Month Day (YMD)
            - Date order is Day Month Year (DMY, the default)
            - Filename contains date matching the format
            - Filename date is an ignored date
            - File content includes a date
        THEN:
            - Should parse the date from the content, not the filename
        """
        self.assertEqual(
            parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"),
            self._local_midnight(2022, 3, 24),
        )

    @override_settings(
        IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
    )
    def test_ignored_dates_default_order(self, *args):
        """
        GIVEN:
            - Ignore dates have been set
            - File content includes ignored dates
            - File content includes 1 non-ignored date
        THEN:
            - Should parse the non-ignored date from the content
        """
        text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
        self.assertEqual(parse_date("", text), self._local_midnight(2018, 2, 13))

    @override_settings(
        IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
        DATE_ORDER="YMD",
    )
    def test_ignored_dates_order_ymd(self, *args):
        """
        GIVEN:
            - Ignore dates have been set
            - Date order is Year Month Day (YMD)
            - File content includes ignored dates
            - File content includes 1 non-ignored date
        THEN:
            - Should parse the non-ignored date from the content
        """
        text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
        self.assertEqual(parse_date("", text), self._local_midnight(2018, 2, 13))