mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Add support for more date formats
This commit is contained in:
parent
6ad3d45d60
commit
157240351f
@ -32,16 +32,18 @@ from documents.utils import copy_file_with_basic_stats
|
|||||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||||
# - MONTH XX, ZZZZ with XX being 1 or 2 digits and ZZZZ being 4 digits
|
# - MONTH XX, ZZZZ with XX being 1 or 2 digits and ZZZZ being 4 digits
|
||||||
# - XX MON ZZZZ with XX being 1 or 2 digits and ZZZZ being 4 digits. MON is 3 letters
|
# - XX MON ZZZZ with XX being 1 or 2 digits and ZZZZ being 4 digits. MON is 3 letters
|
||||||
|
# - XXPP MONTH ZZZZ with XX being 1 or 2 digits, PP being 2 letters (e.g. st, nd, rd, th) and ZZZZ being 4 digits
|
||||||
|
|
||||||
# TODO: Isn't there a date parsing library for this?
|
# TODO: Isn't there a date parsing library for this?
|
||||||
|
|
||||||
DATE_REGEX = re.compile(
|
DATE_REGEX = re.compile(
|
||||||
r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|" # noqa: E501
|
r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|" # noqa: E501
|
||||||
r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|" # noqa: E501
|
r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|" # noqa: E501
|
||||||
r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501
|
r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[a-zA-Z]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501
|
||||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
|
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
|
||||||
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|"
|
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|"
|
||||||
r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][A-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))", # noqa: E501
|
r"(\b|(?!=([_-])))([0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4})(\b|(?=([_-])))|" # noqa: E501
|
||||||
|
r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][a-zA-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))", # noqa: E501
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -288,6 +290,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
|||||||
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
|
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
|
||||||
for m in re.finditer(DATE_REGEX, content):
|
for m in re.finditer(DATE_REGEX, content):
|
||||||
date = __process_match(m, date_order)
|
date = __process_match(m, date_order)
|
||||||
|
print(date)
|
||||||
if date is not None:
|
if date is not None:
|
||||||
yield date
|
yield date
|
||||||
|
|
||||||
|
@ -152,6 +152,55 @@ class TestDate(TestCase):
|
|||||||
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
|
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
|
||||||
self.assertIsNone(parse_date("", text), None)
|
self.assertIsNone(parse_date("", text), None)
|
||||||
|
|
||||||
|
def test_date_format_19(self):
|
||||||
|
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
|
||||||
|
self.assertEqual(
|
||||||
|
parse_date("", text),
|
||||||
|
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_date_format_20(self):
|
||||||
|
text = "Customer Number Currency 22nd MAR 2022 Credit Card 1934829304"
|
||||||
|
self.assertEqual(
|
||||||
|
parse_date("", text),
|
||||||
|
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_date_format_21(self):
|
||||||
|
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
|
||||||
|
self.assertEqual(
|
||||||
|
parse_date("", text),
|
||||||
|
datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_date_format_22(self):
|
||||||
|
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
|
||||||
|
self.assertEqual(
|
||||||
|
parse_date("", text),
|
||||||
|
datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_date_format_23(self):
|
||||||
|
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
|
||||||
|
self.assertEqual(
|
||||||
|
parse_date("", text),
|
||||||
|
datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_date_format_24(self):
|
||||||
|
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
|
||||||
|
self.assertEqual(
|
||||||
|
parse_date("", text),
|
||||||
|
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_date_format_25(self):
|
||||||
|
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
|
||||||
|
self.assertEqual(
|
||||||
|
parse_date("", text),
|
||||||
|
datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
|
||||||
|
)
|
||||||
|
|
||||||
def test_crazy_date_past(self, *args):
|
def test_crazy_date_past(self, *args):
|
||||||
self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
|
self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user