From 1ecb26a3fb487131947ac1b246eed69e9d9fadb9 Mon Sep 17 00:00:00 2001 From: fantasticle <84633558+fantasticle@users.noreply.github.com> Date: Wed, 30 Mar 2022 12:19:30 +0200 Subject: [PATCH 1/7] Update regex date match patterns --- src/documents/parsers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index cb70f4fc6..ca24026fb 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -23,6 +23,7 @@ from documents.signals import document_consumer_declaration # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits +# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits, MONTH is 3 letters, e.g. 22-FEB-2022 # TODO: isnt there a date parsing library for this? @@ -31,7 +32,8 @@ DATE_REGEX = re.compile( r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|" # noqa: E501 r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501 r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))", + r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))([0-9]{1,2}[ \.\/-][A-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))|" ) From d8261b33592932d3c82070bbaac78d87d6c9790f Mon Sep 17 00:00:00 2001 From: fantasticle <84633558+fantasticle@users.noreply.github.com> Date: Wed, 30 Mar 2022 23:12:27 +0200 Subject: [PATCH 2/7] add test for new regex --- src/documents/tests/test_date_parsing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index f5987633f..e1c8e5d7b 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -92,6 +92,13 @@ class TestDate(TestCase): datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), ) + def test_date_format_9(self): + text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304" + self.assertEqual( + parse_date("", text), + datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + @override_settings(SCRATCH_DIR=SCRATCH) def test_date_format_9(self): text = "lorem ipsum\n" "27. Nullmonth 2020\n" "März 2020\n" "lorem ipsum" From 0baacbef9899f47d34e8d630fa7ea885e9978ac2 Mon Sep 17 00:00:00 2001 From: Fantasticle <84633558+fantasticle@users.noreply.github.com> Date: Thu, 31 Mar 2022 09:36:10 +0200 Subject: [PATCH 3/7] update new regex pattern for second boundary --- src/documents/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index ca24026fb..269fddbb4 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -23,7 +23,7 @@ from documents.signals import document_consumer_declaration # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits -# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits, MONTH is 3 letters, e.g. 22-FEB-2022 +# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters # TODO: isnt there a date parsing library for this? @@ -33,7 +33,7 @@ DATE_REGEX = re.compile( r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501 r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|" r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))([0-9]{1,2}[ \.\/-][A-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][A-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))", # noqa: E501 ) From 3cca77e74847a80e41ba6fce152bf9355d65bf8b Mon Sep 17 00:00:00 2001 From: Fantasticle <84633558+fantasticle@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:24:57 +0200 Subject: [PATCH 4/7] add more tests for regex date parser, remove duplicate name --- src/documents/tests/test_date_parsing.py | 59 +++++++++++++++++++++--- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index e1c8e5d7b..11e639d47 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -3,6 +3,7 @@ import os import shutil from uuid import uuid4 +import pytest from dateutil import tz from django.conf import settings from django.test import override_settings @@ -92,13 +93,6 @@ class TestDate(TestCase): datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), ) - def test_date_format_9(self): - text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304" - self.assertEqual( - parse_date("", text), - datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), - ) - @override_settings(SCRATCH_DIR=SCRATCH) def test_date_format_9(self): text = "lorem ipsum\n" "27. Nullmonth 2020\n" "März 2020\n" "lorem ipsum" @@ -107,6 +101,57 @@ class TestDate(TestCase): datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), ) + def test_date_format_10(self): + text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304" + self.assertEqual( + parse_date("", text), + datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + def test_date_format_11(self): + text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304" + self.assertEqual( + parse_date("", text), + datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + def test_date_format_12(self): + text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304" + self.assertEqual( + parse_date("", text), + datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + def test_date_format_13(self): + text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304" + self.assertEqual( + parse_date("", text), + datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + def test_date_format_14(self): + text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304" + self.assertEqual( + parse_date("", text), + datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + def test_date_format_15(self): + text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304" + self.assertIsNone(parse_date("", text), None) + + def test_date_format_16(self): + text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304" + self.assertIsNone(parse_date("", text), None) + + def test_date_format_17(self): + text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304" + self.assertIsNone(parse_date("", text), None) + + def test_date_format_18(self): + text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304" + self.assertIsNone(parse_date("", text), None) + def test_crazy_date_past(self, *args): self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) From 4754ac2bd194d6d5beab7adfef29ff5513d0fa0a Mon Sep 17 00:00:00 2001 From: Fantasticle <84633558+fantasticle@users.noreply.github.com> Date: Thu, 31 Mar 2022 21:25:58 +0200 Subject: [PATCH 5/7] remove unnecessary import --- src/documents/tests/test_date_parsing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index 11e639d47..06ad9876c 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -3,7 +3,6 @@ import os import shutil from uuid import uuid4 -import pytest from dateutil import tz from django.conf import settings from django.test import override_settings From db0a58ea0482ff764462841f8782595948afe33c Mon Sep 17 00:00:00 2001 From: Fantasticle <84633558+fantasticle@users.noreply.github.com> Date: Sat, 2 Apr 2022 15:52:32 +0200 Subject: [PATCH 6/7] fix link for post-consumption-example, 404d to jonas --- docs/advanced_usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced_usage.rst b/docs/advanced_usage.rst index 78e12736a..3a1393cc5 100644 --- a/docs/advanced_usage.rst +++ b/docs/advanced_usage.rst @@ -200,7 +200,7 @@ Troubleshooting: - Check your script's permission e.g. in case of permission error ``sudo chmod 755 post-consumption-example.sh`` - Pipe your scripts's output to a log file e.g. ``echo "${DOCUMENT_ID}" | tee --append /usr/src/paperless/scripts/post-consumption-example.log`` -.. _post-consumption-example.sh: https://github.com/jonaswinkler/paperless-ngx/blob/master/scripts/post-consumption-example.sh +.. _post-consumption-example.sh: https://github.com/paperless-ngx/paperless-ngx/blob/master/scripts/post-consumption-example.sh .. _advanced-file_name_handling: From 74422dd000e4a42b6dc4f77e8aa3b33c760f8276 Mon Sep 17 00:00:00 2001 From: Fantasticle <84633558+fantasticle@users.noreply.github.com> Date: Sat, 2 Apr 2022 18:19:11 +0200 Subject: [PATCH 7/7] update name from master to main --- docs/advanced_usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced_usage.rst b/docs/advanced_usage.rst index 3a1393cc5..6a339b8e1 100644 --- a/docs/advanced_usage.rst +++ b/docs/advanced_usage.rst @@ -200,7 +200,7 @@ Troubleshooting: - Check your script's permission e.g. in case of permission error ``sudo chmod 755 post-consumption-example.sh`` - Pipe your scripts's output to a log file e.g. ``echo "${DOCUMENT_ID}" | tee --append /usr/src/paperless/scripts/post-consumption-example.log`` -.. _post-consumption-example.sh: https://github.com/paperless-ngx/paperless-ngx/blob/master/scripts/post-consumption-example.sh +.. _post-consumption-example.sh: https://github.com/paperless-ngx/paperless-ngx/blob/main/scripts/post-consumption-example.sh .. _advanced-file_name_handling: