From 56bd406c414ff2d4ecf73d4dc7198e67f077eff4 Mon Sep 17 00:00:00 2001 From: XstreamGit Date: Thu, 30 Jan 2025 23:59:27 +0100 Subject: [PATCH] Avoid matching 4 char years with regex meant for days given as ordinal numbers The original regex is probably meant to match dates like "1st Jan 2012", but it also matches within "127012025 21.01.2025", because the two characters following the leading one or two digits can be any non-space character, including digits. This change excludes digits there so that dates following a number are parsed correctly. --- src/documents/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 2d73dc63f..d840817e4 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -41,7 +41,7 @@ DATE_REGEX = re.compile( r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|" r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|" r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|" - r"(\b|(?!=([_-])))(\d{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|" + r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|" r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))", re.IGNORECASE, )