From 96c72222693d584f1f5cecd56efa62ef6faddce8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wolf-Bastian=20P=C3=B6ttner?= Date: Wed, 14 Feb 2018 21:33:19 +0100 Subject: [PATCH] Improved regular expression to only match for (unicode) characters in month names + parsed one regex match after another until one gave a parsable date --- src/paperless_tesseract/parsers.py | 32 ++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b5f012bdc..f50bc2798 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -200,6 +200,9 @@ class RasterisedDocumentParser(DocumentParser): return text def get_date(self): + date = None + datestring = None + try: text = self.get_text() except ParseError as e: @@ -213,23 +216,32 @@ class RasterisedDocumentParser(DocumentParser): # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits - m = re.search( + pattern = re.compile( r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' + r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' + - r'\b([\w]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + - r'\b([\w]{3,9} [0-9]{4})\b', text) + r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' + + r'\b([^\W\d_]{3,9} [0-9]{4})\b') - if m is None: - return None + # Iterate through all regex matches and try to parse the date + for m in re.finditer(pattern, text): + datestring = m.group(0) - date = dateparser.parse(m.group(0), - settings={'DATE_ORDER': self.DATE_ORDER, - 'PREFER_DAY_OF_MONTH': 'first', - 'RETURN_AS_TIMEZONE_AWARE': True}) + try: + date = dateparser.parse( + datestring, + settings={'DATE_ORDER': self.DATE_ORDER, + 'PREFER_DAY_OF_MONTH': 'first', + 'RETURN_AS_TIMEZONE_AWARE': True}) + except TypeError: + # Skip all matches that do not parse to a proper date + continue + + if date is not None: + break if date is not None: self.log("info", "Detected document date " + date.strftime("%x") + - " based on string " + m.group(0)) + " based on string " + datestring) else: self.log("info", "Unable to detect date for document")