Improved the regular expression to match only (Unicode) letters in month names, and now try each regex match in turn until one yields a parsable date

This commit is contained in:
Wolf-Bastian Pöttner 2018-02-14 21:33:19 +01:00
parent 1737e27b34
commit 96c7222269

View File

@ -200,6 +200,9 @@ class RasterisedDocumentParser(DocumentParser):
return text
def get_date(self):
date = None
datestring = None
try:
text = self.get_text()
except ParseError as e:
@ -213,23 +216,32 @@ class RasterisedDocumentParser(DocumentParser):
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
m = re.search(
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([\w]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([\w]{3,9} [0-9]{4})\b', text)
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
if m is None:
return None
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
datestring = m.group(0)
date = dateparser.parse(m.group(0),
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None:
break
if date is not None:
self.log("info", "Detected document date " + date.strftime("%x") +
" based on string " + m.group(0))
" based on string " + datestring)
else:
self.log("info", "Unable to detect date for document")