mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Improved the regular expression to match only (Unicode) letters in month names, and tried each regex match in turn until one parsed to a valid date
This commit is contained in:
parent
1737e27b34
commit
96c7222269
@ -200,6 +200,9 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
def get_date(self):
|
def get_date(self):
|
||||||
|
date = None
|
||||||
|
datestring = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text = self.get_text()
|
text = self.get_text()
|
||||||
except ParseError as e:
|
except ParseError as e:
|
||||||
@ -213,23 +216,32 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||||
m = re.search(
|
pattern = re.compile(
|
||||||
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
||||||
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||||
r'\b([\w]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
|
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
|
||||||
r'\b([\w]{3,9} [0-9]{4})\b', text)
|
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
|
||||||
|
|
||||||
if m is None:
|
# Iterate through all regex matches and try to parse the date
|
||||||
return None
|
for m in re.finditer(pattern, text):
|
||||||
|
datestring = m.group(0)
|
||||||
|
|
||||||
date = dateparser.parse(m.group(0),
|
try:
|
||||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
date = dateparser.parse(
|
||||||
'PREFER_DAY_OF_MONTH': 'first',
|
datestring,
|
||||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||||
|
'PREFER_DAY_OF_MONTH': 'first',
|
||||||
|
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||||
|
except TypeError:
|
||||||
|
# Skip all matches that do not parse to a proper date
|
||||||
|
continue
|
||||||
|
|
||||||
|
if date is not None:
|
||||||
|
break
|
||||||
|
|
||||||
if date is not None:
|
if date is not None:
|
||||||
self.log("info", "Detected document date " + date.strftime("%x") +
|
self.log("info", "Detected document date " + date.strftime("%x") +
|
||||||
" based on string " + m.group(0))
|
" based on string " + datestring)
|
||||||
else:
|
else:
|
||||||
self.log("info", "Unable to detect date for document")
|
self.log("info", "Unable to detect date for document")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user