Tweak the date guesser to reject dates from the year 1900 or earlier (#414)

This commit is contained in:
Daniel Quinn 2018-10-01 20:03:27 +01:00
parent a511d34d69
commit 8010d72f18
2 changed files with 31 additions and 7 deletions

View File

@ -203,6 +203,7 @@ class RasterisedDocumentParser(DocumentParser):
return text
def get_date(self):
date = None
datestring = None
@ -217,20 +218,30 @@ class RasterisedDocumentParser(DocumentParser):
try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
datestring,
settings={
"DATE_ORDER": self.DATE_ORDER,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True
}
)
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None:
if date is not None and date.year > 1900:
break
else:
date = None
if date is not None:
self.log("info", "Detected document date " + date.isoformat() +
" based on string " + datestring)
self.log(
"info",
"Detected document date {} based on string {}".format(
date.isoformat(),
datestring
)
)
else:
self.log("info", "Unable to detect date for document")

View File

@ -384,3 +384,16 @@ class TestDate(TestCase):
document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
    return_value="01-07-0590 00:00:00",
)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SCRATCH,
)
def test_crazy_date(self, *args):
    # The patched get_text() yields a timestamp in the year 590. The test
    # expects get_date() to report no date at all for such a string rather
    # than returning the implausibly old value.
    parser = RasterisedDocumentParser("/dev/null")
    parser.get_text()

    detected = parser.get_date()

    self.assertIsNone(detected)