Merge pull request #302 from BastianPoe/bugfix/extend_regex_to_find_more_dates

Extends the regex to find dates in documents as reported by @isaacsando
This commit is contained in:
Daniel Quinn 2018-02-18 17:23:49 +01:00 committed by GitHub
commit 5d01410dc0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 153 additions and 11 deletions

View File

@ -202,7 +202,13 @@ class RasterisedDocumentParser(DocumentParser):
return text
def get_date(self):
text = self.get_text()
date = None
datestring = None
try:
text = self.get_text()
except ParseError as e:
return None
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
@ -210,19 +216,38 @@ class RasterisedDocumentParser(DocumentParser):
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ
m = re.search(
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
pattern = re.compile(
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^ ]{3,9} [0-9]{4})\b', text)
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
if m is None:
return None
# Iterate through all regex matches and try to parse the date
for m in re.finditer(pattern, text):
datestring = m.group(0)
return dateparser.parse(m.group(0),
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
try:
date = dateparser.parse(
datestring,
settings={'DATE_ORDER': self.DATE_ORDER,
'PREFER_DAY_OF_MONTH': 'first',
'RETURN_AS_TIMEZONE_AWARE': True})
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None:
break
if date is not None:
self.log("info", "Detected document date " + date.strftime("%x") +
" based on string " + datestring)
else:
self.log("info", "Unable to detect date for document")
return date
def run_convert(*args):

Binary file not shown.

Binary file not shown.

View File

@ -25,6 +25,97 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_1(self):
    """A bare six-digit run ("130218") must not be detected as a date."""
    # The empty final component makes the joined path end in a separator;
    # no specific sample file is referenced by this test.
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    # Preset the text directly; presumably get_text() serves TEXT_CACHE
    # instead of extracting text from a file -- confirm against the parser.
    document.TEXT_CACHE = "lorem ipsum 130218 lorem ipsum"
    # assertIsNone checks identity and reports a clearer failure message
    # than assertEqual(..., None).
    self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_2(self):
    """A lone four-digit number ("2018") must not be detected as a date."""
    # The empty final component makes the joined path end in a separator;
    # no specific sample file is referenced by this test.
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    # Preset the text directly; presumably get_text() serves TEXT_CACHE
    # instead of extracting text from a file -- confirm against the parser.
    document.TEXT_CACHE = "lorem ipsum 2018 lorem ipsum"
    # assertIsNone checks identity and reports a clearer failure message
    # than assertEqual(..., None).
    self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_3(self):
    """An eight-digit run ("20180213") must not be detected as a date."""
    # The empty final component makes the joined path end in a separator;
    # no specific sample file is referenced by this test.
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    # Preset the text directly; presumably get_text() serves TEXT_CACHE
    # instead of extracting text from a file -- confirm against the parser.
    document.TEXT_CACHE = "lorem ipsum 20180213 lorem ipsum"
    # assertIsNone checks identity and reports a clearer failure message
    # than assertEqual(..., None).
    self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_4(self):
    """A dotted date ("13.02.2018") is expected to yield 2018-02-13 UTC."""
    sample_dir = os.path.join(self.SAMPLE_FILES, "")
    parser = RasterisedDocumentParser(sample_dir)
    # Text is preset on the instance rather than read from a sample file.
    parser.TEXT_CACHE = "lorem ipsum 13.02.2018 lorem ipsum"
    detected = parser.get_date()
    expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
    self.assertEqual(detected, expected)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_5(self):
    """Among several digit runs, only the dotted date should be returned.

    The text mixes "130218", "2018" and "20180213" with "13.02.2018";
    the expected result is 2018-02-13 UTC.
    """
    sample_dir = os.path.join(self.SAMPLE_FILES, "")
    parser = RasterisedDocumentParser(sample_dir)
    # Text is preset on the instance rather than read from a sample file.
    parser.TEXT_CACHE = ("lorem ipsum 130218, 2018, 20180213 and "
                         "13.02.2018 lorem ipsum")
    detected = parser.get_date()
    expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
    self.assertEqual(detected, expected)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_6(self):
    """Digit groups near Wohnort/IBAN/BIC labels must not yield a date.

    The text contains only four-digit groups and an IBAN-like fragment on
    separate lines; no date is expected to be found.
    """
    # The empty final component makes the joined path end in a separator;
    # no specific sample file is referenced by this test.
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    # Preset the text directly; presumably get_text() serves TEXT_CACHE
    # instead of extracting text from a file -- confirm against the parser.
    document.TEXT_CACHE = ("lorem ipsum\n"
                           "Wohnort\n"
                           "3100\n"
                           "IBAN\n"
                           "AT87 4534\n"
                           "1234\n"
                           "1234 5678\n"
                           "BIC\n"
                           "lorem ipsum")
    # assertIsNone checks identity and reports a clearer failure message
    # than assertEqual(..., None).
    self.assertIsNone(document.get_date())
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_date_format_7(self):
    """A month name plus year ("März 2019") should give 2019-03-01 UTC."""
    sample_dir = os.path.join(self.SAMPLE_FILES, "")
    parser = RasterisedDocumentParser(sample_dir)
    # Text is preset on the instance rather than read from a sample file.
    parser.TEXT_CACHE = ("lorem ipsum\n"
                         "März 2019\n"
                         "lorem ipsum")
    detected = parser.get_date()
    # No day appears in the text; the test expects the first of the month.
    expected = datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc())
    self.assertEqual(detected, expected)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_1_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
document = RasterisedDocumentParser(input_file)
@ -213,3 +304,29 @@ class TestDate(TestCase):
self.assertEqual(document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.tzutc()))
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_8_pdf(self):
    """Check _is_ocred() and the detected date for tests_date_8.pdf."""
    pdf_path = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
    parser = RasterisedDocumentParser(pdf_path)
    # Text extraction is triggered explicitly before the assertions run.
    parser.get_text()
    ocr_flag = parser._is_ocred()
    self.assertEqual(ocr_flag, True)
    detected = parser.get_date()
    expected = datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
    self.assertEqual(detected, expected)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
def test_get_text_9_pdf(self):
    """Check _is_ocred() and the detected date for tests_date_9.pdf."""
    pdf_path = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
    parser = RasterisedDocumentParser(pdf_path)
    # Text extraction is triggered explicitly before the assertions run.
    parser.get_text()
    ocr_flag = parser._is_ocred()
    self.assertEqual(ocr_flag, True)
    detected = parser.get_date()
    expected = datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
    self.assertEqual(detected, expected)