mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #302 from BastianPoe/bugfix/extend_regex_to_find_more_dates
Extends the regex to find dates in documents as reported by @isaacsando
This commit is contained in:
commit
5d01410dc0
@ -202,7 +202,13 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
return text
|
||||
|
||||
def get_date(self):
|
||||
text = self.get_text()
|
||||
date = None
|
||||
datestring = None
|
||||
|
||||
try:
|
||||
text = self.get_text()
|
||||
except ParseError as e:
|
||||
return None
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
@ -210,19 +216,38 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||
# - MONTH ZZZZ
|
||||
m = re.search(
|
||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||
pattern = re.compile(
|
||||
r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
|
||||
r'\b([0-9]{1,2}\. [^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^ ]{3,9} [0-9]{4})\b', text)
|
||||
r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
|
||||
r'\b([^\W\d_]{3,9} [0-9]{4})\b')
|
||||
|
||||
if m is None:
|
||||
return None
|
||||
# Iterate through all regex matches and try to parse the date
|
||||
for m in re.finditer(pattern, text):
|
||||
datestring = m.group(0)
|
||||
|
||||
return dateparser.parse(m.group(0),
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
try:
|
||||
date = dateparser.parse(
|
||||
datestring,
|
||||
settings={'DATE_ORDER': self.DATE_ORDER,
|
||||
'PREFER_DAY_OF_MONTH': 'first',
|
||||
'RETURN_AS_TIMEZONE_AWARE': True})
|
||||
except TypeError:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None:
|
||||
break
|
||||
|
||||
if date is not None:
|
||||
self.log("info", "Detected document date " + date.strftime("%x") +
|
||||
" based on string " + datestring)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
|
||||
|
||||
def run_convert(*args):
|
||||
|
BIN
src/paperless_tesseract/tests/samples/tests_date_8.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_8.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/tests_date_9.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/tests_date_9.pdf
Normal file
Binary file not shown.
@ -25,6 +25,97 @@ class TestDate(TestCase):
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SCRATCH
|
||||
)
|
||||
def test_date_format_1(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.TEXT_CACHE = "lorem ipsum 130218 lorem ipsum"
|
||||
self.assertEqual(document.get_date(),
|
||||
None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_2(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.TEXT_CACHE = "lorem ipsum 2018 lorem ipsum"
|
||||
self.assertEqual(document.get_date(),
|
||||
None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_3(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.TEXT_CACHE = "lorem ipsum 20180213 lorem ipsum"
|
||||
self.assertEqual(document.get_date(),
|
||||
None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_4(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.TEXT_CACHE = "lorem ipsum 13.02.2018 lorem ipsum"
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_5(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.TEXT_CACHE = ("lorem ipsum 130218, 2018, 20180213 and "
|
||||
"13.02.2018 lorem ipsum")
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_6(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.TEXT_CACHE = ("lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum")
|
||||
self.assertEqual(document.get_date(),
|
||||
None)
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_date_format_7(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.TEXT_CACHE = ("lorem ipsum\n"
|
||||
"März 2019\n"
|
||||
"lorem ipsum")
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2019, 3, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_get_text_1_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
@ -213,3 +304,29 @@ class TestDate(TestCase):
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2018, 4, 1, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_get_text_8_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2017, 12, 31, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
||||
@mock.patch(
|
||||
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
|
||||
SAMPLE_FILES
|
||||
)
|
||||
def test_get_text_9_pdf(self):
|
||||
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
|
||||
document = RasterisedDocumentParser(input_file)
|
||||
document.get_text()
|
||||
self.assertEqual(document._is_ocred(), True)
|
||||
self.assertEqual(document.get_date(),
|
||||
datetime.datetime(2017, 12, 31, 0, 0,
|
||||
tzinfo=tz.tzutc()))
|
||||
|
Loading…
x
Reference in New Issue
Block a user