Handle dateparser ValueErrors

When parsing dates from the document text or filenames, correctly handle
ValueErrors, which indicate malformed dates. Newly added tests ensure that this
handling works properly.
This commit is contained in:
Johannes Wienke 2020-03-08 18:35:28 +01:00
parent a3aab0cb48
commit a311cd498c
2 changed files with 28 additions and 2 deletions

View File

@ -108,7 +108,7 @@ class DocumentParser:
try:
date = __parser(date_string, self.FILENAME_DATE_ORDER)
except TypeError:
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
@ -134,7 +134,7 @@ class DocumentParser:
try:
date = __parser(date_string, self.DATE_ORDER)
except TypeError:
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue

View File

@ -172,3 +172,29 @@ class TestDate(TestCase):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
    return_value="20 408000l 2475",
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_crazy_date_with_spaces(self, *args):
    """Check that space-separated garbage text does not produce a date.

    ``get_text`` is patched to return ``"20 408000l 2475"``; ``get_date``
    is expected to return ``None`` for it.
    """
    parser = RasterisedDocumentParser("/dev/null")
    # Call the patched get_text() first, as the neighbouring tests do.
    parser.get_text()
    extracted = parser.get_date()
    self.assertIsNone(extracted)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
    return_value="No date in here",
)
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.FILENAME_DATE_ORDER",
    new_callable=mock.PropertyMock,
    return_value="YMD",
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_filename_date_parse_invalid(self, *args):
    """Check that an unparsable date-like filename does not produce a date.

    ``FILENAME_DATE_ORDER`` is patched to ``"YMD"`` and the document text
    is patched to ``"No date in here"``; ``get_date`` is expected to
    return ``None`` for the path used below.
    """
    source_path = "/tmp/20 408000l 2475 - test.pdf"
    parser = RasterisedDocumentParser(source_path)
    # Call the patched get_text() first, as the neighbouring tests do.
    parser.get_text()
    extracted = parser.get_date()
    self.assertIsNone(extracted)