From 6531a679401901471a6f1c4dad11578df1ce9d1c Mon Sep 17 00:00:00 2001 From: Johannes Wienke Date: Sun, 8 Mar 2020 18:26:29 +0100 Subject: [PATCH 1/2] Remove duplicated date parsing test The exact same test existed twice in the file. --- src/paperless_tesseract/tests/test_date.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index ac2f9648f..4f931737b 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -172,13 +172,3 @@ class TestDate(TestCase): document = RasterisedDocumentParser("/dev/null") document.get_text() self.assertIsNone(document.get_date()) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", - return_value="01-07-0590 00:00:00" - ) - @mock.patch(MOCK_SCRATCH, SCRATCH) - def test_crazy_date_past(self, *args): - document = RasterisedDocumentParser("/dev/null") - document.get_text() - self.assertIsNone(document.get_date()) From ebcfcea05b60401b4adde04ddc3cb77a693b2c92 Mon Sep 17 00:00:00 2001 From: Johannes Wienke Date: Sun, 8 Mar 2020 18:35:28 +0100 Subject: [PATCH 2/2] Handle dateparser ValueErrors When parsing dates from the document text or filenames, correctly handle ValueErrors indicating broken dates. Newly added tests ensure that this handling works properly. 
--- src/documents/parsers.py | 4 ++-- src/paperless_tesseract/tests/test_date.py | 26 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 142ebba68..c0a80a55d 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -108,7 +108,7 @@ class DocumentParser: try: date = __parser(date_string, self.FILENAME_DATE_ORDER) - except TypeError: + except (TypeError, ValueError): # Skip all matches that do not parse to a proper date continue @@ -134,7 +134,7 @@ class DocumentParser: try: date = __parser(date_string, self.DATE_ORDER) - except TypeError: + except (TypeError, ValueError): # Skip all matches that do not parse to a proper date continue diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index 4f931737b..9e9d48b90 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -172,3 +172,29 @@ class TestDate(TestCase): document = RasterisedDocumentParser("/dev/null") document.get_text() self.assertIsNone(document.get_date()) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", + return_value="20 408000l 2475" + ) + @mock.patch(MOCK_SCRATCH, SCRATCH) + def test_crazy_date_with_spaces(self, *args): + document = RasterisedDocumentParser("/dev/null") + document.get_text() + self.assertIsNone(document.get_date()) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", + return_value="No date in here" + ) + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser." + "FILENAME_DATE_ORDER", + new_callable=mock.PropertyMock, + return_value="YMD" + ) + @mock.patch(MOCK_SCRATCH, SCRATCH) + def test_filename_date_parse_invalid(self, *args): + document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf") + document.get_text() + self.assertIsNone(document.get_date())