def get_date(self):
    """
    Return the first date that can be parsed out of the document text.

    The text comes from ``self.get_text()``. Every substring that looks
    like a date is handed to ``dateparser`` in order of appearance, and
    the first one that yields a result wins.

    Returns:
        The parsed date (``dateparser`` is asked for a timezone-aware
        value), or ``None`` when the text cannot be obtained or no
        candidate parses.
    """
    try:
        text = self.get_text()
    except ParseError:
        # Without text there is nothing to search.
        return None

    # This regular expression will try to find dates in the document at
    # hand and will match the following formats:
    # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
    # - XX. MONTH ZZZZ or XX MONTH ZZZZ with XX being 1 or 2 and ZZZZ
    #   being 2 or 4 digits
    # - MONTH ZZZZ, with ZZZZ being 4 digits
    # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
    pattern = re.compile(
        r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
        r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
        r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
        r'\b([^\W\d_]{3,9} [0-9]{4})\b')

    date = None
    datestring = None

    # Iterate through all regex matches and try to parse the date; the
    # first candidate that parses ends the search.
    for m in pattern.finditer(text):
        datestring = m.group(0)

        try:
            date = dateparser.parse(
                datestring,
                settings={'DATE_ORDER': self.DATE_ORDER,
                          'PREFER_DAY_OF_MONTH': 'first',
                          'RETURN_AS_TIMEZONE_AWARE': True})
        except TypeError:
            # Skip all matches that do not parse to a proper date
            continue

        if date is not None:
            break

    if date is not None:
        self.log(
            "info",
            "Detected document date {} based on string {}".format(
                date.strftime("%x"), datestring))
    else:
        self.log("info", "Unable to detect date for document")

    return date
# Methods of the TestDate test case (the class header is outside this view).


# A bare six-digit run is not one of the supported date formats.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SCRATCH
)
def test_date_format_1(self):
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    document.TEXT_CACHE = "lorem ipsum 130218 lorem ipsum"
    self.assertIsNone(document.get_date())


# A lone year must not be reported as a date.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_date_format_2(self):
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    document.TEXT_CACHE = "lorem ipsum 2018 lorem ipsum"
    self.assertIsNone(document.get_date())


# An eight-digit run without separators is not a supported format either.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_date_format_3(self):
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    document.TEXT_CACHE = "lorem ipsum 20180213 lorem ipsum"
    self.assertIsNone(document.get_date())


# A dotted day.month.year date is detected.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_date_format_4(self):
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    document.TEXT_CACHE = "lorem ipsum 13.02.2018 lorem ipsum"
    self.assertEqual(document.get_date(),
                     datetime.datetime(2018, 2, 13, 0, 0,
                                       tzinfo=tz.tzutc()))


# Unusable candidates that come first are skipped until a real date shows up.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_date_format_5(self):
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    document.TEXT_CACHE = ("lorem ipsum 130218, 2018, 20180213 and "
                           "13.02.2018 lorem ipsum")
    self.assertEqual(document.get_date(),
                     datetime.datetime(2018, 2, 13, 0, 0,
                                       tzinfo=tz.tzutc()))


# Digit groups spread over several lines (postcode, IBAN) yield no date.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_date_format_6(self):
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    document.TEXT_CACHE = ("lorem ipsum\n"
                           "Wohnort\n"
                           "3100\n"
                           "IBAN\n"
                           "AT87 4534\n"
                           "1234\n"
                           "1234 5678\n"
                           "BIC\n"
                           "lorem ipsum")
    self.assertIsNone(document.get_date())


# A month name with a non-ASCII letter plus a year resolves to the first
# day of that month.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_date_format_7(self):
    input_file = os.path.join(self.SAMPLE_FILES, "")
    document = RasterisedDocumentParser(input_file)
    document.TEXT_CACHE = ("lorem ipsum\n"
                           "März 2019\n"
                           "lorem ipsum")
    self.assertEqual(document.get_date(),
                     datetime.datetime(2019, 3, 1, 0, 0,
                                       tzinfo=tz.tzutc()))


# NOTE: the pre-existing tests that sit between test_date_format_7 and
# test_get_text_8_pdf (starting with test_get_text_1_pdf) are only partially
# visible in this view, so they are left out here rather than guessed at.


# Sample PDF 8: text is extracted first, then the detected date is checked.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_get_text_8_pdf(self):
    input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
    document = RasterisedDocumentParser(input_file)
    document.get_text()
    self.assertEqual(document._is_ocred(), True)
    self.assertEqual(document.get_date(),
                     datetime.datetime(2017, 12, 31, 0, 0,
                                       tzinfo=tz.tzutc()))


# Sample PDF 9: same expectations as sample 8.
@mock.patch(
    "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
    SAMPLE_FILES
)
def test_get_text_9_pdf(self):
    input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
    document = RasterisedDocumentParser(input_file)
    document.get_text()
    self.assertEqual(document._is_ocred(), True)
    self.assertEqual(document.get_date(),
                     datetime.datetime(2017, 12, 31, 0, 0,
                                       tzinfo=tz.tzutc()))