mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Merge pull request #302 from BastianPoe/bugfix/extend_regex_to_find_more_dates
Extends the regex to find dates in documents as reported by @isaacsando
This commit is contained in:
		@@ -202,7 +202,13 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
        return text
 | 
			
		||||
 | 
			
		||||
    def get_date(self):
        """
        Return the first parseable date found in the document text.

        The text returned by ``self.get_text()`` is scanned for date-like
        strings.  Candidates are handed to ``dateparser`` in the order they
        appear, and the first one that parses to a date is returned.  The
        outcome (date found / not found) is logged at "info" level.

        Returns ``None`` if ``get_text()`` raises ``ParseError`` or if no
        candidate string parses to a date.
        """
        try:
            text = self.get_text()
        except ParseError:
            return None

        # This regular expression will try to find dates in the document at
        # hand and will match the following formats:
        # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
        # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
        # - MONTH ZZZZ, with ZZZZ being 4 digits
        # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
        pattern = re.compile(
            r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
            r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
            r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
            r'\b([^\W\d_]{3,9} [0-9]{4})\b')

        date = None
        datestring = None

        # Iterate through all regex matches and try to parse the date.  The
        # regex is deliberately permissive (e.g. "ipsum 2018" matches the
        # MONTH ZZZZ form), so a match is only a candidate until dateparser
        # accepts it.
        for match in pattern.finditer(text):
            datestring = match.group(0)

            try:
                date = dateparser.parse(
                    datestring,
                    settings={'DATE_ORDER': self.DATE_ORDER,
                              'PREFER_DAY_OF_MONTH': 'first',
                              'RETURN_AS_TIMEZONE_AWARE': True})
            except TypeError:
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None:
                break

        if date is not None:
            self.log("info", "Detected document date " + date.strftime("%x") +
                             " based on string " + datestring)
        else:
            self.log("info", "Unable to detect date for document")

        return date
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def run_convert(*args):
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_8.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_8.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_9.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/tests_date_9.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							@@ -25,6 +25,97 @@ class TestDate(TestCase):
 | 
			
		||||
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
 | 
			
		||||
        SCRATCH
 | 
			
		||||
    )
 | 
			
		||||
    def test_date_format_1(self):
        """A compact six-digit number (130218) must not be read as a date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        # NOTE(review): presumably get_text() serves TEXT_CACHE when it is
        # set, so get_date() runs on this string -- confirm in the parser.
        document.TEXT_CACHE = "lorem ipsum 130218 lorem ipsum"
        self.assertIsNone(document.get_date())
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_date_format_2(self):
        """A bare four-digit year between words must not yield a date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        # NOTE(review): presumably get_text() serves TEXT_CACHE when it is
        # set, so get_date() runs on this string -- confirm in the parser.
        document.TEXT_CACHE = "lorem ipsum 2018 lorem ipsum"
        self.assertIsNone(document.get_date())
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_date_format_3(self):
        """A compact eight-digit number (20180213) must not yield a date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        # NOTE(review): presumably get_text() serves TEXT_CACHE when it is
        # set, so get_date() runs on this string -- confirm in the parser.
        document.TEXT_CACHE = "lorem ipsum 20180213 lorem ipsum"
        self.assertIsNone(document.get_date())
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_date_format_4(self):
        """13.02.2018 embedded in text is detected as 13 February 2018."""
        parser = RasterisedDocumentParser(
            os.path.join(self.SAMPLE_FILES, ""))
        parser.TEXT_CACHE = "lorem ipsum 13.02.2018 lorem ipsum"

        expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
        self.assertEqual(parser.get_date(), expected)
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_date_format_5(self):
        """The dotted date is picked out of a line of date-like numbers."""
        parser = RasterisedDocumentParser(
            os.path.join(self.SAMPLE_FILES, ""))
        # Only the last token (13.02.2018) is expected to produce the date;
        # the three preceding numbers are the inputs of tests 1-3.
        parser.TEXT_CACHE = ("lorem ipsum 130218, 2018, 20180213 and "
                             "13.02.2018 lorem ipsum")

        expected = datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
        self.assertEqual(parser.get_date(), expected)
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_date_format_6(self):
        """Bank-detail style digit groups must not be mistaken for a date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        # NOTE(review): presumably get_text() serves TEXT_CACHE when it is
        # set, so get_date() runs on this string -- confirm in the parser.
        document.TEXT_CACHE = ("lorem ipsum\n"
                               "Wohnort\n"
                               "3100\n"
                               "IBAN\n"
                               "AT87 4534\n"
                               "1234\n"
                               "1234 5678\n"
                               "BIC\n"
                               "lorem ipsum")
        self.assertIsNone(document.get_date())
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_date_format_7(self):
        """A "März 2019" line is detected as 1 March 2019."""
        parser = RasterisedDocumentParser(
            os.path.join(self.SAMPLE_FILES, ""))
        parser.TEXT_CACHE = ("lorem ipsum\n"
                             "März 2019\n"
                             "lorem ipsum")

        expected = datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc())
        self.assertEqual(parser.get_date(), expected)
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
 | 
			
		||||
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
 | 
			
		||||
        SAMPLE_FILES
 | 
			
		||||
    )
 | 
			
		||||
    def test_get_text_1_pdf(self):
 | 
			
		||||
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
 | 
			
		||||
        document = RasterisedDocumentParser(input_file)
 | 
			
		||||
@@ -213,3 +304,29 @@ class TestDate(TestCase):
 | 
			
		||||
        self.assertEqual(document.get_date(),
 | 
			
		||||
                         datetime.datetime(2018, 4, 1, 0, 0,
 | 
			
		||||
                                           tzinfo=tz.tzutc()))
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_8_pdf(self):
        """tests_date_8.pdf reports as OCRed and dates to 31 December 2017."""
        parser = RasterisedDocumentParser(
            os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf"))
        parser.get_text()
        self.assertEqual(parser._is_ocred(), True)

        expected = datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
        self.assertEqual(parser.get_date(), expected)
 | 
			
		||||
 | 
			
		||||
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    def test_get_text_9_pdf(self):
        """tests_date_9.pdf reports as OCRed and dates to 31 December 2017."""
        parser = RasterisedDocumentParser(
            os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf"))
        parser.get_text()
        self.assertEqual(parser._is_ocred(), True)

        expected = datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
        self.assertEqual(parser.get_date(), expected)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user