diff --git a/docs/guesswork.rst b/docs/guesswork.rst index 4c2a67595..0e728d7af 100644 --- a/docs/guesswork.rst +++ b/docs/guesswork.rst @@ -92,11 +92,11 @@ text and matching algorithm. From the help info there: uses a regex to match the PDF. If you don't know what a regex is, you probably don't want this option. -When using the "any" or "all" matching algorithms, you can search for terms that -consist of multiple words by enclosing them in double quotes. For example, defining -a match text of ``"Bank of America" BofA`` using the "any" algorithm, will match -documents that contain either "Bank of America" or "BofA", but will not match -documents containing "Bank of South America". +When using the "any" or "all" matching algorithms, you can search for terms +that consist of multiple words by enclosing them in double quotes. For example, +defining a match text of ``"Bank of America" BofA`` using the "any" algorithm, +will match documents that contain either "Bank of America" or "BofA", but will +not match documents containing "Bank of South America". Then just save your tag/correspondent and run another document through the consumer. Once complete, you should see the newly-created document, diff --git a/paperless.conf.example b/paperless.conf.example index 0b7d358c9..58d3f09d9 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -196,8 +196,9 @@ PAPERLESS_DEBUG="false" #PAPERLESS_CONSUMER_LOOP_TIME=10 -# By default Paperless stops consuming a document if no language can be detected. -# Set to true to consume documents even if the language detection fails. +# By default Paperless stops consuming a document if no language can be +# detected. Set to true to consume documents even if the language detection +# fails. #PAPERLESS_FORGIVING_OCR="false" diff --git a/src/documents/parsers.py b/src/documents/parsers.py index fec6ed67c..142ebba68 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -21,9 +21,9 @@ from django.utils import timezone # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits DATE_REGEX = re.compile( - r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + - r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + - r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + + r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501 + r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501 + r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501 r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' + r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))' ) @@ -80,15 +80,20 @@ class DocumentParser: Returns the date of the document. """ - def __parser__(ds, date_order): + def __parser(ds, date_order): """ Call dateparser.parse with a particular date ordering """ - return dateparser.parse(ds, - settings={"DATE_ORDER": date_order, - "PREFER_DAY_OF_MONTH": "first", - "RETURN_AS_TIMEZONE_AWARE": - True}) + return dateparser.parse( + ds, + settings={ + "DATE_ORDER": date_order, + "PREFER_DAY_OF_MONTH": "first", + "RETURN_AS_TIMEZONE_AWARE": + True + } + ) + date = None date_string = None @@ -102,16 +107,18 @@ class DocumentParser: date_string = m.group(0) try: - date = __parser__(date_string, self.FILENAME_DATE_ORDER) + date = __parser(date_string, self.FILENAME_DATE_ORDER) except TypeError: # Skip all matches that do not parse to a proper date continue if date is not None and next_year > date.year > 1900: - self.log("info", - "Detected document date {} based on string {} " - "from document title" - "".format(date.isoformat(), date_string)) + self.log( + "info", + "Detected document date {} based on string {} " + "from document title" + "".format(date.isoformat(), date_string) + ) return date try: @@ -126,7 +133,7 @@ class DocumentParser: date_string = m.group(0) try: - date = __parser__(date_string, self.DATE_ORDER) + date = __parser(date_string, self.DATE_ORDER) except TypeError: # Skip all matches that do not parse to a proper date continue diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index a45f86cbe..ce599122e 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -63,8 +63,10 @@ class TestDate(TestCase): date = document.get_date() self.assertEqual( date, - datetime.datetime(2018, 2, 13, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -76,12 +78,15 @@ class TestDate(TestCase): document = RasterisedDocumentParser(input_file) document._text = ( "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " - "ipsum") + "ipsum" + ) date = document.get_date() self.assertEqual( date, - datetime.datetime(2018, 2, 13, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -119,8 +124,10 @@ class TestDate(TestCase): date = document.get_date() self.assertEqual( date, - datetime.datetime(2019, 3, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2019, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -130,20 +137,25 @@ class TestDate(TestCase): def test_date_format_8(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) - document._text = ("lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum\n" - "März 2020") - self.assertEqual(document.get_date(), - datetime.datetime(2020, 3, 1, 0, 0, - tzinfo=tz.gettz( - settings.TIME_ZONE))) + document._text = ( + "lorem ipsum\n" + "Wohnort\n" + "3100\n" + "IBAN\n" + "AT87 4534\n" + "1234\n" + "1234 5678\n" + "BIC\n" + "lorem ipsum\n" + "März 2020" + ) + self.assertEqual( + document.get_date(), + datetime.datetime( + 2020, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) @mock.patch( "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", @@ -152,14 +164,19 @@ class TestDate(TestCase): def test_date_format_9(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) - document._text = ("lorem ipsum\n" - "27. Nullmonth 2020\n" - "März 2020\n" - "lorem ipsum") - self.assertEqual(document.get_date(), - datetime.datetime(2020, 3, 1, 0, 0, - tzinfo=tz.gettz( - settings.TIME_ZONE))) + document._text = ( + "lorem ipsum\n" + "27. Nullmonth 2020\n" + "März 2020\n" + "lorem ipsum" + ) + self.assertEqual( + document.get_date(), + datetime.datetime( + 2020, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) @mock.patch( "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", @@ -174,8 +191,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( date, - datetime.datetime(2018, 4, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 4, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -190,8 +209,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 4, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 4, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -206,8 +227,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2013, 2, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2013, 2, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -222,8 +245,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2013, 2, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2013, 2, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -239,8 +264,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -256,8 +283,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -273,8 +302,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -290,8 +321,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -306,8 +339,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 12, 17, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -322,8 +357,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 12, 17, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -338,8 +375,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 12, 17, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -354,8 +393,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 12, 17, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -394,8 +435,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 4, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 4, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -410,8 +453,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2017, 12, 31, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2017, 12, 31, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -426,8 +471,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2017, 12, 31, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2017, 12, 31, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -435,8 +482,10 @@ class TestDate(TestCase): SCRATCH ) def test_filename_date_1_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, - "tests_date_in_filename_2018-03-20_1.pdf") + input_file = os.path.join( + self.SAMPLE_FILES, + "tests_date_in_filename_2018-03-20_1.pdf" + ) document = RasterisedDocumentParser(input_file) document.FILENAME_DATE_ORDER = 'YMD' document.get_text() @@ -444,8 +493,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( date, - datetime.datetime(2018, 3, 20, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 3, 20, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -453,16 +504,20 @@ class TestDate(TestCase): SCRATCH ) def test_filename_date_1_png(self): - input_file = os.path.join(self.SAMPLE_FILES, - "tests_date_in_filename_2018-03-20_1.png") + input_file = os.path.join( + self.SAMPLE_FILES, + "tests_date_in_filename_2018-03-20_1.png" + ) document = RasterisedDocumentParser(input_file) document.FILENAME_DATE_ORDER = 'YMD' date = document.get_date() self.assertEqual(document._is_ocred(), False) self.assertEqual( date, - datetime.datetime(2018, 3, 20, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2018, 3, 20, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -470,16 +525,20 @@ class TestDate(TestCase): SCRATCH ) def test_filename_date_2_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, - "2013-12-11_tests_date_in_filename_2.pdf") + input_file = os.path.join( + self.SAMPLE_FILES, + "2013-12-11_tests_date_in_filename_2.pdf" + ) document = RasterisedDocumentParser(input_file) document.FILENAME_DATE_ORDER = 'YMD' date = document.get_date() self.assertEqual(document._is_ocred(), True) self.assertEqual( date, - datetime.datetime(2013, 12, 11, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2013, 12, 11, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -487,16 +546,20 @@ class TestDate(TestCase): SCRATCH ) def test_filename_date_2_png(self): - input_file = os.path.join(self.SAMPLE_FILES, - "2013-12-11_tests_date_in_filename_2.png") + input_file = os.path.join( + self.SAMPLE_FILES, + "2013-12-11_tests_date_in_filename_2.png" + ) document = RasterisedDocumentParser(input_file) document.FILENAME_DATE_ORDER = 'YMD' date = document.get_date() self.assertEqual(document._is_ocred(), False) self.assertEqual( date, - datetime.datetime(2013, 12, 11, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE)) + datetime.datetime( + 2013, 12, 11, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch(