diff --git a/docs/guesswork.rst b/docs/guesswork.rst index 54982e96c..0e728d7af 100644 --- a/docs/guesswork.rst +++ b/docs/guesswork.rst @@ -43,6 +43,16 @@ These however wouldn't work: * ``Some Company Name, Invoice 2016-01-01, money, invoices.pdf`` * ``Another Company- Letter of Reference.jpg`` +Do I have to be so strict about naming? +--------------------------------------- +Rather than using the strict document naming rules, one can also set the option +``PAPERLESS_FILENAME_DATE_ORDER`` in ``paperless.conf`` to any date order +that is accepted by dateparser_. Doing so will cause ``paperless`` to use the +first valid date found in the document's filename, falling back to a date +pulled from the document's text only if the filename contains none, without +requiring the strict formatting of the document filename as described above. + +.. _dateparser: https://github.com/scrapinghub/dateparser/blob/v0.7.0/docs/usage.rst#settings .. _guesswork-content: @@ -82,11 +92,11 @@ text and matching algorithm. From the help info there: uses a regex to match the PDF. If you don't know what a regex is, you probably don't want this option. -When using the "any" or "all" matching algorithms, you can search for terms that -consist of multiple words by enclosing them in double quotes. For example, defining -a match text of ``"Bank of America" BofA`` using the "any" algorithm, will match -documents that contain either "Bank of America" or "BofA", but will not match -documents containing "Bank of South America". +When using the "any" or "all" matching algorithms, you can search for terms +that consist of multiple words by enclosing them in double quotes. For example, +defining a match text of ``"Bank of America" BofA`` using the "any" algorithm, +will match documents that contain either "Bank of America" or "BofA", but will +not match documents containing "Bank of South America". Then just save your tag/correspondent and run another document through the consumer.
Once complete, you should see the newly-created document, diff --git a/paperless.conf.example b/paperless.conf.example index 11e6d905b..58d3f09d9 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -127,6 +127,14 @@ PAPERLESS_DEBUG="false" # "true", the document will instead be opened in the browser, if possible. #PAPERLESS_INLINE_DOC="false" +# By default, paperless will check the document text for document date information. +# Uncomment the line below to enable checking the document filename for date +# information. The date order can be set to any option as specified in +# https://dateparser.readthedocs.io/en/latest/#settings. The filename will be +# checked first, and if nothing is found, the document text will be checked +# as normal. +#PAPERLESS_FILENAME_DATE_ORDER="YMD" + # # The following values use sensible defaults for modern systems, but if you're # running Paperless on a low-resource device (like a Raspberry Pi), modifying @@ -188,8 +196,9 @@ PAPERLESS_DEBUG="false" #PAPERLESS_CONSUMER_LOOP_TIME=10 -# By default Paperless stops consuming a document if no language can be detected. -# Set to true to consume documents even if the language detection fails. +# By default Paperless stops consuming a document if no language can be +# detected. Set to true to consume documents even if the language detection +# fails. 
#PAPERLESS_FORGIVING_OCR="false"
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 1f60b1479..142ebba68 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -14,14 +14,21 @@ from django.utils import timezone
 # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
+# Every alternative must start at a word boundary or directly after "_"/"-",
+# and end at a word boundary or directly before "_"/"-". "_" is a word
+# character, so \b alone would not match "name_2018-03-20_1.pdf".
 DATE_REGEX = re.compile(
-    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
-    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{4})\b'
+    r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
+    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
 )
 
 
@@ -37,6 +44,7 @@ class DocumentParser:
 
     SCRATCH = settings.SCRATCH_DIR
     DATE_ORDER = settings.DATE_ORDER
+    FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
     OPTIPNG = settings.OPTIPNG_BINARY
 
     def __init__(self, path):
@@ -75,30 +83,60 @@ class DocumentParser:
         Returns the date of the document.
""" + def __parser(ds, date_order): + """ + Call dateparser.parse with a particular date ordering + """ + return dateparser.parse( + ds, + settings={ + "DATE_ORDER": date_order, + "PREFER_DAY_OF_MONTH": "first", + "RETURN_AS_TIMEZONE_AWARE": + True + } + ) + date = None date_string = None + next_year = timezone.now().year + 5 # Arbitrary 5 year future limit + title = os.path.basename(self.document_path) + + # if filename date parsing is enabled, search there first: + if self.FILENAME_DATE_ORDER: + self.log("info", "Checking document title for date") + for m in re.finditer(DATE_REGEX, title): + date_string = m.group(0) + + try: + date = __parser(date_string, self.FILENAME_DATE_ORDER) + except TypeError: + # Skip all matches that do not parse to a proper date + continue + + if date is not None and next_year > date.year > 1900: + self.log( + "info", + "Detected document date {} based on string {} " + "from document title" + "".format(date.isoformat(), date_string) + ) + return date + try: + # getting text after checking filename will save time if only + # looking at the filename instead of the whole text text = self.get_text() except ParseError: return None - next_year = timezone.now().year + 5 # Arbitrary 5 year future limit - - # Iterate through all regex matches and try to parse the date + # Iterate through all regex matches in text and try to parse the date for m in re.finditer(DATE_REGEX, text): - date_string = m.group(0) try: - date = dateparser.parse( - date_string, - settings={ - "DATE_ORDER": self.DATE_ORDER, - "PREFER_DAY_OF_MONTH": "first", - "RETURN_AS_TIMEZONE_AWARE": True - } - ) + date = __parser(date_string, self.DATE_ORDER) except TypeError: # Skip all matches that do not parse to a proper date continue diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 97226ef44..3583ca03a 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -306,6 +306,7 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END") # Specify the 
default date order (for autodetected dates) DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY") +FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER") # Specify for how many years a correspondent is considered recent. Recent # correspondents will be shown in a separate "Recent correspondents" filter as diff --git a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf new file mode 100644 index 000000000..629125956 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf differ diff --git a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png new file mode 100644 index 000000000..4a7671635 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf new file mode 100644 index 000000000..629125956 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png new file mode 100644 index 000000000..4a7671635 Binary files /dev/null and b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png differ diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index 3959ded31..ce599122e 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -8,6 +8,7 @@ from dateutil import tz from django.test import TestCase, override_settings from ..parsers import 
RasterisedDocumentParser +from django.conf import settings class TestDate(TestCase): @@ -59,9 +60,13 @@ class TestDate(TestCase): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) document._text = "lorem ipsum 13.02.2018 lorem ipsum" + date = document.get_date() self.assertEqual( - document.get_date(), - datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc()) + date, + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -72,10 +77,16 @@ class TestDate(TestCase): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) document._text = ( - "lorem ipsum 130218, 2018, 20180213 and 13.02.2018 lorem ipsum") + "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " + "ipsum" + ) + date = document.get_date() self.assertEqual( - document.get_date(), - datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc()) + date, + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -110,9 +121,13 @@ class TestDate(TestCase): "März 2019\n" "lorem ipsum" ) + date = document.get_date() self.assertEqual( - document.get_date(), - datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc()) + date, + datetime.datetime( + 2019, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -122,19 +137,25 @@ class TestDate(TestCase): def test_date_format_8(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) - document._text = ("lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum\n" - "März 2020") - self.assertEqual(document.get_date(), - datetime.datetime(2020, 3, 1, 0, 0, - tzinfo=tz.tzutc())) + document._text = ( + "lorem ipsum\n" + "Wohnort\n" + "3100\n" + "IBAN\n" + "AT87 4534\n" + "1234\n" + "1234 5678\n" + "BIC\n" + "lorem ipsum\n" + "März 2020" + ) + self.assertEqual( + 
document.get_date(), + datetime.datetime( + 2020, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) @mock.patch( "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", @@ -143,13 +164,19 @@ class TestDate(TestCase): def test_date_format_9(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) - document._text = ("lorem ipsum\n" - "27. Nullmonth 2020\n" - "März 2020\n" - "lorem ipsum") - self.assertEqual(document.get_date(), - datetime.datetime(2020, 3, 1, 0, 0, - tzinfo=tz.tzutc())) + document._text = ( + "lorem ipsum\n" + "27. Nullmonth 2020\n" + "März 2020\n" + "lorem ipsum" + ) + self.assertEqual( + document.get_date(), + datetime.datetime( + 2020, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) @mock.patch( "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", @@ -158,11 +185,16 @@ class TestDate(TestCase): def test_get_text_1_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() + date = document.get_date() self.assertEqual(document._is_ocred(), True) self.assertEqual( - document.get_date(), - datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc()) + date, + datetime.datetime( + 2018, 4, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -172,11 +204,15 @@ class TestDate(TestCase): def test_get_text_1_png(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 4, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -186,11 +222,15 @@ class TestDate(TestCase): def test_get_text_2_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, 
"tests_date_2.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2013, 2, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -200,11 +240,15 @@ class TestDate(TestCase): def test_get_text_2_png(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2013, 2, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -215,11 +259,15 @@ class TestDate(TestCase): def test_get_text_3_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -230,11 +278,15 @@ class TestDate(TestCase): def test_get_text_3_png(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -245,11 +297,15 @@ class TestDate(TestCase): def test_get_text_4_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf") document = RasterisedDocumentParser(input_file) + 
document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -260,11 +316,15 @@ class TestDate(TestCase): def test_get_text_4_png(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 10, 5, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -274,11 +334,15 @@ class TestDate(TestCase): def test_get_text_5_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 12, 17, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -288,11 +352,15 @@ class TestDate(TestCase): def test_get_text_5_png(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 12, 17, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -307,7 +375,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 12, 17, 0, 0, + 
tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -322,7 +393,10 @@ class TestDate(TestCase): self.assertEqual(document._is_ocred(), False) self.assertEqual( document.get_date(), - datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 12, 17, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -332,6 +406,7 @@ class TestDate(TestCase): def test_get_text_6_pdf_eu(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual(document.get_date(), None) @@ -343,6 +418,7 @@ class TestDate(TestCase): def test_get_text_6_png_eu(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), False) self.assertEqual(document.get_date(), None) @@ -354,11 +430,15 @@ class TestDate(TestCase): def test_get_text_7_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2018, 4, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -368,11 +448,15 @@ class TestDate(TestCase): def test_get_text_8_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2017, 12, 31, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch( @@ -382,11 +466,100 @@ 
class TestDate(TestCase): def test_get_text_9_pdf(self): input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf") document = RasterisedDocumentParser(input_file) + document.DATE_ORDER = 'DMY' document.get_text() self.assertEqual(document._is_ocred(), True) self.assertEqual( document.get_date(), - datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc()) + datetime.datetime( + 2017, 12, 31, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SCRATCH + ) + def test_filename_date_1_pdf(self): + input_file = os.path.join( + self.SAMPLE_FILES, + "tests_date_in_filename_2018-03-20_1.pdf" + ) + document = RasterisedDocumentParser(input_file) + document.FILENAME_DATE_ORDER = 'YMD' + document.get_text() + date = document.get_date() + self.assertEqual(document._is_ocred(), True) + self.assertEqual( + date, + datetime.datetime( + 2018, 3, 20, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SCRATCH + ) + def test_filename_date_1_png(self): + input_file = os.path.join( + self.SAMPLE_FILES, + "tests_date_in_filename_2018-03-20_1.png" + ) + document = RasterisedDocumentParser(input_file) + document.FILENAME_DATE_ORDER = 'YMD' + date = document.get_date() + self.assertEqual(document._is_ocred(), False) + self.assertEqual( + date, + datetime.datetime( + 2018, 3, 20, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SCRATCH + ) + def test_filename_date_2_pdf(self): + input_file = os.path.join( + self.SAMPLE_FILES, + "2013-12-11_tests_date_in_filename_2.pdf" + ) + document = RasterisedDocumentParser(input_file) + document.FILENAME_DATE_ORDER = 'YMD' + date = document.get_date() + self.assertEqual(document._is_ocred(), True) + self.assertEqual( + date, + datetime.datetime( + 2013, 12, 11, 0, 0, + 
tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SCRATCH + ) + def test_filename_date_2_png(self): + input_file = os.path.join( + self.SAMPLE_FILES, + "2013-12-11_tests_date_in_filename_2.png" + ) + document = RasterisedDocumentParser(input_file) + document.FILENAME_DATE_ORDER = 'YMD' + date = document.get_date() + self.assertEqual(document._is_ocred(), False) + self.assertEqual( + date, + datetime.datetime( + 2013, 12, 11, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) ) @mock.patch(