diff --git a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf deleted file mode 100644 index 629125956..000000000 Binary files a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png deleted file mode 100644 index 4a7671635..000000000 Binary files a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_1.pdf b/src/paperless_tesseract/tests/samples/tests_date_1.pdf deleted file mode 100644 index 95452c515..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_1.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_1.png b/src/paperless_tesseract/tests/samples/tests_date_1.png deleted file mode 100644 index b83ab569a..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_1.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_2.pdf b/src/paperless_tesseract/tests/samples/tests_date_2.pdf deleted file mode 100644 index 0f4013a0e..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_2.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_2.png b/src/paperless_tesseract/tests/samples/tests_date_2.png deleted file mode 100644 index dabb6ee09..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_2.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_3.pdf b/src/paperless_tesseract/tests/samples/tests_date_3.pdf deleted file mode 100644 index 1a91a1c71..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_3.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_3.png b/src/paperless_tesseract/tests/samples/tests_date_3.png deleted file mode 100644 index f45a296ed..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_3.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_4.pdf b/src/paperless_tesseract/tests/samples/tests_date_4.pdf deleted file mode 100644 index d89dd1692..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_4.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_4.png b/src/paperless_tesseract/tests/samples/tests_date_4.png deleted file mode 100644 index a82f21e88..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_4.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_5.pdf b/src/paperless_tesseract/tests/samples/tests_date_5.pdf deleted file mode 100644 index 717ea4ef4..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_5.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_5.png b/src/paperless_tesseract/tests/samples/tests_date_5.png deleted file mode 100644 index 97afeec01..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_5.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_6.pdf b/src/paperless_tesseract/tests/samples/tests_date_6.pdf deleted file mode 100644 index bda2dc280..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_6.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_6.png b/src/paperless_tesseract/tests/samples/tests_date_6.png deleted file mode 100644 index 425105d35..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_6.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_7.pdf b/src/paperless_tesseract/tests/samples/tests_date_7.pdf deleted file mode 100644 index c10f66883..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_7.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_8.pdf b/src/paperless_tesseract/tests/samples/tests_date_8.pdf deleted file mode 100644 index ea7495f29..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_8.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_9.pdf b/src/paperless_tesseract/tests/samples/tests_date_9.pdf deleted file mode 100644 index 2dedadf56..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_9.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf deleted file mode 100644 index 629125956..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf and /dev/null differ diff --git a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png deleted file mode 100644 index 4a7671635..000000000 Binary files a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index ce599122e..ac2f9648f 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -5,7 +5,7 @@ from unittest import mock from uuid import uuid4 from dateutil import tz -from django.test import TestCase, override_settings +from django.test import TestCase from ..parsers import RasterisedDocumentParser from django.conf import settings @@ -16,46 +16,36 @@ class TestDate(TestCase): SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) + MOCK_SCRATCH = "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH" # NOQA: E501 + def setUp(self): os.makedirs(self.SCRATCH, exist_ok=True) def tearDown(self): shutil.rmtree(self.SCRATCH) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_1(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) document._text = "lorem ipsum 130218 lorem ipsum" self.assertEqual(document.get_date(), None) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_2(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) document._text = "lorem ipsum 2018 lorem ipsum" self.assertEqual(document.get_date(), None) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_3(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) document._text = "lorem ipsum 20180213 lorem ipsum" self.assertEqual(document.get_date(), None) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_4(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) @@ -69,10 +59,7 @@ class TestDate(TestCase): ) ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_5(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) @@ -89,10 +76,7 @@ class TestDate(TestCase): ) ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_6(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) @@ -109,10 +93,7 @@ class TestDate(TestCase): ) self.assertEqual(document.get_date(), None) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_7(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) @@ -130,10 +111,7 @@ class TestDate(TestCase): ) ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_8(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) @@ -157,10 +135,7 @@ class TestDate(TestCase): ) ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_date_format_9(self): input_file = os.path.join(self.SAMPLE_FILES, "") document = RasterisedDocumentParser(input_file) @@ -178,398 +153,11 @@ class TestDate(TestCase): ) ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_1_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - date = document.get_date() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - date, - datetime.datetime( - 2018, 4, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_1_png(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 4, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_2_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2013, 2, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_2_png(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2013, 2, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - @override_settings(OCR_LANGUAGE="deu") - def test_get_text_3_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - @override_settings(OCR_LANGUAGE="deu") - def test_get_text_3_png(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - @override_settings(OCR_LANGUAGE="eng") - def test_get_text_4_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - @override_settings(OCR_LANGUAGE="eng") - def test_get_text_4_png(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 10, 5, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_5_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_5_png(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_6_pdf_us(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf") - document = RasterisedDocumentParser(input_file) - document.get_text() - document.DATE_ORDER = "MDY" - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_6_png_us(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png") - document = RasterisedDocumentParser(input_file) - document.get_text() - document.DATE_ORDER = "MDY" - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 12, 17, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_6_pdf_eu(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual(document.get_date(), None) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_6_png_eu(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), False) - self.assertEqual(document.get_date(), None) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_7_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2018, 4, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_8_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2017, 12, 31, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_get_text_9_pdf(self): - input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf") - document = RasterisedDocumentParser(input_file) - document.DATE_ORDER = 'DMY' - document.get_text() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2017, 12, 31, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_filename_date_1_pdf(self): - input_file = os.path.join( - self.SAMPLE_FILES, - "tests_date_in_filename_2018-03-20_1.pdf" - ) - document = RasterisedDocumentParser(input_file) - document.FILENAME_DATE_ORDER = 'YMD' - document.get_text() - date = document.get_date() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - date, - datetime.datetime( - 2018, 3, 20, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_filename_date_1_png(self): - input_file = os.path.join( - self.SAMPLE_FILES, - "tests_date_in_filename_2018-03-20_1.png" - ) - document = RasterisedDocumentParser(input_file) - document.FILENAME_DATE_ORDER = 'YMD' - date = document.get_date() - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - date, - datetime.datetime( - 2018, 3, 20, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_filename_date_2_pdf(self): - input_file = os.path.join( - self.SAMPLE_FILES, - "2013-12-11_tests_date_in_filename_2.pdf" - ) - document = RasterisedDocumentParser(input_file) - document.FILENAME_DATE_ORDER = 'YMD' - date = document.get_date() - self.assertEqual(document._is_ocred(), True) - self.assertEqual( - date, - datetime.datetime( - 2013, 12, 11, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) - def test_filename_date_2_png(self): - input_file = os.path.join( - self.SAMPLE_FILES, - "2013-12-11_tests_date_in_filename_2.png" - ) - document = RasterisedDocumentParser(input_file) - document.FILENAME_DATE_ORDER = 'YMD' - date = document.get_date() - self.assertEqual(document._is_ocred(), False) - self.assertEqual( - date, - datetime.datetime( - 2013, 12, 11, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - @mock.patch( "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", return_value="01-07-0590 00:00:00" ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_crazy_date_past(self, *args): document = RasterisedDocumentParser("/dev/null") document.get_text() @@ -579,10 +167,7 @@ class TestDate(TestCase): "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", return_value="01-07-2350 00:00:00" ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_crazy_date_future(self, *args): document = RasterisedDocumentParser("/dev/null") document.get_text() @@ -592,10 +177,7 @@ class TestDate(TestCase): "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", return_value="01-07-0590 00:00:00" ) - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", - SCRATCH - ) + @mock.patch(MOCK_SCRATCH, SCRATCH) def test_crazy_date_past(self, *args): document = RasterisedDocumentParser("/dev/null") document.get_text()