diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 6dab98d02..b03af1363 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -364,35 +364,35 @@ class TestFieldPermutations(TestCase): class DummyParser(DocumentParser): - def get_thumbnail(self): + def get_thumbnail(self, document_path, mime_type): # not important during tests raise NotImplementedError() - def __init__(self, path, logging_group, scratch_dir): - super(DummyParser, self).__init__(path, logging_group) + def __init__(self, logging_group, scratch_dir): + super(DummyParser, self).__init__(logging_group) _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) - def get_optimised_thumbnail(self): + def get_optimised_thumbnail(self, document_path, mime_type): return self.fake_thumb - def get_text(self): - return "The Text" + def parse(self, document_path, mime_type): + self.text = "The Text" class FaultyParser(DocumentParser): - def get_thumbnail(self): + def get_thumbnail(self, document_path, mime_type): # not important during tests raise NotImplementedError() - def __init__(self, path, logging_group, scratch_dir): - super(FaultyParser, self).__init__(path, logging_group) + def __init__(self, logging_group, scratch_dir): + super(FaultyParser, self).__init__(logging_group) _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) - def get_optimised_thumbnail(self): + def get_optimised_thumbnail(self, document_path, mime_type): return self.fake_thumb - def get_text(self): + def parse(self, document_path, mime_type): raise ParseError("Does not compute.") @@ -410,11 +410,11 @@ def fake_magic_from_file(file, mime=False): @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) class TestConsumer(TestCase): - def make_dummy_parser(self, path, logging_group): - return DummyParser(path, logging_group, self.scratch_dir) + def make_dummy_parser(self, logging_group): + return DummyParser(logging_group, self.scratch_dir) - def make_faulty_parser(self, path, logging_group): - return FaultyParser(path, logging_group, self.scratch_dir) + def make_faulty_parser(self, logging_group): + return FaultyParser(logging_group, self.scratch_dir) def setUp(self): self.scratch_dir = tempfile.mkdtemp() diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py new file mode 100644 index 000000000..357b0937e --- /dev/null +++ b/src/documents/tests/test_date_parsing.py @@ -0,0 +1,140 @@ +import datetime +import os +import shutil +from unittest import mock +from uuid import uuid4 + +from dateutil import tz +from django.conf import settings +from django.test import TestCase, override_settings + +from documents.parsers import parse_date +from paperless_tesseract.parsers import RasterisedDocumentParser + + +class TestDate(TestCase): + + SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples") + SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) + + def setUp(self): + os.makedirs(self.SCRATCH, exist_ok=True) + + def tearDown(self): + shutil.rmtree(self.SCRATCH) + + def test_date_format_1(self): + text = "lorem ipsum 130218 lorem ipsum" + self.assertEqual(parse_date("", text), None) + + def test_date_format_2(self): + text = "lorem ipsum 2018 lorem ipsum" + self.assertEqual(parse_date("", text), None) + + def test_date_format_3(self): + text = "lorem ipsum 20180213 lorem ipsum" + self.assertEqual(parse_date("", text), None) + + def test_date_format_4(self): + text = "lorem ipsum 13.02.2018 lorem ipsum" + date = parse_date("", text) + self.assertEqual( + date, + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + def test_date_format_5(self): + text = ( + "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " + "ipsum" + ) + date = parse_date("", text) + self.assertEqual( + date, + datetime.datetime( + 2018, 2, 13, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + def test_date_format_6(self): + text = ( + "lorem ipsum\n" + "Wohnort\n" + "3100\n" + "IBAN\n" + "AT87 4534\n" + "1234\n" + "1234 5678\n" + "BIC\n" + "lorem ipsum" + ) + self.assertEqual(parse_date("", text), None) + + def test_date_format_7(self): + text = ( + "lorem ipsum\n" + "März 2019\n" + "lorem ipsum" + ) + date = parse_date("", text) + self.assertEqual( + date, + datetime.datetime( + 2019, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + def test_date_format_8(self): + text = ( + "lorem ipsum\n" + "Wohnort\n" + "3100\n" + "IBAN\n" + "AT87 4534\n" + "1234\n" + "1234 5678\n" + "BIC\n" + "lorem ipsum\n" + "März 2020" + ) + self.assertEqual( + parse_date("", text), + datetime.datetime( + 2020, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + @override_settings(SCRATCH_DIR=SCRATCH) + def test_date_format_9(self): + text = ( + "lorem ipsum\n" + "27. Nullmonth 2020\n" + "März 2020\n" + "lorem ipsum" + ) + self.assertEqual( + parse_date("", text), + datetime.datetime( + 2020, 3, 1, 0, 0, + tzinfo=tz.gettz(settings.TIME_ZONE) + ) + ) + + def test_crazy_date_past(self, *args): + self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) + + def test_crazy_date_future(self, *args): + self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) + + def test_crazy_date_with_spaces(self, *args): + self.assertIsNone(parse_date("", "20 408000l 2475")) + + @override_settings(FILENAME_DATE_ORDER="YMD") + def test_filename_date_parse_invalid(self, *args): + self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py deleted file mode 100644 index 4d5ff07dd..000000000 --- a/src/paperless_tesseract/tests/test_date.py +++ /dev/null @@ -1,193 +0,0 @@ -import datetime -import os -import shutil -from unittest import mock -from uuid import uuid4 - -from dateutil import tz -from django.conf import settings -from django.test import TestCase, override_settings - -from ..parsers import RasterisedDocumentParser - - -class TestDate(TestCase): - - SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") - SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) - - def setUp(self): - os.makedirs(self.SCRATCH, exist_ok=True) - - def tearDown(self): - shutil.rmtree(self.SCRATCH) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_1(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = "lorem ipsum 130218 lorem ipsum" - self.assertEqual(document.get_date(), None) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_2(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = "lorem ipsum 2018 lorem ipsum" - self.assertEqual(document.get_date(), None) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_3(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = "lorem ipsum 20180213 lorem ipsum" - self.assertEqual(document.get_date(), None) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_4(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = "lorem ipsum 13.02.2018 lorem ipsum" - date = document.get_date() - self.assertEqual( - date, - datetime.datetime( - 2018, 2, 13, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_5(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = ( - "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " - "ipsum" - ) - date = document.get_date() - self.assertEqual( - date, - datetime.datetime( - 2018, 2, 13, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_6(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = ( - "lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum" - ) - self.assertEqual(document.get_date(), None) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_7(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = ( - "lorem ipsum\n" - "März 2019\n" - "lorem ipsum" - ) - date = document.get_date() - self.assertEqual( - date, - datetime.datetime( - 2019, 3, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_8(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = ( - "lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum\n" - "März 2020" - ) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2020, 3, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @override_settings(SCRATCH_DIR=SCRATCH) - def test_date_format_9(self): - input_file = os.path.join(self.SAMPLE_FILES, "") - document = RasterisedDocumentParser(input_file, None) - document._text = ( - "lorem ipsum\n" - "27. Nullmonth 2020\n" - "März 2020\n" - "lorem ipsum" - ) - self.assertEqual( - document.get_date(), - datetime.datetime( - 2020, 3, 1, 0, 0, - tzinfo=tz.gettz(settings.TIME_ZONE) - ) - ) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", - return_value="01-07-0590 00:00:00" - ) - @override_settings(SCRATCH_DIR=SCRATCH) - def test_crazy_date_past(self, *args): - document = RasterisedDocumentParser("/dev/null", None) - document.get_text() - self.assertIsNone(document.get_date()) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", - return_value="01-07-2350 00:00:00" - ) - @override_settings(SCRATCH_DIR=SCRATCH) - def test_crazy_date_future(self, *args): - document = RasterisedDocumentParser("/dev/null", None) - document.get_text() - self.assertIsNone(document.get_date()) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", - return_value="20 408000l 2475" - ) - @override_settings(SCRATCH_DIR=SCRATCH) - def test_crazy_date_with_spaces(self, *args): - document = RasterisedDocumentParser("/dev/null", None) - document.get_text() - self.assertIsNone(document.get_date()) - - @mock.patch( - "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", - return_value="No date in here" - ) - @override_settings(FILENAME_DATE_ORDER="YMD") - @override_settings(SCRATCH_DIR=SCRATCH) - def test_filename_date_parse_invalid(self, *args): - document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None) - document.get_text() - self.assertIsNone(document.get_date()) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index bc37b0b84..84363a18d 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -56,8 +56,8 @@ class TestAuxilliaryFunctions(TestCase): self.assertIsNone(text) def test_thumbnail(self): - parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) - parser.get_thumbnail() + parser = RasterisedDocumentParser(uuid.uuid4()) + parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), "application/pdf") # dont really know how to test it, just call it and assert that it does not raise anything. @mock.patch("paperless_tesseract.parsers.run_convert") @@ -71,6 +71,6 @@ class TestAuxilliaryFunctions(TestCase): m.side_effect = call_convert - parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) - parser.get_thumbnail() + parser = RasterisedDocumentParser(uuid.uuid4()) + parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), "application/pdf") # dont really know how to test it, just call it and assert that it does not raise anything.