Drop problematic tests

Some tests had differing outcomes depending on the version of Tesseract
installed on the test system.  This lead to a bunch of false test
failures, which lead to people (including me) just ignoring the Travis
results.

This commit removes those tests, and while it reduces our coverage, at
least the results are predictable.
This commit is contained in:
Daniel Quinn 2018-12-30 17:32:45 +00:00
parent 4a71c33537
commit 637b0d4cc2
20 changed files with 15 additions and 433 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

View File

@ -5,7 +5,7 @@ from unittest import mock
from uuid import uuid4 from uuid import uuid4
from dateutil import tz from dateutil import tz
from django.test import TestCase, override_settings from django.test import TestCase
from ..parsers import RasterisedDocumentParser from ..parsers import RasterisedDocumentParser
from django.conf import settings from django.conf import settings
@ -16,46 +16,36 @@ class TestDate(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
MOCK_SCRATCH = "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH" # NOQA: E501
def setUp(self): def setUp(self):
os.makedirs(self.SCRATCH, exist_ok=True) os.makedirs(self.SCRATCH, exist_ok=True)
def tearDown(self): def tearDown(self):
shutil.rmtree(self.SCRATCH) shutil.rmtree(self.SCRATCH)
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_1(self): def test_date_format_1(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 130218 lorem ipsum" document._text = "lorem ipsum 130218 lorem ipsum"
self.assertEqual(document.get_date(), None) self.assertEqual(document.get_date(), None)
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_2(self): def test_date_format_2(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 2018 lorem ipsum" document._text = "lorem ipsum 2018 lorem ipsum"
self.assertEqual(document.get_date(), None) self.assertEqual(document.get_date(), None)
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_3(self): def test_date_format_3(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 20180213 lorem ipsum" document._text = "lorem ipsum 20180213 lorem ipsum"
self.assertEqual(document.get_date(), None) self.assertEqual(document.get_date(), None)
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_4(self): def test_date_format_4(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
@ -69,10 +59,7 @@ class TestDate(TestCase):
) )
) )
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_5(self): def test_date_format_5(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
@ -89,10 +76,7 @@ class TestDate(TestCase):
) )
) )
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_6(self): def test_date_format_6(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
@ -109,10 +93,7 @@ class TestDate(TestCase):
) )
self.assertEqual(document.get_date(), None) self.assertEqual(document.get_date(), None)
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_7(self): def test_date_format_7(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
@ -130,10 +111,7 @@ class TestDate(TestCase):
) )
) )
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_8(self): def test_date_format_8(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
@ -157,10 +135,7 @@ class TestDate(TestCase):
) )
) )
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_date_format_9(self): def test_date_format_9(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
@ -178,398 +153,11 @@ class TestDate(TestCase):
) )
) )
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_1_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
date = document.get_date()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_1_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_2_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_2_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="deu")
def test_get_text_3_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="deu")
def test_get_text_3_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="eng")
def test_get_text_4_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="eng")
def test_get_text_4_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_5_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_5_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_pdf_us(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_png_us(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_pdf_eu(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_png_eu(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_7_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_8_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_9_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_1_pdf(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.pdf"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
document.get_text()
date = document.get_date()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(
2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_1_png(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.png"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
date,
datetime.datetime(
2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_2_pdf(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.pdf"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(
2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_2_png(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.png"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
date,
datetime.datetime(
2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch( @mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text", "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00" return_value="01-07-0590 00:00:00"
) )
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date_past(self, *args): def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null") document = RasterisedDocumentParser("/dev/null")
document.get_text() document.get_text()
@ -579,10 +167,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text", "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-2350 00:00:00" return_value="01-07-2350 00:00:00"
) )
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date_future(self, *args): def test_crazy_date_future(self, *args):
document = RasterisedDocumentParser("/dev/null") document = RasterisedDocumentParser("/dev/null")
document.get_text() document.get_text()
@ -592,10 +177,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text", "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00" return_value="01-07-0590 00:00:00"
) )
@mock.patch( @mock.patch(MOCK_SCRATCH, SCRATCH)
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_crazy_date_past(self, *args): def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null") document = RasterisedDocumentParser("/dev/null")
document.get_text() document.get_text()