Drop problematic tests

Some tests had differing outcomes depending on the version of Tesseract
installed on the test system.  This lead to a bunch of false test
failures, which lead to people (including me) just ignoring the Travis
results.

This commit removes those tests, and while it reduces our coverage, at
least the results are predictable.
This commit is contained in:
Daniel Quinn 2018-12-30 17:32:45 +00:00
parent 4a71c33537
commit 637b0d4cc2
20 changed files with 15 additions and 433 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 136 KiB

View File

@ -5,7 +5,7 @@ from unittest import mock
from uuid import uuid4
from dateutil import tz
from django.test import TestCase, override_settings
from django.test import TestCase
from ..parsers import RasterisedDocumentParser
from django.conf import settings
@ -16,46 +16,36 @@ class TestDate(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
MOCK_SCRATCH = "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH" # NOQA: E501
def setUp(self):
os.makedirs(self.SCRATCH, exist_ok=True)
def tearDown(self):
shutil.rmtree(self.SCRATCH)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_1(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 130218 lorem ipsum"
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_2(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 2018 lorem ipsum"
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_3(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = "lorem ipsum 20180213 lorem ipsum"
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_4(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
@ -69,10 +59,7 @@ class TestDate(TestCase):
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_5(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
@ -89,10 +76,7 @@ class TestDate(TestCase):
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_6(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
@ -109,10 +93,7 @@ class TestDate(TestCase):
)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_7(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
@ -130,10 +111,7 @@ class TestDate(TestCase):
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_8(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
@ -157,10 +135,7 @@ class TestDate(TestCase):
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_9(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
@ -178,398 +153,11 @@ class TestDate(TestCase):
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_1_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
date = document.get_date()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_1_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_2_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_2_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="deu")
def test_get_text_3_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="deu")
def test_get_text_3_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="eng")
def test_get_text_4_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@override_settings(OCR_LANGUAGE="eng")
def test_get_text_4_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_5_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_5_png(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_pdf_us(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
document = RasterisedDocumentParser(input_file)
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_png_us(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
document = RasterisedDocumentParser(input_file)
document.get_text()
document.DATE_ORDER = "MDY"
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_pdf_eu(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_6_png_eu(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(document.get_date(), None)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_7_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_8_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_get_text_9_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
document = RasterisedDocumentParser(input_file)
document.DATE_ORDER = 'DMY'
document.get_text()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(
2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_1_pdf(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.pdf"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
document.get_text()
date = document.get_date()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(
2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_1_png(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.png"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
date,
datetime.datetime(
2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_2_pdf(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.pdf"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(
2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
def test_filename_date_2_png(self):
input_file = os.path.join(
self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.png"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
date,
datetime.datetime(
2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00"
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
@ -579,10 +167,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-2350 00:00:00"
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_crazy_date_future(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
@ -592,10 +177,7 @@ class TestDate(TestCase):
"paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
return_value="01-07-0590 00:00:00"
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SCRATCH
)
@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()