mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-11 10:00:48 -05:00

It turns out that the Lorem ipsum text in the sample files was confusing the language guesser, causing it to think the file was in Catalan and not English or German.
430 lines
14 KiB
Python
430 lines
14 KiB
Python
import datetime
|
|
import os
|
|
import shutil
|
|
from unittest import mock
|
|
from uuid import uuid4
|
|
|
|
from dateutil import tz
|
|
from django.test import TestCase, override_settings
|
|
|
|
from ..parsers import RasterisedDocumentParser
|
|
|
|
|
|
class TestDate(TestCase):
    """Tests for ``RasterisedDocumentParser.get_date()``.

    The ``test_date_format_*`` tests assign ``_text`` directly on the parser;
    the ``test_get_text_*`` tests run ``get_text()`` on files from the
    ``samples`` directory first.  Every test patches the parser's ``SCRATCH``
    attribute to a per-run directory which ``setUp`` creates and ``tearDown``
    removes again.
    """

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])

    # Dotted paths handed to mock.patch, spelled once so that every
    # decorator below fits on a single line.
    PARSER = "paperless_tesseract.parsers.RasterisedDocumentParser"
    PARSER_SCRATCH = PARSER + ".SCRATCH"
    PARSER_GET_TEXT = PARSER + ".get_text"

    def setUp(self):
        """Create the scratch directory (no error if it already exists)."""
        os.makedirs(self.SCRATCH, exist_ok=True)

    def tearDown(self):
        """Remove the scratch directory and everything inside it."""
        shutil.rmtree(self.SCRATCH)

    # ------------------------------------------------------------------
    # Dates taken from text assigned directly to the parser
    # ------------------------------------------------------------------

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_1(self):
        """A bare six-digit number (130218) yields no date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = "lorem ipsum 130218 lorem ipsum"
        self.assertIsNone(document.get_date())

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_2(self):
        """A bare four-digit year (2018) yields no date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = "lorem ipsum 2018 lorem ipsum"
        self.assertIsNone(document.get_date())

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_3(self):
        """A bare eight-digit number (20180213) yields no date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = "lorem ipsum 20180213 lorem ipsum"
        self.assertIsNone(document.get_date())

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_4(self):
        """A dotted date (13.02.2018) is read as 13 February 2018."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = "lorem ipsum 13.02.2018 lorem ipsum"
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_5(self):
        """The dotted date is picked even when bare numbers precede it."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = (
            "lorem ipsum 130218, 2018, 20180213 and 13.02.2018 lorem ipsum")
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_6(self):
        """Postcode and IBAN digit groups on separate lines yield no date."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum"
        )
        self.assertIsNone(document.get_date())

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_7(self):
        """A German month name plus year maps to the 1st of that month."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = (
            "lorem ipsum\n"
            "März 2019\n"
            "lorem ipsum"
        )
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_8(self):
        """A month/year after the IBAN digit groups is still found."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum\n"
            "März 2020"
        )
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_date_format_9(self):
        """An unknown month name is passed over for the next candidate."""
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = (
            "lorem ipsum\n"
            "27. Nullmonth 2020\n"
            "März 2020\n"
            "lorem ipsum"
        )
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2020, 3, 1, 0, 0, tzinfo=tz.tzutc())
        )

    # ------------------------------------------------------------------
    # Dates taken from the sample files via get_text()
    # ------------------------------------------------------------------

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_1_pdf(self):
        """Sample 1 (PDF): reported as OCRed, dated 2018-04-01."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_1_png(self):
        """Sample 1 (PNG): not reported as OCRed, dated 2018-04-01."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_2_pdf(self):
        """Sample 2 (PDF): reported as OCRed, dated 2013-02-01."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_2_png(self):
        """Sample 2 (PNG): not reported as OCRed, dated 2013-02-01."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    @override_settings(OCR_LANGUAGE="deu")
    def test_get_text_3_pdf(self):
        """Sample 3 (PDF, OCR_LANGUAGE="deu"): dated 2018-10-05."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    @override_settings(OCR_LANGUAGE="deu")
    def test_get_text_3_png(self):
        """Sample 3 (PNG, OCR_LANGUAGE="deu"): dated 2018-10-05."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    @override_settings(OCR_LANGUAGE="eng")
    def test_get_text_4_pdf(self):
        """Sample 4 (PDF, OCR_LANGUAGE="eng"): dated 2018-10-05."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    @override_settings(OCR_LANGUAGE="eng")
    def test_get_text_4_png(self):
        """Sample 4 (PNG, OCR_LANGUAGE="eng"): dated 2018-10-05."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_5_pdf(self):
        """Sample 5 (PDF): reported as OCRed, dated 2018-12-17."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_5_png(self):
        """Sample 5 (PNG): not reported as OCRed, dated 2018-12-17."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_6_pdf_us(self):
        """Sample 6 (PDF) with DATE_ORDER "MDY": dated 2018-12-17."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        # Switch to month-first order before the date is extracted.
        document.DATE_ORDER = "MDY"
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_6_png_us(self):
        """Sample 6 (PNG) with DATE_ORDER "MDY": dated 2018-12-17."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        # Switch to month-first order before the date is extracted.
        document.DATE_ORDER = "MDY"
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_6_pdf_eu(self):
        """Sample 6 (PDF) with DATE_ORDER left untouched: no date found."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertIsNone(document.get_date())

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_6_png_eu(self):
        """Sample 6 (PNG) with DATE_ORDER left untouched: no date found."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertIsNone(document.get_date())

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_7_pdf(self):
        """Sample 7 (PDF): reported as OCRed, dated 2018-04-01."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_8_pdf(self):
        """Sample 8 (PDF): reported as OCRed, dated 2017-12-31."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
        )

    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_get_text_9_pdf(self):
        """Sample 9 (PDF): reported as OCRed, dated 2017-12-31."""
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
        document = RasterisedDocumentParser(input_file)
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
            datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
        )

    # ------------------------------------------------------------------
    # Implausible dates (get_text() is mocked, no file is read)
    # ------------------------------------------------------------------

    @mock.patch(PARSER_GET_TEXT, return_value="01-07-0590 00:00:00")
    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_crazy_date_past(self, *args):
        """A date in the year 0590 is rejected.

        ``*args`` absorbs the mock object injected by the ``get_text`` patch.
        """
        document = RasterisedDocumentParser("/dev/null")
        document.get_text()
        self.assertIsNone(document.get_date())

    @mock.patch(PARSER_GET_TEXT, return_value="01-07-2350 00:00:00")
    @mock.patch(PARSER_SCRATCH, SCRATCH)
    def test_crazy_date_future(self, *args):
        """A date in the year 2350 is rejected.

        ``*args`` absorbs the mock object injected by the ``get_text`` patch.
        """
        document = RasterisedDocumentParser("/dev/null")
        document.get_text()
        self.assertIsNone(document.get_date())
|