Conform everything to the coding standards

https://paperless.readthedocs.io/en/latest/contributing.html#additional-style-guides
This commit is contained in:
Daniel Quinn 2018-12-01 17:09:12 +00:00
parent 650db75c2b
commit d544f269e0
4 changed files with 168 additions and 97 deletions

View File

@ -92,11 +92,11 @@ text and matching algorithm. From the help info there:
uses a regex to match the PDF. If you don't know what a regex is, you
probably don't want this option.
When using the "any" or "all" matching algorithms, you can search for terms that
consist of multiple words by enclosing them in double quotes. For example, defining
a match text of ``"Bank of America" BofA`` using the "any" algorithm, will match
documents that contain either "Bank of America" or "BofA", but will not match
documents containing "Bank of South America".
When using the "any" or "all" matching algorithms, you can search for terms
that consist of multiple words by enclosing them in double quotes. For example,
defining a match text of ``"Bank of America" BofA`` using the "any" algorithm
will match documents that contain either "Bank of America" or "BofA", but will
not match documents containing "Bank of South America".
Then just save your tag/correspondent and run another document through the
consumer. Once complete, you should see the newly-created document,

View File

@ -196,8 +196,9 @@ PAPERLESS_DEBUG="false"
#PAPERLESS_CONSUMER_LOOP_TIME=10
# By default Paperless stops consuming a document if no language can be detected.
# Set to true to consume documents even if the language detection fails.
# By default Paperless stops consuming a document if no language can be
# detected. Set to true to consume documents even if the language detection
# fails.
#PAPERLESS_FORGIVING_OCR="false"

View File

@ -21,9 +21,9 @@ from django.utils import timezone
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
DATE_REGEX = re.compile(
# Each raw string below is one alternative of a single pattern; they are
# concatenated with "+" and separated by the trailing "|".
#
# Every alternative has the same wrapper:
#   prefix  (\b|(?!=([_-])))  - a word boundary, or a negative lookahead
#   suffix  (\b|(?=([_-])))   - a word boundary, or a lookahead for "_"/"-"
# NOTE(review): "(?!=...)" is a negative lookahead for a literal "=" followed
# by "_" or "-", not a lookbehind. It looks like "(?<=[_-])" may have been
# intended, mirroring the suffix - confirm before changing, since that would
# alter which strings match.
#
# NOTE(review): the first three alternatives appear twice in this view, once
# without and once with the "# NOQA: E501" marker.
#
# 1-2 digits, separator (. / -), 1-2 digits, separator, 4 or 2 digits
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +
# 4 or 2 digits, separator (. / -), 1-2 digits, separator, 1-2 digits
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +
# 1-2 digits, dots/spaces, 3-9 non-space characters, a space, 4 or 2 digits
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
# 3-9 letters, a space, 1-2 digits, ", ", 4 digits
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
# 3-9 letters, a space, 4 digits
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)
@ -80,15 +80,20 @@ class DocumentParser:
Returns the date of the document.
"""
def __parser__(ds, date_order):
def __parser(ds, date_order):
"""
Call dateparser.parse with a particular date ordering
"""
return dateparser.parse(ds,
settings={"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE":
True})
return dateparser.parse(
ds,
settings={
"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE":
True
}
)
date = None
date_string = None
@ -102,16 +107,18 @@ class DocumentParser:
date_string = m.group(0)
try:
date = __parser__(date_string, self.FILENAME_DATE_ORDER)
date = __parser(date_string, self.FILENAME_DATE_ORDER)
except TypeError:
# Skip all matches that do not parse to a proper date
continue
if date is not None and next_year > date.year > 1900:
self.log("info",
"Detected document date {} based on string {} "
"from document title"
"".format(date.isoformat(), date_string))
self.log(
"info",
"Detected document date {} based on string {} "
"from document title"
"".format(date.isoformat(), date_string)
)
return date
try:
@ -126,7 +133,7 @@ class DocumentParser:
date_string = m.group(0)
try:
date = __parser__(date_string, self.DATE_ORDER)
date = __parser(date_string, self.DATE_ORDER)
except TypeError:
# Skip all matches that do not parse to a proper date
continue

View File

@ -63,8 +63,10 @@ class TestDate(TestCase):
date = document.get_date()
self.assertEqual(
date,
datetime.datetime(2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -76,12 +78,15 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file)
document._text = (
"lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
"ipsum")
"ipsum"
)
date = document.get_date()
self.assertEqual(
date,
datetime.datetime(2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -119,8 +124,10 @@ class TestDate(TestCase):
date = document.get_date()
self.assertEqual(
date,
datetime.datetime(2019, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2019, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -130,20 +137,25 @@ class TestDate(TestCase):
def test_date_format_8(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = ("lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum\n"
"März 2020")
self.assertEqual(document.get_date(),
datetime.datetime(2020, 3, 1, 0, 0,
tzinfo=tz.gettz(
settings.TIME_ZONE)))
document._text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum\n"
"März 2020"
)
self.assertEqual(
document.get_date(),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@ -152,14 +164,19 @@ class TestDate(TestCase):
def test_date_format_9(self):
input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file)
document._text = ("lorem ipsum\n"
"27. Nullmonth 2020\n"
"März 2020\n"
"lorem ipsum")
self.assertEqual(document.get_date(),
datetime.datetime(2020, 3, 1, 0, 0,
tzinfo=tz.gettz(
settings.TIME_ZONE)))
document._text = (
"lorem ipsum\n"
"27. Nullmonth 2020\n"
"März 2020\n"
"lorem ipsum"
)
self.assertEqual(
document.get_date(),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@ -174,8 +191,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -190,8 +209,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -206,8 +227,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -222,8 +245,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -239,8 +264,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -256,8 +283,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -273,8 +302,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -290,8 +321,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -306,8 +339,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -322,8 +357,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -338,8 +375,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -354,8 +393,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -394,8 +435,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -410,8 +453,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -426,8 +471,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -435,8 +482,10 @@ class TestDate(TestCase):
SCRATCH
)
def test_filename_date_1_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.pdf")
input_file = os.path.join(
self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.pdf"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
document.get_text()
@ -444,8 +493,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -453,16 +504,20 @@ class TestDate(TestCase):
SCRATCH
)
def test_filename_date_1_png(self):
input_file = os.path.join(self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.png")
input_file = os.path.join(
self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.png"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
date,
datetime.datetime(2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -470,16 +525,20 @@ class TestDate(TestCase):
SCRATCH
)
def test_filename_date_2_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.pdf")
input_file = os.path.join(
self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.pdf"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), True)
self.assertEqual(
date,
datetime.datetime(2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(
@ -487,16 +546,20 @@ class TestDate(TestCase):
SCRATCH
)
def test_filename_date_2_png(self):
input_file = os.path.join(self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.png")
input_file = os.path.join(
self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.png"
)
document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date()
self.assertEqual(document._is_ocred(), False)
self.assertEqual(
date,
datetime.datetime(2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE))
datetime.datetime(
2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch(