Conform everything to the coding standards

https://paperless.readthedocs.io/en/latest/contributing.html#additional-style-guides
This commit is contained in:
Daniel Quinn 2018-12-01 17:09:12 +00:00
parent 650db75c2b
commit d544f269e0
4 changed files with 168 additions and 97 deletions

View File

@ -92,11 +92,11 @@ text and matching algorithm. From the help info there:
uses a regex to match the PDF. If you don't know what a regex is, you uses a regex to match the PDF. If you don't know what a regex is, you
probably don't want this option. probably don't want this option.
When using the "any" or "all" matching algorithms, you can search for terms that When using the "any" or "all" matching algorithms, you can search for terms
consist of multiple words by enclosing them in double quotes. For example, defining that consist of multiple words by enclosing them in double quotes. For example,
a match text of ``"Bank of America" BofA`` using the "any" algorithm will match defining a match text of ``"Bank of America" BofA`` using the "any" algorithm
documents that contain either "Bank of America" or "BofA", but will not match will match documents that contain either "Bank of America" or "BofA", but will
documents containing "Bank of South America". not match documents containing "Bank of South America".
Then just save your tag/correspondent and run another document through the Then just save your tag/correspondent and run another document through the
consumer. Once complete, you should see the newly-created document, consumer. Once complete, you should see the newly-created document,

View File

@ -196,8 +196,9 @@ PAPERLESS_DEBUG="false"
#PAPERLESS_CONSUMER_LOOP_TIME=10 #PAPERLESS_CONSUMER_LOOP_TIME=10
# By default Paperless stops consuming a document if no language can be detected. # By default Paperless stops consuming a document if no language can be
# Set to true to consume documents even if the language detection fails. # detected. Set to true to consume documents even if the language detection
# fails.
#PAPERLESS_FORGIVING_OCR="false" #PAPERLESS_FORGIVING_OCR="false"

View File

@ -21,9 +21,9 @@ from django.utils import timezone
# - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
DATE_REGEX = re.compile( DATE_REGEX = re.compile(
r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' + r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))' r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
) )
@ -80,15 +80,20 @@ class DocumentParser:
Returns the date of the document. Returns the date of the document.
""" """
def __parser__(ds, date_order): def __parser(ds, date_order):
""" """
Call dateparser.parse with a particular date ordering Call dateparser.parse with a particular date ordering
""" """
return dateparser.parse(ds, return dateparser.parse(
settings={"DATE_ORDER": date_order, ds,
"PREFER_DAY_OF_MONTH": "first", settings={
"RETURN_AS_TIMEZONE_AWARE": "DATE_ORDER": date_order,
True}) "PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE":
True
}
)
date = None date = None
date_string = None date_string = None
@ -102,16 +107,18 @@ class DocumentParser:
date_string = m.group(0) date_string = m.group(0)
try: try:
date = __parser__(date_string, self.FILENAME_DATE_ORDER) date = __parser(date_string, self.FILENAME_DATE_ORDER)
except TypeError: except TypeError:
# Skip all matches that do not parse to a proper date # Skip all matches that do not parse to a proper date
continue continue
if date is not None and next_year > date.year > 1900: if date is not None and next_year > date.year > 1900:
self.log("info", self.log(
"Detected document date {} based on string {} " "info",
"from document title" "Detected document date {} based on string {} "
"".format(date.isoformat(), date_string)) "from document title"
"".format(date.isoformat(), date_string)
)
return date return date
try: try:
@ -126,7 +133,7 @@ class DocumentParser:
date_string = m.group(0) date_string = m.group(0)
try: try:
date = __parser__(date_string, self.DATE_ORDER) date = __parser(date_string, self.DATE_ORDER)
except TypeError: except TypeError:
# Skip all matches that do not parse to a proper date # Skip all matches that do not parse to a proper date
continue continue

View File

@ -63,8 +63,10 @@ class TestDate(TestCase):
date = document.get_date() date = document.get_date()
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2018, 2, 13, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -76,12 +78,15 @@ class TestDate(TestCase):
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document._text = ( document._text = (
"lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
"ipsum") "ipsum"
)
date = document.get_date() date = document.get_date()
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2018, 2, 13, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 2, 13, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -119,8 +124,10 @@ class TestDate(TestCase):
date = document.get_date() date = document.get_date()
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2019, 3, 1, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2019, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -130,20 +137,25 @@ class TestDate(TestCase):
def test_date_format_8(self): def test_date_format_8(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document._text = ("lorem ipsum\n" document._text = (
"Wohnort\n" "lorem ipsum\n"
"3100\n" "Wohnort\n"
"IBAN\n" "3100\n"
"AT87 4534\n" "IBAN\n"
"1234\n" "AT87 4534\n"
"1234 5678\n" "1234\n"
"BIC\n" "1234 5678\n"
"lorem ipsum\n" "BIC\n"
"März 2020") "lorem ipsum\n"
self.assertEqual(document.get_date(), "März 2020"
datetime.datetime(2020, 3, 1, 0, 0, )
tzinfo=tz.gettz( self.assertEqual(
settings.TIME_ZONE))) document.get_date(),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch( @mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@ -152,14 +164,19 @@ class TestDate(TestCase):
def test_date_format_9(self): def test_date_format_9(self):
input_file = os.path.join(self.SAMPLE_FILES, "") input_file = os.path.join(self.SAMPLE_FILES, "")
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document._text = ("lorem ipsum\n" document._text = (
"27. Nullmonth 2020\n" "lorem ipsum\n"
"März 2020\n" "27. Nullmonth 2020\n"
"lorem ipsum") "März 2020\n"
self.assertEqual(document.get_date(), "lorem ipsum"
datetime.datetime(2020, 3, 1, 0, 0, )
tzinfo=tz.gettz( self.assertEqual(
settings.TIME_ZONE))) document.get_date(),
datetime.datetime(
2020, 3, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
)
@mock.patch( @mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@ -174,8 +191,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2018, 4, 1, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -190,8 +209,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -206,8 +227,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -222,8 +245,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2013, 2, 1, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2013, 2, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -239,8 +264,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -256,8 +283,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -273,8 +302,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -290,8 +321,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 10, 5, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 10, 5, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -306,8 +339,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -322,8 +357,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -338,8 +375,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -354,8 +393,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 12, 17, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 12, 17, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -394,8 +435,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2018, 4, 1, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 4, 1, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -410,8 +453,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -426,8 +471,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
document.get_date(), document.get_date(),
datetime.datetime(2017, 12, 31, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2017, 12, 31, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -435,8 +482,10 @@ class TestDate(TestCase):
SCRATCH SCRATCH
) )
def test_filename_date_1_pdf(self): def test_filename_date_1_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, input_file = os.path.join(
"tests_date_in_filename_2018-03-20_1.pdf") self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.pdf"
)
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD' document.FILENAME_DATE_ORDER = 'YMD'
document.get_text() document.get_text()
@ -444,8 +493,10 @@ class TestDate(TestCase):
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2018, 3, 20, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -453,16 +504,20 @@ class TestDate(TestCase):
SCRATCH SCRATCH
) )
def test_filename_date_1_png(self): def test_filename_date_1_png(self):
input_file = os.path.join(self.SAMPLE_FILES, input_file = os.path.join(
"tests_date_in_filename_2018-03-20_1.png") self.SAMPLE_FILES,
"tests_date_in_filename_2018-03-20_1.png"
)
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD' document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date() date = document.get_date()
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2018, 3, 20, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2018, 3, 20, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -470,16 +525,20 @@ class TestDate(TestCase):
SCRATCH SCRATCH
) )
def test_filename_date_2_pdf(self): def test_filename_date_2_pdf(self):
input_file = os.path.join(self.SAMPLE_FILES, input_file = os.path.join(
"2013-12-11_tests_date_in_filename_2.pdf") self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.pdf"
)
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD' document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date() date = document.get_date()
self.assertEqual(document._is_ocred(), True) self.assertEqual(document._is_ocred(), True)
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2013, 12, 11, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(
@ -487,16 +546,20 @@ class TestDate(TestCase):
SCRATCH SCRATCH
) )
def test_filename_date_2_png(self): def test_filename_date_2_png(self):
input_file = os.path.join(self.SAMPLE_FILES, input_file = os.path.join(
"2013-12-11_tests_date_in_filename_2.png") self.SAMPLE_FILES,
"2013-12-11_tests_date_in_filename_2.png"
)
document = RasterisedDocumentParser(input_file) document = RasterisedDocumentParser(input_file)
document.FILENAME_DATE_ORDER = 'YMD' document.FILENAME_DATE_ORDER = 'YMD'
date = document.get_date() date = document.get_date()
self.assertEqual(document._is_ocred(), False) self.assertEqual(document._is_ocred(), False)
self.assertEqual( self.assertEqual(
date, date,
datetime.datetime(2013, 12, 11, 0, 0, datetime.datetime(
tzinfo=tz.gettz(settings.TIME_ZONE)) 2013, 12, 11, 0, 0,
tzinfo=tz.gettz(settings.TIME_ZONE)
)
) )
@mock.patch( @mock.patch(