Merge remote-tracking branch 'upstream/master'

2025-12-14 01:21:14 -06:00 · 2018-12-11 12:06:15 +01:00
parent 0f2a79ea61 fa783e4f4a
commit 8f0d53c54a
46 changed files with 1171 additions and 695 deletions
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -64,12 +64,12 @@ class FinancialYearFilter(admin.SimpleListFilter):

            # To keep it simple we use the same string for both
            # query parameter and the display.
-            return (query, query)
+            return query, query

        else:
            query = "{0}-{0}".format(date.year)
            display = "{}".format(date.year)
-            return (query, display)
+            return query, display

    def lookups(self, request, model_admin):
        if not settings.FY_START or not settings.FY_END:
@@ -91,25 +91,24 @@ class FinancialYearFilter(admin.SimpleListFilter):


 class RecentCorrespondentFilter(admin.RelatedFieldListFilter):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.title = "correspondent (recent)"
+    """
+    If PAPERLESS_RECENT_CORRESPONDENT_YEARS is set, we limit the available
+    correspondents to documents sent our way over the past ``n`` years.
+    """

    def field_choices(self, field, request, model_admin):
+        """
+        Return the ``(id, name)`` choices offered by this filter.
+
+        When ``settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS`` is a positive
+        number, the filter title is set to "Correspondent (Recent)" and only
+        correspondents with a document created within the last
+        ``365 * years`` days are returned; otherwise every correspondent is.
+        """

        years = settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS
-        days = 365 * years
+        correspondents = Correspondent.objects.all()

-        lookups = []
        if years and years > 0:
-            correspondents = Correspondent.objects.filter(
+            self.title = "Correspondent (Recent)"
+            days = 365 * years
+            correspondents = correspondents.filter(
+                # NOTE(review): datetime.now() is naive -- confirm this is
+                # what is wanted if ``created`` stores aware datetimes.
                documents__created__gte=datetime.now() - timedelta(days=days)
            ).distinct()
-            for c in correspondents:
-                lookups.append((c.id, c.name))

-        return lookups
+        return [(c.id, c.name) for c in correspondents]


 class CommonAdmin(admin.ModelAdmin):
@@ -124,7 +123,9 @@ class CorrespondentAdmin(CommonAdmin):
        "document_count",
        "last_correspondence"
    )
-    list_editable = ("automatic_classification")
+    list_editable = ("automatic_classification",)
+
+    readonly_fields = ("slug",)

    def get_queryset(self, request):
        qs = super(CorrespondentAdmin, self).get_queryset(request)
@@ -149,6 +150,11 @@ class TagAdmin(CommonAdmin):
    list_filter = ("colour",)
    list_editable = ("colour", "automatic_classification")

+    readonly_fields = ("slug",)
+
+    class Media:
+        # Extra JavaScript asset declared for this admin class.
+        js = ("js/colours.js",)
+
    def get_queryset(self, request):
        qs = super(TagAdmin, self).get_queryset(request)
        qs = qs.annotate(document_count=models.Count("documents"))
@@ -164,6 +170,8 @@ class DocumentTypeAdmin(CommonAdmin):
    list_display = ("name", "automatic_classification", "document_count")
    list_editable = ("automatic_classification",)

+    readonly_fields = ("slug",)
+
    def get_queryset(self, request):
        qs = super(DocumentTypeAdmin, self).get_queryset(request)
        qs = qs.annotate(document_count=models.Count("documents"))
@@ -182,14 +190,13 @@ class DocumentAdmin(CommonAdmin):
        }

    search_fields = ("correspondent__name", "title", "content", "tags__name")
-    readonly_fields = ("added",)
+    readonly_fields = ("added", "file_type", "storage_type",)
    list_display = ("title", "created", "added", "thumbnail", "correspondent",
                    "tags_", "archive_serial_number", "document_type")
    list_filter = (
        "document_type",
        "tags",
        ("correspondent", RecentCorrespondentFilter),
-        "correspondent",
        FinancialYearFilter
    )

--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,3 +1,4 @@
+from django.db import transaction
 import datetime
 import hashlib
 import logging
@@ -111,8 +112,11 @@ class Consumer:
                if not self.try_consume_file(file):
                    self._ignore.append((file, mtime))

+    @transaction.atomic
    def try_consume_file(self, file):
-        "Return True if file was consumed"
+        """
+        Return True if file was consumed
+        """

        if not re.match(FileInfo.REGEXES["title"], file):
            return False
@@ -145,7 +149,7 @@ class Consumer:
        parsed_document = parser_class(doc)

        try:
-            thumbnail = parsed_document.get_thumbnail()
+            thumbnail = parsed_document.get_optimised_thumbnail()
            date = parsed_document.get_date()
            document = self._store(
                parsed_document.get_text(),
--- a/src/documents/filters.py
+++ b/src/documents/filters.py
@@ -1,4 +1,4 @@
-from django_filters.rest_framework import CharFilter, FilterSet, BooleanFilter, ModelChoiceFilter
+from django_filters.rest_framework import BooleanFilter, FilterSet

 from .models import Correspondent, Document, Tag, DocumentType

--- a/src/documents/migrations/0022_auto_20181007_1420.py
+++ b/src/documents/migrations/0022_auto_20181007_1420.py
@@ -0,0 +1,52 @@
+# Generated by Django 2.0.8 on 2018-10-07 14:20
+
+from django.db import migrations, models
+from django.utils.text import slugify
+
+
+def re_slug_all_the_things(apps, schema_editor):
+    """
+    Normalise the stored slug of every Tag and Correspondent.
+
+    Each row's current ``slug`` is run through ``slugify`` and written back,
+    so all slugs are well-formed before the field becomes non-editable.
+    ``schema_editor`` is not used.
+    """
+
+    # Resolve both historical models up front, in the same order as before.
+    models_to_fix = [
+        apps.get_model("documents", label)
+        for label in ("Tag", "Correspondent")
+    ]
+
+    for model in models_to_fix:
+        for row in model.objects.all():
+            # ``update()`` issues a direct UPDATE for just this row.
+            model.objects.filter(pk=row.pk).update(slug=slugify(row.slug))
+
+
+class Migration(migrations.Migration):
+    """
+    Order tags by name, make the ``slug`` fields on Correspondent and Tag
+    non-editable, make ``Document.file_type`` non-editable, and finally
+    re-slugify the existing slug values.
+    """
+
+    dependencies = [
+        ('documents', '0021_document_storage_type'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='tag',
+            options={'ordering': ('name',)},
+        ),
+        migrations.AlterField(
+            model_name='correspondent',
+            name='slug',
+            field=models.SlugField(blank=True, editable=False),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='file_type',
+            field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')], editable=False, max_length=4),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='slug',
+            field=models.SlugField(blank=True, editable=False),
+        ),
+        # Data step: forwards rewrites existing slugs; reversing is a no-op.
+        migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
+    ]
--- a/src/documents/migrations/1001_workflow_improvements.py
+++ b/src/documents/migrations/1001_workflow_improvements.py
@@ -6,7 +6,7 @@ from django.db import migrations, models
 class Migration(migrations.Migration):

    dependencies = [
-        ('documents', '0021_document_storage_type'),
+        ('documents', '0022_auto_20181007_1420'),
    ]

    operations = [
--- a/src/documents/migrations/1002_auto_20180823_1155.py
+++ b/src/documents/migrations/1002_auto_20180823_1155.py
@@ -7,7 +7,7 @@ import django.db.models.deletion
 class Migration(migrations.Migration):

    dependencies = [
-        ('documents', '0022_workflow_improvements'),
+        ('documents', '1001_workflow_improvements'),
    ]

    operations = [
--- a/src/documents/migrations/1003_auto_20180904_1425.py
+++ b/src/documents/migrations/1003_auto_20180904_1425.py
@@ -18,7 +18,7 @@ def reverse_automatic_classification(apps, schema_editor):
 class Migration(migrations.Migration):

    dependencies = [
-        ('documents', '0023_auto_20180823_1155'),
+        ('documents', '1002_auto_20180823_1155'),
    ]

    operations = [
--- a/src/documents/migrations/1004_documenttype_slug.py
+++ b/src/documents/migrations/1004_documenttype_slug.py
@@ -0,0 +1,36 @@
+# Generated by Django 2.0.8 on 2018-10-07 14:20
+
+from django.db import migrations, models
+from django.utils.text import slugify
+
+
+def re_slug_all_the_things(apps, schema_editor):
+    """
+    Normalise the stored slug of every DocumentType.
+
+    Each row's current ``slug`` is run through ``slugify`` and written back,
+    so all slugs are well-formed before the field becomes non-editable.
+    ``schema_editor`` is not used.
+    """
+
+    document_type = apps.get_model("documents", "DocumentType")
+
+    for row in document_type.objects.all():
+        # ``update()`` issues a direct UPDATE for just this row.
+        document_type.objects.filter(pk=row.pk).update(slug=slugify(row.slug))
+
+
+class Migration(migrations.Migration):
+    """
+    Make ``DocumentType.slug`` non-editable and re-slugify existing values.
+    """
+
+    dependencies = [
+        ('documents', '1003_auto_20180904_1425'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='documenttype',
+            name='slug',
+            field=models.SlugField(blank=True, editable=False),
+        ),
+        # Data step: forwards rewrites existing slugs; reversing is a no-op.
+        migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -11,6 +11,7 @@ from django.conf import settings
 from django.db import models
 from django.template.defaultfilters import slugify
 from django.utils import timezone
+from django.utils.text import slugify
 from fuzzywuzzy import fuzz

 from .managers import LogManager
@@ -24,7 +25,7 @@ except ImportError:
 class MatchingModel(models.Model):

    name = models.CharField(max_length=128, unique=True)
-    slug = models.SlugField(blank=True)
+    slug = models.SlugField(blank=True, editable=False)

    automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')

@@ -37,8 +38,7 @@ class MatchingModel(models.Model):

    def save(self, *args, **kwargs):

-        if not self.slug:
-            self.slug = slugify(self.name)
+        self.slug = slugify(self.name)

        models.Model.save(self, *args, **kwargs)

@@ -369,7 +369,7 @@ class FileInfo:
        r = []
        for t in tags.split(","):
            r.append(Tag.objects.get_or_create(
-                slug=t.lower(),
+                slug=slugify(t),
                defaults={"name": t}
            )[0])
        return tuple(r)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -1,23 +1,31 @@
 import logging
-import shutil
-import tempfile
+import os
 import re
+import shutil
+import subprocess
+import tempfile

+import dateparser
 from django.conf import settings
+from django.utils import timezone

 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
 # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
 DATE_REGEX = re.compile(
-    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
-    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{4})\b'
+    # NOTE(review): ``(?!=([_-]))`` is a negative lookahead for a literal "="
+    # followed by "_" or "-".  A lookbehind such as ``(?<=[_-])`` may have
+    # been intended -- confirm before changing, since it alters what matches.
+    r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
+    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
 )


@@ -32,6 +40,9 @@ class DocumentParser:
    """

    SCRATCH = settings.SCRATCH_DIR
+    DATE_ORDER = settings.DATE_ORDER
+    FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
+    OPTIPNG = settings.OPTIPNG_BINARY

    def __init__(self, path):
        self.document_path = path
@@ -45,6 +56,19 @@ class DocumentParser:
        """
        raise NotImplementedError()

+    def optimise_thumbnail(self, in_path):
+        """
+        Run the ``self.OPTIPNG`` binary over the image at ``in_path``.
+
+        The result is written to ``optipng.png`` inside ``self.tempdir`` and
+        that path is returned.  Raises ParseError when the process exits with
+        a non-zero status.
+        """
+
+        optimised = os.path.join(self.tempdir, "optipng.png")
+        command = (self.OPTIPNG, "-o5", in_path, "-out", optimised)
+
+        exit_code = subprocess.Popen(command).wait()
+        if exit_code != 0:
+            raise ParseError("Optipng failed at {}".format(command))
+
+        return optimised
+
+    def get_optimised_thumbnail(self):
+        """
+        Build the thumbnail with ``get_thumbnail()``, pass it through
+        ``optimise_thumbnail()`` and return the resulting path.
+        """
+        thumbnail_path = self.get_thumbnail()
+        return self.optimise_thumbnail(thumbnail_path)
+
    def get_text(self):
        """
        Returns the text from the document and only the text.
@@ -55,7 +79,82 @@ class DocumentParser:
        """
        Returns the date of the document.
        """
-        raise NotImplementedError()
+
+        def __parser(ds, date_order):
+            """
+            Call dateparser.parse with a particular date ordering
+            """
+            return dateparser.parse(
+                ds,
+                settings={
+                    "DATE_ORDER": date_order,
+                    "PREFER_DAY_OF_MONTH": "first",
+                    "RETURN_AS_TIMEZONE_AWARE":
+                    True
+                }
+            )
+
+        date = None
+        date_string = None
+
+        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
+        title = os.path.basename(self.document_path)
+
+        # if filename date parsing is enabled, search there first:
+        if self.FILENAME_DATE_ORDER:
+            self.log("info", "Checking document title for date")
+            for m in re.finditer(DATE_REGEX, title):
+                date_string = m.group(0)
+
+                try:
+                    date = __parser(date_string, self.FILENAME_DATE_ORDER)
+                except TypeError:
+                    # Skip all matches that do not parse to a proper date
+                    continue
+
+                if date is not None and next_year > date.year > 1900:
+                    self.log(
+                        "info",
+                        "Detected document date {} based on string {} "
+                        "from document title"
+                        "".format(date.isoformat(), date_string)
+                    )
+                    return date
+
+        try:
+            # getting text after checking filename will save time if only
+            # looking at the filename instead of the whole text
+            text = self.get_text()
+        except ParseError:
+            return None
+
+        # Iterate through all regex matches in text and try to parse the date
+        for m in re.finditer(DATE_REGEX, text):
+            date_string = m.group(0)
+
+            try:
+                date = __parser(date_string, self.DATE_ORDER)
+            except TypeError:
+                # Skip all matches that do not parse to a proper date
+                continue
+
+            if date is not None and next_year > date.year > 1900:
+                break
+            else:
+                date = None
+
+        if date is not None:
+            self.log(
+                "info",
+                "Detected document date {} based on string {}".format(
+                    date.isoformat(),
+                    date_string
+                )
+            )
+        else:
+            self.log("info", "Unable to detect date for document")
+
+        return date

    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
--- a/src/documents/static/js/colours.js
+++ b/src/documents/static/js/colours.js
@@ -0,0 +1,66 @@
+// The following jQuery snippet will add a small square next to the selection
+// drop-down on the `Add tag` page that will update to show the selected tag
+// color as the drop-down value is changed.
+
+django.jQuery(document).ready(function(){
+
+  const $ = django.jQuery;
+
+  // Appearance shared by every colour square.  Applied inline so that no
+  // <style> element has to be appended to <head> for each drop-down.
+  const squareCss = {
+    'float': 'left',
+    'width': '20px',
+    'height': '20px',
+    'margin': '5px',
+    'border': '1px solid rgba(0, 0, 0, .2)'
+  };
+
+  // Text of the currently selected option of `select`, or '' if there is
+  // none.  Reading the selected option directly (instead of indexing the
+  // options by `value - 1`) avoids a TypeError when the value does not map
+  // onto an option position.
+  function selectedText(select) {
+    const option = select.options[select.selectedIndex];
+    return option ? option.text : '';
+  }
+
+  // Insert a square after `select` and keep its background equal to the
+  // text of the selected option.  `id` is optional.
+  function attachSquare(select, id) {
+    const $square = $('<div class="colour_square"></div>').css(squareCss);
+
+    if (id) {
+      $square.attr('id', id);
+    }
+
+    function refresh() {
+      $square.css({'background': selectedText(select)});
+    }
+
+    $(select).after($square);
+    refresh();
+    $(select).change(refresh);
+  }
+
+  if ($("#id_colour").length) {
+
+    // A single drop-down with the id `id_colour`.
+    attachSquare($("#id_colour")[0]);
+
+  } else if ($("select[id*='colour']").length) {
+
+    // Several colour drop-downs; each square keeps its indexed id.
+    $('select[id*="-colour"]').each(function (index, element) {
+      attachSquare(element, "colour_square_" + index);
+    });
+
+  }
+
+});
--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@@ -76,7 +76,12 @@ def binaries_check(app_configs, **kwargs):
    error = "Paperless can't find {}. Without it, consumption is impossible."
    hint = "Either it's not in your ${PATH} or it's not installed."

-    binaries = (settings.CONVERT_BINARY, settings.UNPAPER_BINARY, "tesseract")
+    binaries = (
+        settings.CONVERT_BINARY,
+        settings.OPTIPNG_BINARY,
+        settings.UNPAPER_BINARY,
+        "tesseract"
+    )

    check_messages = []
    for binary in binaries:
--- a/src/paperless/models.py
+++ b/src/paperless/models.py
@@ -1,15 +1,20 @@
+from django.contrib.auth.models import User as DjangoUser
+
+
 class User:
    """
-      This is a dummy django User used with our middleware to disable
-      login authentication if that is configured in paperless.conf
+    This is a dummy django User used with our middleware to disable
+    login authentication if that is configured in paperless.conf
    """
+
    is_superuser = True
    is_active = True
    is_staff = True
    is_authenticated = True

-    # Must be -1 to avoid colliding with real user ID's (which start at 1)
-    id = -1
+    @property
+    def id(self):
+        """
+        Primary key of the Django user with the lowest ``pk``.
+
+        NOTE(review): ``first()`` returns None on an empty user table, in
+        which case this raises AttributeError -- confirm a user always exists.
+        """
+        first_user = DjangoUser.objects.order_by("pk").first()
+        return first_user.pk

    @property
    def pk(self):
@@ -17,9 +22,9 @@ class User:


 """
-  NOTE: These are here as a hack instead of being in the User definition
-  above due to the way pycodestyle handles lamdbdas.
-  See https://github.com/PyCQA/pycodestyle/issues/379 for more.
+NOTE: These are here as a hack instead of being in the User definition
+NOTE: above due to the way pycodestyle handles lambdas.
+NOTE: See https://github.com/PyCQA/pycodestyle/issues/379 for more.
 """

 User.has_module_perms = lambda *_: True
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -152,6 +152,10 @@ if os.getenv("PAPERLESS_DBENGINE"):
    }
    if os.getenv("PAPERLESS_DBPASS"):
        DATABASES["default"]["PASSWORD"] = os.getenv("PAPERLESS_DBPASS")
+    if os.getenv("PAPERLESS_DBHOST"):
+        DATABASES["default"]["HOST"] = os.getenv("PAPERLESS_DBHOST")
+    if os.getenv("PAPERLESS_DBPORT"):
+        DATABASES["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT")


 # Password validation
@@ -199,6 +203,16 @@ STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/")
 MEDIA_URL = os.getenv("PAPERLESS_MEDIA_URL", "/media/")


+# Other
+
+# Disable Django's artificial limit on the number of form fields to submit at
+# once.  This is a protection against overloading the server, but since this is
+# a self-hosted sort of gig, the benefits of being able to mass-delete a tonne
+# of log entries outweigh the benefits of such a safeguard.
+
+DATA_UPLOAD_MAX_NUMBER_FIELDS = None
+
+
 # Document classification models location
 MODEL_FILE = os.getenv(
    "PAPERLESS_MODEL_FILE", os.path.join(BASE_DIR, "..", "models", "model.pickle"))
@@ -252,6 +266,9 @@ CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
 CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
 CONVERT_DENSITY = os.getenv("PAPERLESS_CONVERT_DENSITY")

+# OptiPNG
+OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
+
 # Unpaper
 UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")

@@ -298,6 +315,7 @@ FY_END = os.getenv("PAPERLESS_FINANCIAL_YEAR_END")

 # Specify the default date order (for autodetected dates)
 DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
+FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")

 # Specify for how many years a correspondent is considered recent. Recent
 # correspondents will be shown in a separate "Recent correspondents" filter as
--- a/src/paperless/version.py
+++ b/src/paperless/version.py
@@ -1 +1 @@
-__version__ = (2, 3, 0)
+__version__ = (2, 6, 0)
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -4,7 +4,6 @@ import re
 import subprocess
 from multiprocessing.pool import Pool

-import dateparser
 import langdetect
 import pyocr
 from django.conf import settings
@@ -14,7 +13,7 @@ from pyocr.libtesseract.tesseract_raw import \
 from pyocr.tesseract import TesseractError

 import pdftotext
-from documents.parsers import DocumentParser, ParseError, DATE_REGEX
+from documents.parsers import DocumentParser, ParseError

 from .languages import ISO639

@@ -33,7 +32,6 @@ class RasterisedDocumentParser(DocumentParser):
    DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
-    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

@@ -46,15 +44,18 @@ class RasterisedDocumentParser(DocumentParser):
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """

+        out_path = os.path.join(self.tempdir, "convert.png")
+
+        # Run convert to get a decent thumbnail
        run_convert(
            self.CONVERT,
            "-scale", "500x5000",
            "-alpha", "remove",
            "{}[0]".format(self.document_path),
-            os.path.join(self.tempdir, "convert.png")
+            out_path
        )

-        return os.path.join(self.tempdir, "convert.png")
+        return out_path

    def _is_ocred(self):

@@ -152,7 +153,10 @@ class RasterisedDocumentParser(DocumentParser):
                )
                raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
                return raw_text
-            raise OCRError("Language detection failed")
+            error_msg = ("Language detection failed. Set "
+                         "PAPERLESS_FORGIVING_OCR in config file to continue "
+                         "anyway.")
+            raise OCRError(error_msg)

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
@@ -202,40 +206,6 @@ class RasterisedDocumentParser(DocumentParser):
        text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
        return text

-    def get_date(self):
-        date = None
-        datestring = None
-
-        try:
-            text = self.get_text()
-        except ParseError as e:
-            return None
-
-        # Iterate through all regex matches and try to parse the date
-        for m in re.finditer(DATE_REGEX, text):
-            datestring = m.group(0)
-
-            try:
-                date = dateparser.parse(
-                           datestring,
-                           settings={'DATE_ORDER': self.DATE_ORDER,
-                                     'PREFER_DAY_OF_MONTH': 'first',
-                                     'RETURN_AS_TIMEZONE_AWARE': True})
-            except TypeError:
-                # Skip all matches that do not parse to a proper date
-                continue
-
-            if date is not None:
-                break
-
-        if date is not None:
-            self.log("info", "Detected document date " + date.isoformat() +
-                             " based on string " + datestring)
-        else:
-            self.log("info", "Unable to detect date for document")
-
-        return date
-

 def run_convert(*args):

@@ -251,7 +221,8 @@ def run_convert(*args):

 def run_unpaper(args):
    unpaper, pnm = args
-    command_args = unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm")
+    command_args = (unpaper, "--overwrite", pnm,
+                    pnm.replace(".pnm", ".unpaper.pnm"))
    if not subprocess.Popen(command_args).wait() == 0:
        raise ParseError("Unpaper failed at {}".format(command_args))

--- a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf
+++ b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.pdf
--- a/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png
+++ b/src/paperless_tesseract/tests/samples/2013-12-11_tests_date_in_filename_2.png
--- a/src/paperless_tesseract/tests/samples/tests_date_3.pdf
+++ b/src/paperless_tesseract/tests/samples/tests_date_3.pdf
--- a/src/paperless_tesseract/tests/samples/tests_date_3.png
+++ b/src/paperless_tesseract/tests/samples/tests_date_3.png
--- a/src/paperless_tesseract/tests/samples/tests_date_4.pdf
+++ b/src/paperless_tesseract/tests/samples/tests_date_4.pdf
--- a/src/paperless_tesseract/tests/samples/tests_date_4.png
+++ b/src/paperless_tesseract/tests/samples/tests_date_4.png
--- a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf
+++ b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.pdf
--- a/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png
+++ b/src/paperless_tesseract/tests/samples/tests_date_in_filename_2018-03-20_1.png
--- a/src/paperless_tesseract/tests/test_date.py
+++ b/src/paperless_tesseract/tests/test_date.py
@@ -5,9 +5,10 @@ from unittest import mock
 from uuid import uuid4

 from dateutil import tz
-from django.test import TestCase
+from django.test import TestCase, override_settings

 from ..parsers import RasterisedDocumentParser
+from django.conf import settings


 class TestDate(TestCase):
@@ -59,9 +60,13 @@ class TestDate(TestCase):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = "lorem ipsum 13.02.2018 lorem ipsum"
+        date = document.get_date()
        self.assertEqual(
-            document.get_date(),
-            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
+            date,
+            datetime.datetime(
+                2018, 2, 13, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -72,10 +77,16 @@ class TestDate(TestCase):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
        document._text = (
-            "lorem ipsum 130218, 2018, 20180213 and 13.02.2018 lorem ipsum")
+            "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
+            "ipsum"
+        )
+        date = document.get_date()
        self.assertEqual(
-            document.get_date(),
-            datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.tzutc())
+            date,
+            datetime.datetime(
+                2018, 2, 13, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -110,9 +121,13 @@ class TestDate(TestCase):
            "März 2019\n"
            "lorem ipsum"
        )
+        date = document.get_date()
        self.assertEqual(
-            document.get_date(),
-            datetime.datetime(2019, 3, 1, 0, 0, tzinfo=tz.tzutc())
+            date,
+            datetime.datetime(
+                2019, 3, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -122,19 +137,25 @@ class TestDate(TestCase):
    def test_date_format_8(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
-        document._text = ("lorem ipsum\n"
-                          "Wohnort\n"
-                          "3100\n"
-                          "IBAN\n"
-                          "AT87 4534\n"
-                          "1234\n"
-                          "1234 5678\n"
-                          "BIC\n"
-                          "lorem ipsum\n"
-                          "März 2020")
-        self.assertEqual(document.get_date(),
-                         datetime.datetime(2020, 3, 1, 0, 0,
-                                           tzinfo=tz.tzutc()))
+        document._text = (
+            "lorem ipsum\n"
+            "Wohnort\n"
+            "3100\n"
+            "IBAN\n"
+            "AT87 4534\n"
+            "1234\n"
+            "1234 5678\n"
+            "BIC\n"
+            "lorem ipsum\n"
+            "März 2020"
+        )
+        self.assertEqual(
+            document.get_date(),
+            datetime.datetime(
+                2020, 3, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -143,13 +164,19 @@ class TestDate(TestCase):
    def test_date_format_9(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file)
-        document._text = ("lorem ipsum\n"
-                          "27. Nullmonth 2020\n"
-                          "März 2020\n"
-                          "lorem ipsum")
-        self.assertEqual(document.get_date(),
-                         datetime.datetime(2020, 3, 1, 0, 0,
-                                           tzinfo=tz.tzutc()))
+        document._text = (
+            "lorem ipsum\n"
+            "27. Nullmonth 2020\n"
+            "März 2020\n"
+            "lorem ipsum"
+        )
+        self.assertEqual(
+            document.get_date(),
+            datetime.datetime(
+                2020, 3, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
@@ -158,11 +185,16 @@ class TestDate(TestCase):
    def test_get_text_1_pdf(self):
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
+        date = document.get_date()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
-            document.get_date(),
-            datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
+            date,
+            datetime.datetime(
+                2018, 4, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -172,11 +204,15 @@ class TestDate(TestCase):
    def test_get_text_1_png(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be False and
+        # get_date() to be 2018-04-01 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_1.png")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 4, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -186,11 +222,15 @@ class TestDate(TestCase):
    def test_get_text_2_pdf(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be True and
+        # get_date() to be 2013-02-01 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2013, 2, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -200,67 +240,91 @@ class TestDate(TestCase):
    def test_get_text_2_png(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be False and
+        # get_date() to be 2013-02-01 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_2.png")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2013, 2, 1, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2013, 2, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH
    )
+    @override_settings(OCR_LANGUAGE="deu")
    def test_get_text_3_pdf(self):
+        # Runs with OCR_LANGUAGE overridden to "deu" and DATE_ORDER forced to
+        # 'DMY'. Expects _is_ocred() to be True and get_date() to be
+        # 2018-10-05 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 10, 5, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH
    )
+    @override_settings(OCR_LANGUAGE="deu")
    def test_get_text_3_png(self):
+        # Runs with OCR_LANGUAGE overridden to "deu" and DATE_ORDER forced to
+        # 'DMY'. Expects _is_ocred() to be False and get_date() to be
+        # 2018-10-05 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_3.png")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 10, 5, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH
    )
+    @override_settings(OCR_LANGUAGE="eng")
    def test_get_text_4_pdf(self):
+        # Runs with OCR_LANGUAGE overridden to "eng" and DATE_ORDER forced to
+        # 'DMY'. Expects _is_ocred() to be True and get_date() to be
+        # 2018-10-05 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 10, 5, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SCRATCH
    )
+    @override_settings(OCR_LANGUAGE="eng")
    def test_get_text_4_png(self):
+        # Runs with OCR_LANGUAGE overridden to "eng" and DATE_ORDER forced to
+        # 'DMY'. Expects _is_ocred() to be False and get_date() to be
+        # 2018-10-05 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_4.png")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 10, 5, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 10, 5, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -270,11 +334,15 @@ class TestDate(TestCase):
    def test_get_text_5_pdf(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be True and
+        # get_date() to be 2018-12-17 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 12, 17, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -284,11 +352,15 @@ class TestDate(TestCase):
    def test_get_text_5_png(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be False and
+        # get_date() to be 2018-12-17 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_5.png")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 12, 17, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -303,7 +375,10 @@ class TestDate(TestCase):
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 12, 17, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -318,7 +393,10 @@ class TestDate(TestCase):
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 12, 17, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 12, 17, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -328,6 +406,7 @@ class TestDate(TestCase):
    def test_get_text_6_pdf_eu(self):
+        # With DATE_ORDER forced to 'DMY', expects _is_ocred() to be True and
+        # get_date() to return None for tests_date_6.pdf.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(document.get_date(), None)
@@ -339,6 +418,7 @@ class TestDate(TestCase):
    def test_get_text_6_png_eu(self):
+        # With DATE_ORDER forced to 'DMY', expects _is_ocred() to be False and
+        # get_date() to return None for tests_date_6.png.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_6.png")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), False)
        self.assertEqual(document.get_date(), None)
@@ -350,11 +430,15 @@ class TestDate(TestCase):
    def test_get_text_7_pdf(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be True and
+        # get_date() to be 2018-04-01 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_7.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2018, 4, 1, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2018, 4, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -364,11 +448,15 @@ class TestDate(TestCase):
    def test_get_text_8_pdf(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be True and
+        # get_date() to be 2017-12-31 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_8.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2017, 12, 31, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )

    @mock.patch(
@@ -378,9 +466,137 @@ class TestDate(TestCase):
    def test_get_text_9_pdf(self):
+        # DATE_ORDER is forced to 'DMY'. Expects _is_ocred() to be True and
+        # get_date() to be 2017-12-31 00:00 in settings.TIME_ZONE.
        input_file = os.path.join(self.SAMPLE_FILES, "tests_date_9.pdf")
        document = RasterisedDocumentParser(input_file)
+        document.DATE_ORDER = 'DMY'
        document.get_text()
        self.assertEqual(document._is_ocred(), True)
        self.assertEqual(
            document.get_date(),
-            datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc())
+            datetime.datetime(
+                2017, 12, 31, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
        )
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_filename_date_1_pdf(self):
+        # The sample's file name embeds 2018-03-20. With FILENAME_DATE_ORDER
+        # set to 'YMD', get_date() is expected to return that day at 00:00 in
+        # settings.TIME_ZONE and _is_ocred() is expected to be True.
+        sample_name = "tests_date_in_filename_2018-03-20_1.pdf"
+        parser = RasterisedDocumentParser(
+            os.path.join(self.SAMPLE_FILES, sample_name)
+        )
+        parser.FILENAME_DATE_ORDER = 'YMD'
+        parser.get_text()
+        detected = parser.get_date()
+        self.assertEqual(parser._is_ocred(), True)
+        local_tz = tz.gettz(settings.TIME_ZONE)
+        self.assertEqual(
+            detected,
+            datetime.datetime(2018, 3, 20, 0, 0, tzinfo=local_tz)
+        )
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_filename_date_1_png(self):
+        # The sample's file name embeds 2018-03-20. With FILENAME_DATE_ORDER
+        # set to 'YMD', get_date() is expected to return that day at 00:00 in
+        # settings.TIME_ZONE and _is_ocred() is expected to be False.
+        sample_name = "tests_date_in_filename_2018-03-20_1.png"
+        parser = RasterisedDocumentParser(
+            os.path.join(self.SAMPLE_FILES, sample_name)
+        )
+        parser.FILENAME_DATE_ORDER = 'YMD'
+        detected = parser.get_date()
+        self.assertEqual(parser._is_ocred(), False)
+        local_tz = tz.gettz(settings.TIME_ZONE)
+        self.assertEqual(
+            detected,
+            datetime.datetime(2018, 3, 20, 0, 0, tzinfo=local_tz)
+        )
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_filename_date_2_pdf(self):
+        # The sample's file name starts with 2013-12-11. With
+        # FILENAME_DATE_ORDER set to 'YMD', get_date() is expected to return
+        # that day at 00:00 in settings.TIME_ZONE and _is_ocred() to be True.
+        sample_name = "2013-12-11_tests_date_in_filename_2.pdf"
+        parser = RasterisedDocumentParser(
+            os.path.join(self.SAMPLE_FILES, sample_name)
+        )
+        parser.FILENAME_DATE_ORDER = 'YMD'
+        detected = parser.get_date()
+        self.assertEqual(parser._is_ocred(), True)
+        local_tz = tz.gettz(settings.TIME_ZONE)
+        self.assertEqual(
+            detected,
+            datetime.datetime(2013, 12, 11, 0, 0, tzinfo=local_tz)
+        )
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_filename_date_2_png(self):
+        # The sample's file name starts with 2013-12-11. With
+        # FILENAME_DATE_ORDER set to 'YMD', get_date() is expected to return
+        # that day at 00:00 in settings.TIME_ZONE and _is_ocred() to be False.
+        sample_name = "2013-12-11_tests_date_in_filename_2.png"
+        parser = RasterisedDocumentParser(
+            os.path.join(self.SAMPLE_FILES, sample_name)
+        )
+        parser.FILENAME_DATE_ORDER = 'YMD'
+        detected = parser.get_date()
+        self.assertEqual(parser._is_ocred(), False)
+        local_tz = tz.gettz(settings.TIME_ZONE)
+        self.assertEqual(
+            detected,
+            datetime.datetime(2013, 12, 11, 0, 0, tzinfo=local_tz)
+        )
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
+        return_value="01-07-0590 00:00:00"
+    )
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_crazy_date_past(self, *args):
+        # get_text() is patched to return "01-07-0590 00:00:00"; the test
+        # expects get_date() to return None for that text.
+        # *args absorbs the mock object that mock.patch passes to the test.
+        parser = RasterisedDocumentParser("/dev/null")
+        parser.get_text()
+        detected = parser.get_date()
+        self.assertIsNone(detected)
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
+        return_value="01-07-2350 00:00:00"
+    )
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_crazy_date_future(self, *args):
+        # get_text() is patched to return "01-07-2350 00:00:00"; the test
+        # expects get_date() to return None for that text.
+        # *args absorbs the mock object that mock.patch passes to the test.
+        parser = RasterisedDocumentParser("/dev/null")
+        parser.get_text()
+        detected = parser.get_date()
+        self.assertIsNone(detected)
+
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
+        return_value="01-07-0590 00:00:00"
+    )
+    @mock.patch(
+        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
+        SCRATCH
+    )
+    def test_crazy_date_past(self, *args):
+        # NOTE(review): this is a second definition of test_crazy_date_past
+        # with the same patched text ("01-07-0590 00:00:00") as the earlier
+        # one in this class. The later definition replaces the earlier one,
+        # so only one of them runs. Remove one, or rename this and give it a
+        # different input.
+        # *args absorbs the mock object that mock.patch passes to the test.
+        document = RasterisedDocumentParser("/dev/null")
+        document.get_text()
+        self.assertIsNone(document.get_date())
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -1,11 +1,9 @@
 import os
-import re
 import subprocess

-import dateparser
 from django.conf import settings

-from documents.parsers import DocumentParser, ParseError, DATE_REGEX
+from documents.parsers import DocumentParser, ParseError


 class TextDocumentParser(DocumentParser):
@@ -16,7 +14,6 @@ class TextDocumentParser(DocumentParser):
    CONVERT = settings.CONVERT_BINARY
    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
    UNPAPER = settings.UNPAPER_BINARY
-    DATE_ORDER = settings.DATE_ORDER
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
    OCR_ALWAYS = settings.OCR_ALWAYS

@@ -26,7 +23,7 @@ class TextDocumentParser(DocumentParser):

    def get_thumbnail(self):
        """
-        The thumbnail of a txt is just a 500px wide image of the text
+        The thumbnail of a text file is just a 500px wide image of the text
        rendered onto a letter-sized page.
        """
        # The below is heavily cribbed from https://askubuntu.com/a/590951
@@ -35,7 +32,7 @@ class TextDocumentParser(DocumentParser):
        text_color = "black"  # text color
        psize = [500, 647]  # icon size
        n_lines = 50  # number of lines to show
-        output_file = os.path.join(self.tempdir, "convert-txt.png")
+        out_path = os.path.join(self.tempdir, "convert.png")

        temp_bg = os.path.join(self.tempdir, "bg.png")
        temp_txlayer = os.path.join(self.tempdir, "tx.png")
@@ -46,9 +43,13 @@ class TextDocumentParser(DocumentParser):
            work_size = ",".join([str(n - 1) for n in psize])
            r = str(round(psize[0] / 10))
            rounded = ",".join([r, r])
-            run_command(self.CONVERT, "-size ", picsize, ' xc:none -draw ',
-                        '"fill ', bg_color, ' roundrectangle 0,0,',
-                        work_size, ",", rounded, '" ', temp_bg)
+            run_command(
+                self.CONVERT,
+                "-size ", picsize,
+                ' xc:none -draw ',
+                '"fill ', bg_color, ' roundrectangle 0,0,', work_size, ",", rounded, '" ',  # NOQA: E501
+                temp_bg
+            )

        def read_text():
            with open(self.document_path, 'r') as src:
@@ -57,22 +58,29 @@ class TextDocumentParser(DocumentParser):
                return text.replace('"', "'")

        def create_txlayer():
-            run_command(self.CONVERT,
-                        "-background none",
-                        "-fill",
-                        text_color,
-                        "-pointsize", "12",
-                        "-border 4 -bordercolor none",
-                        "-size ", txsize,
-                        ' caption:"', read_text(), '" ',
-                        temp_txlayer)
+            run_command(
+                self.CONVERT,
+                "-background none",
+                "-fill",
+                text_color,
+                "-pointsize", "12",
+                "-border 4 -bordercolor none",
+                "-size ", txsize,
+                ' caption:"', read_text(), '" ',
+                temp_txlayer
+            )

        create_txlayer()
        create_bg()
-        run_command(self.CONVERT, temp_bg, temp_txlayer,
-                    "-background None -layers merge ", output_file)
+        run_command(
+            self.CONVERT,
+            temp_bg,
+            temp_txlayer,
+            "-background None -layers merge ",
+            out_path
+        )

-        return output_file
+        return out_path

    def get_text(self):

@@ -84,40 +92,6 @@ class TextDocumentParser(DocumentParser):

        return self._text

-    def get_date(self):
-        date = None
-        datestring = None
-
-        try:
-            text = self.get_text()
-        except ParseError as e:
-            return None
-
-        # Iterate through all regex matches and try to parse the date
-        for m in re.finditer(DATE_REGEX, text):
-            datestring = m.group(0)
-
-            try:
-                date = dateparser.parse(
-                           datestring,
-                           settings={'DATE_ORDER': self.DATE_ORDER,
-                                     'PREFER_DAY_OF_MONTH': 'first',
-                                     'RETURN_AS_TIMEZONE_AWARE': True})
-            except TypeError:
-                # Skip all matches that do not parse to a proper date
-                continue
-
-            if date is not None:
-                break
-
-        if date is not None:
-            self.log("info", "Detected document date " + date.isoformat() +
-                             " based on string " + datestring)
-        else:
-            self.log("info", "Unable to detect date for document")
-
-        return date
-

 def run_command(*args):
    environment = os.environ.copy()
--- a/src/reminders/migrations/0002_auto_20181007_1420.py
+++ b/src/reminders/migrations/0002_auto_20181007_1420.py
@@ -0,0 +1,19 @@
+# Generated by Django 2.0.8 on 2018-10-07 14:20
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    # Follows reminders 0001_initial and re-declares Reminder.document as a
+    # ForeignKey to documents.Document with on_delete=PROTECT.
+
+    dependencies = [('reminders', '0001_initial')]
+
+    operations = [
+        migrations.AlterField(
+            model_name='reminder',
+            name='document',
+            field=models.ForeignKey(
+                on_delete=django.db.models.deletion.PROTECT,
+                to='documents.Document',
+            ),
+        ),
+    ]
--- a/src/reminders/models.py
+++ b/src/reminders/models.py
@@ -4,7 +4,6 @@ from django.db import models
 class Reminder(models.Model):

    document = models.ForeignKey(
-        "documents.Document", on_delete=models.PROTECT
-        )
+        "documents.Document", on_delete=models.PROTECT)
    date = models.DateTimeField()
    note = models.TextField(blank=True)