Merge remote-tracking branch 'upstream/master'

2026-01-30 23:08:59 -06:00 · 2018-12-11 12:06:15 +01:00
parent 94ede7389d 3c2a1a8c13
commit 766109ae4e
46 changed files with 1171 additions and 695 deletions
--- a/src/documents/admin.py
+++ b/src/documents/admin.py
@@ -64,12 +64,12 @@ class FinancialYearFilter(admin.SimpleListFilter):

            # To keep it simple we use the same string for both
            # query parameter and the display.
-            return (query, query)
+            return query, query

        else:
            query = "{0}-{0}".format(date.year)
            display = "{}".format(date.year)
-            return (query, display)
+            return query, display

    def lookups(self, request, model_admin):
        if not settings.FY_START or not settings.FY_END:
@@ -91,25 +91,24 @@ class FinancialYearFilter(admin.SimpleListFilter):


 class RecentCorrespondentFilter(admin.RelatedFieldListFilter):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.title = "correspondent (recent)"
+    """
+    If PAPERLESS_RECENT_CORRESPONDENT_YEARS is set, we limit the available
+    correspondents to those with documents created in the past ``n`` years.
+    """

    def field_choices(self, field, request, model_admin):

        years = settings.PAPERLESS_RECENT_CORRESPONDENT_YEARS
-        days = 365 * years
+        correspondents = Correspondent.objects.all()

-        lookups = []
        if years and years > 0:
-            correspondents = Correspondent.objects.filter(
+            self.title = "Correspondent (Recent)"
+            days = 365 * years
+            correspondents = correspondents.filter(
                documents__created__gte=datetime.now() - timedelta(days=days)
            ).distinct()
-            for c in correspondents:
-                lookups.append((c.id, c.name))

-        return lookups
+        return [(c.id, c.name) for c in correspondents]


 class CommonAdmin(admin.ModelAdmin):
@@ -124,7 +123,9 @@ class CorrespondentAdmin(CommonAdmin):
        "document_count",
        "last_correspondence"
    )
-    list_editable = ("automatic_classification")
+    list_editable = ("automatic_classification",)
+
+    readonly_fields = ("slug",)

    def get_queryset(self, request):
        qs = super(CorrespondentAdmin, self).get_queryset(request)
@@ -149,6 +150,11 @@ class TagAdmin(CommonAdmin):
    list_filter = ("colour",)
    list_editable = ("colour", "automatic_classification")

+    readonly_fields = ("slug",)
+
+    class Media:
+        js = ("js/colours.js",)
+
    def get_queryset(self, request):
        qs = super(TagAdmin, self).get_queryset(request)
        qs = qs.annotate(document_count=models.Count("documents"))
@@ -164,6 +170,8 @@ class DocumentTypeAdmin(CommonAdmin):
    list_display = ("name", "automatic_classification", "document_count")
    list_editable = ("automatic_classification",)

+    readonly_fields = ("slug",)
+
    def get_queryset(self, request):
        qs = super(DocumentTypeAdmin, self).get_queryset(request)
        qs = qs.annotate(document_count=models.Count("documents"))
@@ -182,14 +190,13 @@ class DocumentAdmin(CommonAdmin):
        }

    search_fields = ("correspondent__name", "title", "content", "tags__name")
-    readonly_fields = ("added",)
+    readonly_fields = ("added", "file_type", "storage_type",)
    list_display = ("title", "created", "added", "thumbnail", "correspondent",
                    "tags_", "archive_serial_number", "document_type")
    list_filter = (
        "document_type",
        "tags",
        ("correspondent", RecentCorrespondentFilter),
-        "correspondent",
        FinancialYearFilter
    )

--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,3 +1,4 @@
+from django.db import transaction
 import datetime
 import hashlib
 import logging
@@ -111,8 +112,11 @@ class Consumer:
                if not self.try_consume_file(file):
                    self._ignore.append((file, mtime))

+    @transaction.atomic
    def try_consume_file(self, file):
-        "Return True if file was consumed"
+        """
+        Return True if file was consumed
+        """

        if not re.match(FileInfo.REGEXES["title"], file):
            return False
@@ -145,7 +149,7 @@ class Consumer:
        parsed_document = parser_class(doc)

        try:
-            thumbnail = parsed_document.get_thumbnail()
+            thumbnail = parsed_document.get_optimised_thumbnail()
            date = parsed_document.get_date()
            document = self._store(
                parsed_document.get_text(),
--- a/src/documents/filters.py
+++ b/src/documents/filters.py
@@ -1,4 +1,4 @@
-from django_filters.rest_framework import CharFilter, FilterSet, BooleanFilter, ModelChoiceFilter
+from django_filters.rest_framework import BooleanFilter, FilterSet

 from .models import Correspondent, Document, Tag, DocumentType

--- a/src/documents/migrations/0022_auto_20181007_1420.py
+++ b/src/documents/migrations/0022_auto_20181007_1420.py
@@ -0,0 +1,52 @@
+# Generated by Django 2.0.8 on 2018-10-07 14:20
+
+from django.db import migrations, models
+from django.utils.text import slugify
+
+
+def re_slug_all_the_things(apps, schema_editor):
+    """
+    Rewrite all slug values to make sure they're actually slugs before we brand
+    them as uneditable.
+    """
+
+    Tag = apps.get_model("documents", "Tag")
+    Correspondent = apps.get_model("documents", "Correspondent")
+
+    for klass in (Tag, Correspondent):
+        for instance in klass.objects.all():
+            klass.objects.filter(
+                pk=instance.pk
+            ).update(
+                slug=slugify(instance.slug)
+            )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0021_document_storage_type'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='tag',
+            options={'ordering': ('name',)},
+        ),
+        migrations.AlterField(
+            model_name='correspondent',
+            name='slug',
+            field=models.SlugField(blank=True, editable=False),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='file_type',
+            field=models.CharField(choices=[('pdf', 'PDF'), ('png', 'PNG'), ('jpg', 'JPG'), ('gif', 'GIF'), ('tiff', 'TIFF'), ('txt', 'TXT'), ('csv', 'CSV'), ('md', 'MD')], editable=False, max_length=4),
+        ),
+        migrations.AlterField(
+            model_name='tag',
+            name='slug',
+            field=models.SlugField(blank=True, editable=False),
+        ),
+        migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
+    ]
--- a/src/documents/migrations/1001_workflow_improvements.py
+++ b/src/documents/migrations/1001_workflow_improvements.py
@@ -6,7 +6,7 @@ from django.db import migrations, models
 class Migration(migrations.Migration):

    dependencies = [
-        ('documents', '0021_document_storage_type'),
+        ('documents', '0022_auto_20181007_1420'),
    ]

    operations = [
--- a/src/documents/migrations/1002_auto_20180823_1155.py
+++ b/src/documents/migrations/1002_auto_20180823_1155.py
@@ -7,7 +7,7 @@ import django.db.models.deletion
 class Migration(migrations.Migration):

    dependencies = [
-        ('documents', '0022_workflow_improvements'),
+        ('documents', '1001_workflow_improvements'),
    ]

    operations = [
--- a/src/documents/migrations/1003_auto_20180904_1425.py
+++ b/src/documents/migrations/1003_auto_20180904_1425.py
@@ -18,7 +18,7 @@ def reverse_automatic_classification(apps, schema_editor):
 class Migration(migrations.Migration):

    dependencies = [
-        ('documents', '0023_auto_20180823_1155'),
+        ('documents', '1002_auto_20180823_1155'),
    ]

    operations = [
--- a/src/documents/migrations/1004_documenttype_slug.py
+++ b/src/documents/migrations/1004_documenttype_slug.py
@@ -0,0 +1,36 @@
+# Generated by Django 2.0.8 on 2018-10-07 14:20
+
+from django.db import migrations, models
+from django.utils.text import slugify
+
+
+def re_slug_all_the_things(apps, schema_editor):
+    """
+    Rewrite all slug values to make sure they're actually slugs before we brand
+    them as uneditable.
+    """
+
+    DocumentType = apps.get_model("documents", "DocumentType")
+
+    for instance in DocumentType.objects.all():
+        DocumentType.objects.filter(
+            pk=instance.pk
+        ).update(
+            slug=slugify(instance.slug)
+        )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1003_auto_20180904_1425'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='documenttype',
+            name='slug',
+            field=models.SlugField(blank=True, editable=False),
+        ),
+        migrations.RunPython(re_slug_all_the_things, migrations.RunPython.noop)
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -11,6 +11,7 @@ from django.conf import settings
 from django.db import models
 from django.template.defaultfilters import slugify
 from django.utils import timezone
+from django.utils.text import slugify
 from fuzzywuzzy import fuzz

 from .managers import LogManager
@@ -24,7 +25,7 @@ except ImportError:
 class MatchingModel(models.Model):

    name = models.CharField(max_length=128, unique=True)
-    slug = models.SlugField(blank=True)
+    slug = models.SlugField(blank=True, editable=False)

    automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')

@@ -37,8 +38,7 @@ class MatchingModel(models.Model):

    def save(self, *args, **kwargs):

-        if not self.slug:
-            self.slug = slugify(self.name)
+        self.slug = slugify(self.name)

        models.Model.save(self, *args, **kwargs)

@@ -369,7 +369,7 @@ class FileInfo:
        r = []
        for t in tags.split(","):
            r.append(Tag.objects.get_or_create(
-                slug=t.lower(),
+                slug=slugify(t),
                defaults={"name": t}
            )[0])
        return tuple(r)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -1,23 +1,31 @@
 import logging
-import shutil
-import tempfile
+import os
 import re
+import shutil
+import subprocess
+import tempfile

+import dateparser
 from django.conf import settings
+from django.utils import timezone

 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
 # - XX.YY.ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX/YY/ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX-YY-ZZZZ with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ.XX.YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ/XX/YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
+# - ZZZZ-XX-YY with XX + YY being 1 or 2 and ZZZZ being 2 or 4 digits
 # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
 # - MONTH ZZZZ, with ZZZZ being 4 digits
 # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
 DATE_REGEX = re.compile(
-    r'\b([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})\b|' +
-    r'\b([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))\b|' +
-    r'\b([^\W\d_]{3,9} [0-9]{4})\b'
+    r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +  # NOQA: E501
+    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
+    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
 )


@@ -32,6 +40,9 @@ class DocumentParser:
    """

    SCRATCH = settings.SCRATCH_DIR
+    DATE_ORDER = settings.DATE_ORDER
+    FILENAME_DATE_ORDER = settings.FILENAME_DATE_ORDER
+    OPTIPNG = settings.OPTIPNG_BINARY

    def __init__(self, path):
        self.document_path = path
@@ -45,6 +56,19 @@ class DocumentParser:
        """
        raise NotImplementedError()

+    def optimise_thumbnail(self, in_path):
+
+        out_path = os.path.join(self.tempdir, "optipng.png")
+
+        args = (self.OPTIPNG, "-o5", in_path, "-out", out_path)
+        if not subprocess.Popen(args).wait() == 0:
+            raise ParseError("Optipng failed at {}".format(args))
+
+        return out_path
+
+    def get_optimised_thumbnail(self):
+        return self.optimise_thumbnail(self.get_thumbnail())
+
    def get_text(self):
        """
        Returns the text from the document and only the text.
@@ -55,7 +79,82 @@ class DocumentParser:
        """
        Returns the date of the document.
        """
-        raise NotImplementedError()
+
+        def __parser(ds, date_order):
+            """
+            Call dateparser.parse with a particular date ordering
+            """
+            return dateparser.parse(
+                ds,
+                settings={
+                    "DATE_ORDER": date_order,
+                    "PREFER_DAY_OF_MONTH": "first",
+                    "RETURN_AS_TIMEZONE_AWARE":
+                    True
+                }
+            )
+
+        date = None
+        date_string = None
+
+        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
+        title = os.path.basename(self.document_path)
+
+        # if filename date parsing is enabled, search there first:
+        if self.FILENAME_DATE_ORDER:
+            self.log("info", "Checking document title for date")
+            for m in re.finditer(DATE_REGEX, title):
+                date_string = m.group(0)
+
+                try:
+                    date = __parser(date_string, self.FILENAME_DATE_ORDER)
+                except TypeError:
+                    # Skip all matches that do not parse to a proper date
+                    continue
+
+                if date is not None and next_year > date.year > 1900:
+                    self.log(
+                        "info",
+                        "Detected document date {} based on string {} "
+                        "from document title"
+                        "".format(date.isoformat(), date_string)
+                    )
+                    return date
+
+        try:
+            # Fetch the text only after the filename check: when the title
+            # already yields a date we return early and never call get_text()
+            text = self.get_text()
+        except ParseError:
+            return None
+
+        # Iterate through all regex matches in text and try to parse the date
+        for m in re.finditer(DATE_REGEX, text):
+            date_string = m.group(0)
+
+            try:
+                date = __parser(date_string, self.DATE_ORDER)
+            except TypeError:
+                # Skip all matches that do not parse to a proper date
+                continue
+
+            if date is not None and next_year > date.year > 1900:
+                break
+            else:
+                date = None
+
+        if date is not None:
+            self.log(
+                "info",
+                "Detected document date {} based on string {}".format(
+                    date.isoformat(),
+                    date_string
+                )
+            )
+        else:
+            self.log("info", "Unable to detect date for document")
+
+        return date

    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
--- a/src/documents/static/js/colours.js
+++ b/src/documents/static/js/colours.js
@@ -0,0 +1,66 @@
+// The following jQuery snippet adds a small square next to each tag colour
+// drop-down (a single `#id_colour` select, or every `*-colour` select on the
+// page) and updates it to show the selected colour as the value changes.
+
+django.jQuery(document).ready(function(){
+
+  if (django.jQuery("#id_colour").length) {
+
+    let colour;
+    let colour_num;
+
+    colour_num = django.jQuery("#id_colour").val() - 1;
+    colour = django.jQuery('#id_colour')[0][colour_num].text;
+    django.jQuery('#id_colour').after('<div class="colour_square"></div>');
+
+    django.jQuery('.colour_square').css({
+      'float': 'left',
+      'width': '20px',
+      'height': '20px',
+      'margin': '5px',
+      'border': '1px solid rgba(0, 0, 0, .2)',
+      'background': colour
+    });
+
+    django.jQuery('#id_colour').change(function () {
+      colour_num = django.jQuery("#id_colour").val() - 1;
+      colour = django.jQuery('#id_colour')[0][colour_num].text;
+      django.jQuery('.colour_square').css({'background': colour});
+    });
+
+  } else if (django.jQuery("select[id*='colour']").length) {
+
+    django.jQuery('select[id*="-colour"]').each(function (index, element) {
+      let id;
+      let loop_colour_num;
+      let loop_colour;
+
+      id = "colour_square_" + index;
+      django.jQuery(element).after('<div class="colour_square" id="' + id + '"></div>');
+
+      loop_colour_num = django.jQuery(element).val() - 1;
+      loop_colour = django.jQuery(element)[0][loop_colour_num].text;
+
+      django.jQuery("<style type='text/css'>\
+                        .colour_square{ \
+                            float: left; \
+                            width: 20px; \
+                            height: 20px; \
+                            margin: 5px; \
+                            border: 1px solid rgba(0,0,0,.2); \
+                        } </style>").appendTo("head");
+      django.jQuery('#' + id).css({'background': loop_colour});
+
+      console.log(id, loop_colour_num, loop_colour);
+
+      django.jQuery(element).change(function () {
+        loop_colour_num = django.jQuery(element).val() - 1;
+        loop_colour = django.jQuery(element)[0][loop_colour_num].text;
+        django.jQuery('#' + id).css({'background': loop_colour});
+        console.log('#' + id, loop_colour)
+      });
+    })
+
+  }
+
+});