Merge branch 'dev' into feature-autocolor

This commit is contained in:
jonaswinkler
2021-02-24 23:10:59 +01:00
378 changed files with 37340 additions and 5324 deletions

View File

@@ -1,18 +1,22 @@
# coding=utf-8
import datetime
import logging
import os
import re
from collections import OrderedDict
import pathvalidate
import dateutil.parser
from colorhash import ColorHash
from django.conf import settings
from django.contrib.auth.models import User
from django.db import models
from django.utils import timezone
from django.utils.text import slugify
from django.utils.timezone import is_aware
from django.utils.translation import gettext_lazy as _
from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension
@@ -26,37 +30,31 @@ class MatchingModel(models.Model):
MATCH_AUTO = 6
MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"),
(MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
(MATCH_AUTO, "Automatic Classification"),
(MATCH_ANY, _("Any word")),
(MATCH_ALL, _("All words")),
(MATCH_LITERAL, _("Exact match")),
(MATCH_REGEX, _("Regular expression")),
(MATCH_FUZZY, _("Fuzzy word")),
(MATCH_AUTO, _("Automatic")),
)
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True, editable=False)
name = models.CharField(
_("name"),
max_length=128, unique=True)
match = models.CharField(
_("match"),
max_length=256, blank=True)
match = models.CharField(max_length=256, blank=True)
matching_algorithm = models.PositiveIntegerField(
_("matching algorithm"),
choices=MATCHING_ALGORITHMS,
default=MATCH_ANY,
help_text=(
"Which algorithm you want to use when matching text to the OCR'd "
"PDF. Here, \"any\" looks for any occurrence of any word "
"provided in the PDF, while \"all\" requires that every word "
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
)
default=MATCH_ANY
)
is_insensitive = models.BooleanField(default=True)
is_insensitive = models.BooleanField(
_("is insensitive"),
default=True)
class Meta:
abstract = True
@@ -65,13 +63,6 @@ class MatchingModel(models.Model):
def __str__(self):
return self.name
def save(self, *args, **kwargs):
self.match = self.match.lower()
self.slug = slugify(self.name)
models.Model.save(self, *args, **kwargs)
class Correspondent(MatchingModel):
@@ -81,18 +72,27 @@ class Correspondent(MatchingModel):
class Meta:
ordering = ("name",)
verbose_name = _("correspondent")
verbose_name_plural = _("correspondents")
class Tag(MatchingModel):
colour = models.CharField(blank=True, max_length=7)
colour = models.CharField(
_("color"),
blank=True, max_length=7)
is_inbox_tag = models.BooleanField(
_("is inbox tag"),
default=False,
help_text="Marks this tag as an inbox tag: All newly consumed "
"documents will be tagged with inbox tags."
help_text=_("Marks this tag as an inbox tag: All newly consumed "
"documents will be tagged with inbox tags.")
)
class Meta:
verbose_name = _("tag")
verbose_name_plural = _("tags")
def save(self, *args, **kwargs):
if self.colour == "":
self.colour = ColorHash(
@@ -105,7 +105,9 @@ class Tag(MatchingModel):
class DocumentType(MatchingModel):
pass
class Meta:
verbose_name = _("document type")
verbose_name_plural = _("document types")
class Document(models.Model):
@@ -113,8 +115,8 @@ class Document(models.Model):
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
STORAGE_TYPE_GPG = "gpg"
STORAGE_TYPES = (
(STORAGE_TYPE_UNENCRYPTED, "Unencrypted"),
(STORAGE_TYPE_GPG, "Encrypted with GNU Privacy Guard")
(STORAGE_TYPE_UNENCRYPTED, _("Unencrypted")),
(STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard"))
)
correspondent = models.ForeignKey(
@@ -122,54 +124,68 @@ class Document(models.Model):
blank=True,
null=True,
related_name="documents",
on_delete=models.SET_NULL
on_delete=models.SET_NULL,
verbose_name=_("correspondent")
)
title = models.CharField(max_length=128, blank=True, db_index=True)
title = models.CharField(
_("title"),
max_length=128, blank=True, db_index=True)
document_type = models.ForeignKey(
DocumentType,
blank=True,
null=True,
related_name="documents",
on_delete=models.SET_NULL
on_delete=models.SET_NULL,
verbose_name=_("document type")
)
content = models.TextField(
_("content"),
blank=True,
help_text="The raw, text-only data of the document. This field is "
"primarily used for searching."
help_text=_("The raw, text-only data of the document. This field is "
"primarily used for searching.")
)
mime_type = models.CharField(
_("mime type"),
max_length=256,
editable=False
)
tags = models.ManyToManyField(
Tag, related_name="documents", blank=True)
Tag, related_name="documents", blank=True,
verbose_name=_("tags")
)
checksum = models.CharField(
_("checksum"),
max_length=32,
editable=False,
unique=True,
help_text="The checksum of the original document."
help_text=_("The checksum of the original document.")
)
archive_checksum = models.CharField(
_("archive checksum"),
max_length=32,
editable=False,
blank=True,
null=True,
help_text="The checksum of the archived document."
help_text=_("The checksum of the archived document.")
)
created = models.DateTimeField(
_("created"),
default=timezone.now, db_index=True)
modified = models.DateTimeField(
_("modified"),
auto_now=True, editable=False, db_index=True)
storage_type = models.CharField(
_("storage type"),
max_length=11,
choices=STORAGE_TYPES,
default=STORAGE_TYPE_UNENCRYPTED,
@@ -177,36 +193,53 @@ class Document(models.Model):
)
added = models.DateTimeField(
_("added"),
default=timezone.now, editable=False, db_index=True)
filename = models.FilePathField(
_("filename"),
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text="Current filename in storage"
help_text=_("Current filename in storage")
)
archive_filename = models.FilePathField(
_("archive filename"),
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text=_("Current archive filename in storage")
)
archive_serial_number = models.IntegerField(
_("archive serial number"),
blank=True,
null=True,
unique=True,
db_index=True,
help_text="The position of this document in your physical document "
"archive."
help_text=_("The position of this document in your physical document "
"archive.")
)
class Meta:
ordering = ("correspondent", "title")
ordering = ("-created",)
verbose_name = _("document")
verbose_name_plural = _("documents")
def __str__(self):
created = self.created.strftime("%Y%m%d")
if is_aware(self.created):
created = timezone.localdate(self.created).isoformat()
else:
created = datetime.date.isoformat(self.created)
if self.correspondent and self.title:
return "{}: {} - {}".format(
created, self.correspondent, self.title)
if self.correspondent or self.title:
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)
return f"{created} {self.correspondent} {self.title}"
else:
return f"{created} {self.title}"
@property
def source_path(self):
@@ -215,7 +248,7 @@ class Document(models.Model):
else:
fname = "{:07}{}".format(self.pk, self.file_type)
if self.storage_type == self.STORAGE_TYPE_GPG:
fname += ".gpg"
fname += ".gpg" # pragma: no cover
return os.path.join(
settings.ORIGINALS_DIR,
@@ -227,28 +260,38 @@ class Document(models.Model):
return open(self.source_path, "rb")
@property
def archive_path(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)
def has_archive_version(self):
return self.archive_filename is not None
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
@property
def archive_path(self):
if self.has_archive_version:
return os.path.join(
settings.ARCHIVE_DIR,
str(self.archive_filename)
)
else:
return None
@property
def archive_file(self):
return open(self.archive_path, "rb")
@property
def file_name(self):
return slugify(str(self)) + self.file_type
def get_public_filename(self, archive=False, counter=0, suffix=None):
result = str(self)
@property
def archive_file_name(self):
return slugify(str(self)) + ".pdf"
if counter:
result += f"_{counter:02}"
if suffix:
result += suffix
if archive:
result += ".pdf"
else:
result += self.file_type
return pathvalidate.sanitize_filename(result, replacement_text="-")
@property
def file_type(self):
@@ -273,76 +316,116 @@ class Document(models.Model):
class Log(models.Model):
LEVELS = (
(logging.DEBUG, "Debugging"),
(logging.INFO, "Informational"),
(logging.WARNING, "Warning"),
(logging.ERROR, "Error"),
(logging.CRITICAL, "Critical"),
(logging.DEBUG, _("debug")),
(logging.INFO, _("information")),
(logging.WARNING, _("warning")),
(logging.ERROR, _("error")),
(logging.CRITICAL, _("critical")),
)
group = models.UUIDField(blank=True, null=True)
message = models.TextField()
level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
created = models.DateTimeField(auto_now_add=True)
group = models.UUIDField(
_("group"),
blank=True, null=True)
message = models.TextField(_("message"))
level = models.PositiveIntegerField(
_("level"),
choices=LEVELS, default=logging.INFO)
created = models.DateTimeField(_("created"), auto_now_add=True)
class Meta:
ordering = ("-created",)
verbose_name = _("log")
verbose_name_plural = _("logs")
def __str__(self):
return self.message
class SavedView(models.Model):
class Meta:
ordering = ("name",)
verbose_name = _("saved view")
verbose_name_plural = _("saved views")
user = models.ForeignKey(User, on_delete=models.CASCADE,
verbose_name=_("user"))
name = models.CharField(
_("name"),
max_length=128)
show_on_dashboard = models.BooleanField(
_("show on dashboard"),
)
show_in_sidebar = models.BooleanField(
_("show in sidebar"),
)
sort_field = models.CharField(
_("sort field"),
max_length=128)
sort_reverse = models.BooleanField(
_("sort reverse"),
default=False)
class SavedViewFilterRule(models.Model):
RULE_TYPES = [
(0, _("title contains")),
(1, _("content contains")),
(2, _("ASN is")),
(3, _("correspondent is")),
(4, _("document type is")),
(5, _("is in inbox")),
(6, _("has tag")),
(7, _("has any tag")),
(8, _("created before")),
(9, _("created after")),
(10, _("created year is")),
(11, _("created month is")),
(12, _("created day is")),
(13, _("added before")),
(14, _("added after")),
(15, _("modified before")),
(16, _("modified after")),
(17, _("does not have tag")),
]
saved_view = models.ForeignKey(
SavedView,
on_delete=models.CASCADE,
related_name="filter_rules",
verbose_name=_("saved view")
)
rule_type = models.PositiveIntegerField(
_("rule type"),
choices=RULE_TYPES)
value = models.CharField(
_("value"),
max_length=128,
blank=True,
null=True)
class Meta:
verbose_name = _("filter rule")
verbose_name_plural = _("filter rules")
# TODO: why is this in the models file?
class FileInfo:
# This epic regex *almost* worked for our needs, so I'm keeping it here for
# posterity, in the hopes that we might find a way to make it work one day.
ALMOST_REGEX = re.compile(
r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
r"((?P<correspondent>{non_separated_word}+){separator})??"
r"(?P<title>{non_separated_word}+)"
r"({separator}(?P<tags>[a-z,0-9-]+))?"
r"\.(?P<extension>[a-zA-Z.-]+)$".format(
separator=r"\s+-\s+",
non_separated_word=r"([\w,. ]|([^\s]-))"
)
)
REGEXES = OrderedDict([
("created-correspondent-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)$",
flags=re.IGNORECASE
)),
("created-title-tags", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)$",
flags=re.IGNORECASE
)),
("created-correspondent-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<correspondent>.*) - "
r"(?P<title>.*)$",
flags=re.IGNORECASE
)),
("created-title", re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
r"(?P<title>.*)$",
flags=re.IGNORECASE
)),
("correspondent-title-tags", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*) - "
r"(?P<tags>[a-z0-9\-,]*)$",
flags=re.IGNORECASE
)),
("correspondent-title", re.compile(
r"(?P<correspondent>.*) - "
r"(?P<title>.*)?$",
flags=re.IGNORECASE
)),
("title", re.compile(
r"(?P<title>.*)$",
flags=re.IGNORECASE
@@ -365,28 +448,10 @@ class FileInfo:
except ValueError:
return None
@classmethod
def _get_correspondent(cls, name):
if not name:
return None
return Correspondent.objects.get_or_create(name=name, defaults={
"slug": slugify(name)
})[0]
@classmethod
def _get_title(cls, title):
return title
@classmethod
def _get_tags(cls, tags):
r = []
for t in tags.split(","):
r.append(Tag.objects.get_or_create(
slug=slugify(t),
defaults={"name": t}
)[0])
return tuple(r)
@classmethod
def _mangle_property(cls, properties, name):
if name in properties:
@@ -396,15 +461,6 @@ class FileInfo:
@classmethod
def from_filename(cls, filename):
"""
We use a crude naming convention to make handling the correspondent,
title, and tags easier:
"<date> - <correspondent> - <title> - <tags>"
"<correspondent> - <title> - <tags>"
"<correspondent> - <title>"
"<title>"
"""
# Mutate filename in-place before parsing its components
# by applying at most one of the configured transformations.
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
@@ -435,7 +491,5 @@ class FileInfo:
if m:
properties = m.groupdict()
cls._mangle_property(properties, "created")
cls._mangle_property(properties, "correspondent")
cls._mangle_property(properties, "title")
cls._mangle_property(properties, "tags")
return cls(**properties)