mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-14 00:26:21 +00:00
Merge branch 'dev' into feature-autocolor
This commit is contained in:
@@ -1,18 +1,22 @@
|
||||
# coding=utf-8
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
|
||||
import pathvalidate
|
||||
|
||||
import dateutil.parser
|
||||
from colorhash import ColorHash
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import User
|
||||
from django.db import models
|
||||
from django.utils import timezone
|
||||
from django.utils.text import slugify
|
||||
from django.utils.timezone import is_aware
|
||||
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from documents.file_handling import archive_name_from_filename
|
||||
from documents.parsers import get_default_file_extension
|
||||
|
||||
|
||||
@@ -26,37 +30,31 @@ class MatchingModel(models.Model):
|
||||
MATCH_AUTO = 6
|
||||
|
||||
MATCHING_ALGORITHMS = (
|
||||
(MATCH_ANY, "Any"),
|
||||
(MATCH_ALL, "All"),
|
||||
(MATCH_LITERAL, "Literal"),
|
||||
(MATCH_REGEX, "Regular Expression"),
|
||||
(MATCH_FUZZY, "Fuzzy Match"),
|
||||
(MATCH_AUTO, "Automatic Classification"),
|
||||
(MATCH_ANY, _("Any word")),
|
||||
(MATCH_ALL, _("All words")),
|
||||
(MATCH_LITERAL, _("Exact match")),
|
||||
(MATCH_REGEX, _("Regular expression")),
|
||||
(MATCH_FUZZY, _("Fuzzy word")),
|
||||
(MATCH_AUTO, _("Automatic")),
|
||||
)
|
||||
|
||||
name = models.CharField(max_length=128, unique=True)
|
||||
slug = models.SlugField(blank=True, editable=False)
|
||||
name = models.CharField(
|
||||
_("name"),
|
||||
max_length=128, unique=True)
|
||||
|
||||
match = models.CharField(
|
||||
_("match"),
|
||||
max_length=256, blank=True)
|
||||
|
||||
match = models.CharField(max_length=256, blank=True)
|
||||
matching_algorithm = models.PositiveIntegerField(
|
||||
_("matching algorithm"),
|
||||
choices=MATCHING_ALGORITHMS,
|
||||
default=MATCH_ANY,
|
||||
help_text=(
|
||||
"Which algorithm you want to use when matching text to the OCR'd "
|
||||
"PDF. Here, \"any\" looks for any occurrence of any word "
|
||||
"provided in the PDF, while \"all\" requires that every word "
|
||||
"provided appear in the PDF, albeit not in the order provided. A "
|
||||
"\"literal\" match means that the text you enter must appear in "
|
||||
"the PDF exactly as you've entered it, and \"regular expression\" "
|
||||
"uses a regex to match the PDF. (If you don't know what a regex "
|
||||
"is, you probably don't want this option.) Finally, a \"fuzzy "
|
||||
"match\" looks for words or phrases that are mostly—but not "
|
||||
"exactly—the same, which can be useful for matching against "
|
||||
"documents containg imperfections that foil accurate OCR."
|
||||
)
|
||||
default=MATCH_ANY
|
||||
)
|
||||
|
||||
is_insensitive = models.BooleanField(default=True)
|
||||
is_insensitive = models.BooleanField(
|
||||
_("is insensitive"),
|
||||
default=True)
|
||||
|
||||
class Meta:
|
||||
abstract = True
|
||||
@@ -65,13 +63,6 @@ class MatchingModel(models.Model):
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
|
||||
self.match = self.match.lower()
|
||||
self.slug = slugify(self.name)
|
||||
|
||||
models.Model.save(self, *args, **kwargs)
|
||||
|
||||
|
||||
class Correspondent(MatchingModel):
|
||||
|
||||
@@ -81,18 +72,27 @@ class Correspondent(MatchingModel):
|
||||
|
||||
class Meta:
|
||||
ordering = ("name",)
|
||||
verbose_name = _("correspondent")
|
||||
verbose_name_plural = _("correspondents")
|
||||
|
||||
|
||||
class Tag(MatchingModel):
|
||||
|
||||
colour = models.CharField(blank=True, max_length=7)
|
||||
colour = models.CharField(
|
||||
_("color"),
|
||||
blank=True, max_length=7)
|
||||
|
||||
is_inbox_tag = models.BooleanField(
|
||||
_("is inbox tag"),
|
||||
default=False,
|
||||
help_text="Marks this tag as an inbox tag: All newly consumed "
|
||||
"documents will be tagged with inbox tags."
|
||||
help_text=_("Marks this tag as an inbox tag: All newly consumed "
|
||||
"documents will be tagged with inbox tags.")
|
||||
)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _("tag")
|
||||
verbose_name_plural = _("tags")
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
if self.colour == "":
|
||||
self.colour = ColorHash(
|
||||
@@ -105,7 +105,9 @@ class Tag(MatchingModel):
|
||||
|
||||
class DocumentType(MatchingModel):
|
||||
|
||||
pass
|
||||
class Meta:
|
||||
verbose_name = _("document type")
|
||||
verbose_name_plural = _("document types")
|
||||
|
||||
|
||||
class Document(models.Model):
|
||||
@@ -113,8 +115,8 @@ class Document(models.Model):
|
||||
STORAGE_TYPE_UNENCRYPTED = "unencrypted"
|
||||
STORAGE_TYPE_GPG = "gpg"
|
||||
STORAGE_TYPES = (
|
||||
(STORAGE_TYPE_UNENCRYPTED, "Unencrypted"),
|
||||
(STORAGE_TYPE_GPG, "Encrypted with GNU Privacy Guard")
|
||||
(STORAGE_TYPE_UNENCRYPTED, _("Unencrypted")),
|
||||
(STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard"))
|
||||
)
|
||||
|
||||
correspondent = models.ForeignKey(
|
||||
@@ -122,54 +124,68 @@ class Document(models.Model):
|
||||
blank=True,
|
||||
null=True,
|
||||
related_name="documents",
|
||||
on_delete=models.SET_NULL
|
||||
on_delete=models.SET_NULL,
|
||||
verbose_name=_("correspondent")
|
||||
)
|
||||
|
||||
title = models.CharField(max_length=128, blank=True, db_index=True)
|
||||
title = models.CharField(
|
||||
_("title"),
|
||||
max_length=128, blank=True, db_index=True)
|
||||
|
||||
document_type = models.ForeignKey(
|
||||
DocumentType,
|
||||
blank=True,
|
||||
null=True,
|
||||
related_name="documents",
|
||||
on_delete=models.SET_NULL
|
||||
on_delete=models.SET_NULL,
|
||||
verbose_name=_("document type")
|
||||
)
|
||||
|
||||
content = models.TextField(
|
||||
_("content"),
|
||||
blank=True,
|
||||
help_text="The raw, text-only data of the document. This field is "
|
||||
"primarily used for searching."
|
||||
help_text=_("The raw, text-only data of the document. This field is "
|
||||
"primarily used for searching.")
|
||||
)
|
||||
|
||||
mime_type = models.CharField(
|
||||
_("mime type"),
|
||||
max_length=256,
|
||||
editable=False
|
||||
)
|
||||
|
||||
tags = models.ManyToManyField(
|
||||
Tag, related_name="documents", blank=True)
|
||||
Tag, related_name="documents", blank=True,
|
||||
verbose_name=_("tags")
|
||||
)
|
||||
|
||||
checksum = models.CharField(
|
||||
_("checksum"),
|
||||
max_length=32,
|
||||
editable=False,
|
||||
unique=True,
|
||||
help_text="The checksum of the original document."
|
||||
help_text=_("The checksum of the original document.")
|
||||
)
|
||||
|
||||
archive_checksum = models.CharField(
|
||||
_("archive checksum"),
|
||||
max_length=32,
|
||||
editable=False,
|
||||
blank=True,
|
||||
null=True,
|
||||
help_text="The checksum of the archived document."
|
||||
help_text=_("The checksum of the archived document.")
|
||||
)
|
||||
|
||||
created = models.DateTimeField(
|
||||
_("created"),
|
||||
default=timezone.now, db_index=True)
|
||||
|
||||
modified = models.DateTimeField(
|
||||
_("modified"),
|
||||
auto_now=True, editable=False, db_index=True)
|
||||
|
||||
storage_type = models.CharField(
|
||||
_("storage type"),
|
||||
max_length=11,
|
||||
choices=STORAGE_TYPES,
|
||||
default=STORAGE_TYPE_UNENCRYPTED,
|
||||
@@ -177,36 +193,53 @@ class Document(models.Model):
|
||||
)
|
||||
|
||||
added = models.DateTimeField(
|
||||
_("added"),
|
||||
default=timezone.now, editable=False, db_index=True)
|
||||
|
||||
filename = models.FilePathField(
|
||||
_("filename"),
|
||||
max_length=1024,
|
||||
editable=False,
|
||||
default=None,
|
||||
unique=True,
|
||||
null=True,
|
||||
help_text="Current filename in storage"
|
||||
help_text=_("Current filename in storage")
|
||||
)
|
||||
|
||||
archive_filename = models.FilePathField(
|
||||
_("archive filename"),
|
||||
max_length=1024,
|
||||
editable=False,
|
||||
default=None,
|
||||
unique=True,
|
||||
null=True,
|
||||
help_text=_("Current archive filename in storage")
|
||||
)
|
||||
|
||||
archive_serial_number = models.IntegerField(
|
||||
_("archive serial number"),
|
||||
blank=True,
|
||||
null=True,
|
||||
unique=True,
|
||||
db_index=True,
|
||||
help_text="The position of this document in your physical document "
|
||||
"archive."
|
||||
help_text=_("The position of this document in your physical document "
|
||||
"archive.")
|
||||
)
|
||||
|
||||
class Meta:
|
||||
ordering = ("correspondent", "title")
|
||||
ordering = ("-created",)
|
||||
verbose_name = _("document")
|
||||
verbose_name_plural = _("documents")
|
||||
|
||||
def __str__(self):
|
||||
created = self.created.strftime("%Y%m%d")
|
||||
if is_aware(self.created):
|
||||
created = timezone.localdate(self.created).isoformat()
|
||||
else:
|
||||
created = datetime.date.isoformat(self.created)
|
||||
if self.correspondent and self.title:
|
||||
return "{}: {} - {}".format(
|
||||
created, self.correspondent, self.title)
|
||||
if self.correspondent or self.title:
|
||||
return "{}: {}".format(created, self.correspondent or self.title)
|
||||
return str(created)
|
||||
return f"{created} {self.correspondent} {self.title}"
|
||||
else:
|
||||
return f"{created} {self.title}"
|
||||
|
||||
@property
|
||||
def source_path(self):
|
||||
@@ -215,7 +248,7 @@ class Document(models.Model):
|
||||
else:
|
||||
fname = "{:07}{}".format(self.pk, self.file_type)
|
||||
if self.storage_type == self.STORAGE_TYPE_GPG:
|
||||
fname += ".gpg"
|
||||
fname += ".gpg" # pragma: no cover
|
||||
|
||||
return os.path.join(
|
||||
settings.ORIGINALS_DIR,
|
||||
@@ -227,28 +260,38 @@ class Document(models.Model):
|
||||
return open(self.source_path, "rb")
|
||||
|
||||
@property
|
||||
def archive_path(self):
|
||||
if self.filename:
|
||||
fname = archive_name_from_filename(self.filename)
|
||||
else:
|
||||
fname = "{:07}.pdf".format(self.pk)
|
||||
def has_archive_version(self):
|
||||
return self.archive_filename is not None
|
||||
|
||||
return os.path.join(
|
||||
settings.ARCHIVE_DIR,
|
||||
fname
|
||||
)
|
||||
@property
|
||||
def archive_path(self):
|
||||
if self.has_archive_version:
|
||||
return os.path.join(
|
||||
settings.ARCHIVE_DIR,
|
||||
str(self.archive_filename)
|
||||
)
|
||||
else:
|
||||
return None
|
||||
|
||||
@property
|
||||
def archive_file(self):
|
||||
return open(self.archive_path, "rb")
|
||||
|
||||
@property
|
||||
def file_name(self):
|
||||
return slugify(str(self)) + self.file_type
|
||||
def get_public_filename(self, archive=False, counter=0, suffix=None):
|
||||
result = str(self)
|
||||
|
||||
@property
|
||||
def archive_file_name(self):
|
||||
return slugify(str(self)) + ".pdf"
|
||||
if counter:
|
||||
result += f"_{counter:02}"
|
||||
|
||||
if suffix:
|
||||
result += suffix
|
||||
|
||||
if archive:
|
||||
result += ".pdf"
|
||||
else:
|
||||
result += self.file_type
|
||||
|
||||
return pathvalidate.sanitize_filename(result, replacement_text="-")
|
||||
|
||||
@property
|
||||
def file_type(self):
|
||||
@@ -273,76 +316,116 @@ class Document(models.Model):
|
||||
class Log(models.Model):
|
||||
|
||||
LEVELS = (
|
||||
(logging.DEBUG, "Debugging"),
|
||||
(logging.INFO, "Informational"),
|
||||
(logging.WARNING, "Warning"),
|
||||
(logging.ERROR, "Error"),
|
||||
(logging.CRITICAL, "Critical"),
|
||||
(logging.DEBUG, _("debug")),
|
||||
(logging.INFO, _("information")),
|
||||
(logging.WARNING, _("warning")),
|
||||
(logging.ERROR, _("error")),
|
||||
(logging.CRITICAL, _("critical")),
|
||||
)
|
||||
|
||||
group = models.UUIDField(blank=True, null=True)
|
||||
message = models.TextField()
|
||||
level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
|
||||
created = models.DateTimeField(auto_now_add=True)
|
||||
group = models.UUIDField(
|
||||
_("group"),
|
||||
blank=True, null=True)
|
||||
|
||||
message = models.TextField(_("message"))
|
||||
|
||||
level = models.PositiveIntegerField(
|
||||
_("level"),
|
||||
choices=LEVELS, default=logging.INFO)
|
||||
|
||||
created = models.DateTimeField(_("created"), auto_now_add=True)
|
||||
|
||||
class Meta:
|
||||
ordering = ("-created",)
|
||||
verbose_name = _("log")
|
||||
verbose_name_plural = _("logs")
|
||||
|
||||
def __str__(self):
|
||||
return self.message
|
||||
|
||||
|
||||
class SavedView(models.Model):
|
||||
|
||||
class Meta:
|
||||
|
||||
ordering = ("name",)
|
||||
verbose_name = _("saved view")
|
||||
verbose_name_plural = _("saved views")
|
||||
|
||||
user = models.ForeignKey(User, on_delete=models.CASCADE,
|
||||
verbose_name=_("user"))
|
||||
name = models.CharField(
|
||||
_("name"),
|
||||
max_length=128)
|
||||
|
||||
show_on_dashboard = models.BooleanField(
|
||||
_("show on dashboard"),
|
||||
)
|
||||
show_in_sidebar = models.BooleanField(
|
||||
_("show in sidebar"),
|
||||
)
|
||||
|
||||
sort_field = models.CharField(
|
||||
_("sort field"),
|
||||
max_length=128)
|
||||
sort_reverse = models.BooleanField(
|
||||
_("sort reverse"),
|
||||
default=False)
|
||||
|
||||
|
||||
class SavedViewFilterRule(models.Model):
|
||||
RULE_TYPES = [
|
||||
(0, _("title contains")),
|
||||
(1, _("content contains")),
|
||||
(2, _("ASN is")),
|
||||
(3, _("correspondent is")),
|
||||
(4, _("document type is")),
|
||||
(5, _("is in inbox")),
|
||||
(6, _("has tag")),
|
||||
(7, _("has any tag")),
|
||||
(8, _("created before")),
|
||||
(9, _("created after")),
|
||||
(10, _("created year is")),
|
||||
(11, _("created month is")),
|
||||
(12, _("created day is")),
|
||||
(13, _("added before")),
|
||||
(14, _("added after")),
|
||||
(15, _("modified before")),
|
||||
(16, _("modified after")),
|
||||
(17, _("does not have tag")),
|
||||
]
|
||||
|
||||
saved_view = models.ForeignKey(
|
||||
SavedView,
|
||||
on_delete=models.CASCADE,
|
||||
related_name="filter_rules",
|
||||
verbose_name=_("saved view")
|
||||
)
|
||||
|
||||
rule_type = models.PositiveIntegerField(
|
||||
_("rule type"),
|
||||
choices=RULE_TYPES)
|
||||
|
||||
value = models.CharField(
|
||||
_("value"),
|
||||
max_length=128,
|
||||
blank=True,
|
||||
null=True)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _("filter rule")
|
||||
verbose_name_plural = _("filter rules")
|
||||
|
||||
|
||||
# TODO: why is this in the models file?
|
||||
class FileInfo:
|
||||
|
||||
# This epic regex *almost* worked for our needs, so I'm keeping it here for
|
||||
# posterity, in the hopes that we might find a way to make it work one day.
|
||||
ALMOST_REGEX = re.compile(
|
||||
r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
|
||||
r"((?P<correspondent>{non_separated_word}+){separator})??"
|
||||
r"(?P<title>{non_separated_word}+)"
|
||||
r"({separator}(?P<tags>[a-z,0-9-]+))?"
|
||||
r"\.(?P<extension>[a-zA-Z.-]+)$".format(
|
||||
separator=r"\s+-\s+",
|
||||
non_separated_word=r"([\w,. ]|([^\s]-))"
|
||||
)
|
||||
)
|
||||
REGEXES = OrderedDict([
|
||||
("created-correspondent-title-tags", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-title-tags", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-correspondent-title", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("created-title", re.compile(
|
||||
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
|
||||
r"(?P<title>.*)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("correspondent-title-tags", re.compile(
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*) - "
|
||||
r"(?P<tags>[a-z0-9\-,]*)$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("correspondent-title", re.compile(
|
||||
r"(?P<correspondent>.*) - "
|
||||
r"(?P<title>.*)?$",
|
||||
flags=re.IGNORECASE
|
||||
)),
|
||||
("title", re.compile(
|
||||
r"(?P<title>.*)$",
|
||||
flags=re.IGNORECASE
|
||||
@@ -365,28 +448,10 @@ class FileInfo:
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _get_correspondent(cls, name):
|
||||
if not name:
|
||||
return None
|
||||
return Correspondent.objects.get_or_create(name=name, defaults={
|
||||
"slug": slugify(name)
|
||||
})[0]
|
||||
|
||||
@classmethod
|
||||
def _get_title(cls, title):
|
||||
return title
|
||||
|
||||
@classmethod
|
||||
def _get_tags(cls, tags):
|
||||
r = []
|
||||
for t in tags.split(","):
|
||||
r.append(Tag.objects.get_or_create(
|
||||
slug=slugify(t),
|
||||
defaults={"name": t}
|
||||
)[0])
|
||||
return tuple(r)
|
||||
|
||||
@classmethod
|
||||
def _mangle_property(cls, properties, name):
|
||||
if name in properties:
|
||||
@@ -396,15 +461,6 @@ class FileInfo:
|
||||
|
||||
@classmethod
|
||||
def from_filename(cls, filename):
|
||||
"""
|
||||
We use a crude naming convention to make handling the correspondent,
|
||||
title, and tags easier:
|
||||
"<date> - <correspondent> - <title> - <tags>"
|
||||
"<correspondent> - <title> - <tags>"
|
||||
"<correspondent> - <title>"
|
||||
"<title>"
|
||||
"""
|
||||
|
||||
# Mutate filename in-place before parsing its components
|
||||
# by applying at most one of the configured transformations.
|
||||
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
|
||||
@@ -435,7 +491,5 @@ class FileInfo:
|
||||
if m:
|
||||
properties = m.groupdict()
|
||||
cls._mangle_property(properties, "created")
|
||||
cls._mangle_property(properties, "correspondent")
|
||||
cls._mangle_property(properties, "title")
|
||||
cls._mangle_property(properties, "tags")
|
||||
return cls(**properties)
|
||||
|
Reference in New Issue
Block a user