unified document matching, legacy and automatching work alongside now

This commit is contained in:
Jonas Winkler
2020-10-28 11:45:11 +01:00
parent 9e4147ac52
commit 11af74ba36
16 changed files with 629 additions and 225 deletions

View File

@@ -7,13 +7,11 @@ import uuid
from collections import OrderedDict
import dateutil.parser
from django.dispatch import receiver
from django.conf import settings
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.utils.text import slugify
from collections import defaultdict
from .managers import LogManager
@@ -25,15 +23,46 @@ except ImportError:
class MatchingModel(models.Model):
MATCH_ANY = 1
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCH_AUTO = 6
MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"),
(MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
(MATCH_AUTO, "Automatic Classification"),
)
name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True, editable=False)
automatic_classification = models.BooleanField(
default=False,
help_text="Automatically assign to newly added documents based on "
"current usage in your document collection."
match = models.CharField(max_length=256, blank=True)
matching_algorithm = models.PositiveIntegerField(
choices=MATCHING_ALGORITHMS,
default=MATCH_ANY,
help_text=(
"Which algorithm you want to use when matching text to the OCR'd "
"PDF. Here, \"any\" looks for any occurrence of any word "
"provided in the PDF, while \"all\" requires that every word "
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
)
)
is_insensitive = models.BooleanField(default=True)
class Meta:
abstract = True
ordering = ("name",)
@@ -43,6 +72,7 @@ class MatchingModel(models.Model):
def save(self, *args, **kwargs):
self.match = self.match.lower()
self.slug = slugify(self.name)
models.Model.save(self, *args, **kwargs)