removed matching model fields, automatic classifier reloading, added automatic_classification field to matching model

This commit is contained in:
Jonas Winkler 2018-09-04 18:40:26 +02:00
parent 30134034e2
commit 70bd05450a
8 changed files with 126 additions and 143 deletions

View File

@ -102,9 +102,8 @@ class CommonAdmin(admin.ModelAdmin):
class CorrespondentAdmin(CommonAdmin): class CorrespondentAdmin(CommonAdmin):
list_display = ("name", "match", "matching_algorithm", "document_count", "last_correspondence") list_display = ("name", "automatic_classification", "document_count", "last_correspondence")
list_filter = ("matching_algorithm",) list_editable = ("automatic_classification",)
list_editable = ("match", "matching_algorithm")
def get_queryset(self, request): def get_queryset(self, request):
qs = super(CorrespondentAdmin, self).get_queryset(request) qs = super(CorrespondentAdmin, self).get_queryset(request)
@ -122,10 +121,9 @@ class CorrespondentAdmin(CommonAdmin):
class TagAdmin(CommonAdmin): class TagAdmin(CommonAdmin):
list_display = ("name", "colour", "match", "matching_algorithm", list_display = ("name", "colour", "automatic_classification", "document_count")
"document_count") list_filter = ("colour",)
list_filter = ("colour", "matching_algorithm") list_editable = ("colour", "automatic_classification")
list_editable = ("colour", "match", "matching_algorithm")
def get_queryset(self, request): def get_queryset(self, request):
qs = super(TagAdmin, self).get_queryset(request) qs = super(TagAdmin, self).get_queryset(request)
@ -139,9 +137,8 @@ class TagAdmin(CommonAdmin):
class DocumentTypeAdmin(CommonAdmin): class DocumentTypeAdmin(CommonAdmin):
list_display = ("name", "match", "matching_algorithm", "document_count") list_display = ("name", "automatic_classification", "document_count")
list_filter = ("matching_algorithm",) list_editable = ("automatic_classification",)
list_editable = ("match", "matching_algorithm")
def get_queryset(self, request): def get_queryset(self, request):
qs = super(DocumentTypeAdmin, self).get_queryset(request) qs = super(DocumentTypeAdmin, self).get_queryset(request)

View File

@ -1,3 +1,4 @@
import os
import pickle import pickle
from documents.models import Correspondent, DocumentType, Tag from documents.models import Correspondent, DocumentType, Tag
@ -16,6 +17,18 @@ def preprocess_content(content):
class DocumentClassifier(object): class DocumentClassifier(object):
classifier_version = None
data_vectorizer = None
tags_binarizer = None
correspondent_binarizer = None
type_binarizer = None
tags_classifier = None
correspondent_classifier = None
type_classifier = None
@staticmethod @staticmethod
def load_classifier(): def load_classifier():
clf = DocumentClassifier() clf = DocumentClassifier()
@ -23,15 +36,18 @@ class DocumentClassifier(object):
return clf return clf
def reload(self): def reload(self):
with open(settings.MODEL_FILE, "rb") as f: if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
self.data_vectorizer = pickle.load(f) print("reloading classifier")
self.tags_binarizer = pickle.load(f) with open(settings.MODEL_FILE, "rb") as f:
self.correspondent_binarizer = pickle.load(f) self.data_vectorizer = pickle.load(f)
self.type_binarizer = pickle.load(f) self.tags_binarizer = pickle.load(f)
self.correspondent_binarizer = pickle.load(f)
self.type_binarizer = pickle.load(f)
self.tags_classifier = pickle.load(f) self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f) self.correspondent_classifier = pickle.load(f)
self.type_classifier = pickle.load(f) self.type_classifier = pickle.load(f)
self.classifier_version = os.path.getmtime(settings.MODEL_FILE)
def save_classifier(self): def save_classifier(self):
with open(settings.MODEL_FILE, "wb") as f: with open(settings.MODEL_FILE, "wb") as f:

6
src/documents/consumer.py Normal file → Executable file
View File

@ -221,12 +221,6 @@ class Consumer:
storage_type=self.storage_type storage_type=self.storage_type
) )
relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
if relevant_tags:
tag_names = ", ".join([t.slug for t in relevant_tags])
self.log("debug", "Tagging with {}".format(tag_names))
document.tags.add(*relevant_tags)
self._write(document, doc, document.source_path) self._write(document, doc, document.source_path)
self._write(document, thumbnail, document.thumbnail_path) self._write(document, thumbnail, document.thumbnail_path)

View File

@ -42,9 +42,14 @@ class Command(Renderable, BaseCommand):
# Step 2: vectorize data # Step 2: vectorize data
logging.getLogger(__name__).info("Vectorizing data...") logging.getLogger(__name__).info("Vectorizing data...")
clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05) clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
data_vectorized = clf.data_vectorizer.fit_transform(data) data_vectorized = clf.data_vectorizer.fit_transform(data)
print(clf.data_vectorizer.vocabulary_)
logging.getLogger(__name__).info("Shape of vectorized data: {}".format(data_vectorized.shape))
clf.tags_binarizer = MultiLabelBinarizer() clf.tags_binarizer = MultiLabelBinarizer()
labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags) labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)

View File

@ -46,7 +46,11 @@ class Command(Renderable, BaseCommand):
documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct() documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()
logging.getLogger(__name__).info("Loading classifier") logging.getLogger(__name__).info("Loading classifier")
clf = DocumentClassifier.load_classifier() try:
clf = DocumentClassifier.load_classifier()
except FileNotFoundError:
logging.getLogger(__name__).fatal("Cannot classify documents, classifier model file was not found.")
return
for document in documents: for document in documents:

View File

@ -0,0 +1,77 @@
# Generated by Django 2.0.8 on 2018-09-04 14:25
from django.db import migrations, models
def transfer_automatic_classification(apps, schema_editor):
    """Data migration: carry the old matching behaviour over to the new flag.

    Any Tag, Correspondent or DocumentType that previously had a non-empty
    ``match`` pattern was participating in matching, so its new
    ``automatic_classification`` boolean is set to True; everything else
    gets False.
    """
    for model_name in ["Tag", "Correspondent", "DocumentType"]:
        model_class = apps.get_model("documents", model_name)
        for o in model_class.objects.all():
            # bool("") and bool(None) are both False, so this is equivalent
            # to the explicit "is not None and len(...) > 0" check.
            o.automatic_classification = bool(o.match)
            o.save()
def reverse_automatic_classification(apps, schema_editor):
    """No-op reverse for the data migration.

    When the migration is unapplied, the ``automatic_classification``
    field is removed again by reversing the AddField operations, so there
    is no data to restore here.
    """
    pass
class Migration(migrations.Migration):
    """Replace the pattern-based matching fields (``match``,
    ``matching_algorithm``, ``is_insensitive``) on Correspondent,
    DocumentType and Tag with a single ``automatic_classification``
    boolean used by the new document classifier.
    """

    dependencies = [
        ('documents', '0023_auto_20180823_1155'),
    ]

    operations = [
        # 1) Add the new flag to all three models (default False).
        migrations.AddField(
            model_name='correspondent',
            name='automatic_classification',
            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
        ),
        migrations.AddField(
            model_name='documenttype',
            name='automatic_classification',
            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
        ),
        migrations.AddField(
            model_name='tag',
            name='automatic_classification',
            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
        ),
        # 2) Populate the flag from the old match field BEFORE it is dropped.
        migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
        # 3) Drop the superseded matching fields.
        migrations.RemoveField(
            model_name='correspondent',
            name='is_insensitive',
        ),
        migrations.RemoveField(
            model_name='correspondent',
            name='match',
        ),
        migrations.RemoveField(
            model_name='correspondent',
            name='matching_algorithm',
        ),
        migrations.RemoveField(
            model_name='documenttype',
            name='is_insensitive',
        ),
        migrations.RemoveField(
            model_name='documenttype',
            name='match',
        ),
        migrations.RemoveField(
            model_name='documenttype',
            name='matching_algorithm',
        ),
        migrations.RemoveField(
            model_name='tag',
            name='is_insensitive',
        ),
        migrations.RemoveField(
            model_name='tag',
            name='match',
        ),
        migrations.RemoveField(
            model_name='tag',
            name='matching_algorithm',
        ),
    ]

View File

@ -15,48 +15,15 @@ from django.db import models
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.utils import timezone from django.utils import timezone
from reminders.models import Reminder
from .managers import LogManager from .managers import LogManager
class MatchingModel(models.Model): class MatchingModel(models.Model):
MATCH_ANY = 1
MATCH_ALL = 2
MATCH_LITERAL = 3
MATCH_REGEX = 4
MATCH_FUZZY = 5
MATCHING_ALGORITHMS = (
(MATCH_ANY, "Any"),
(MATCH_ALL, "All"),
(MATCH_LITERAL, "Literal"),
(MATCH_REGEX, "Regular Expression"),
(MATCH_FUZZY, "Fuzzy Match"),
)
name = models.CharField(max_length=128, unique=True) name = models.CharField(max_length=128, unique=True)
slug = models.SlugField(blank=True) slug = models.SlugField(blank=True)
match = models.CharField(max_length=256, blank=True) automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')
matching_algorithm = models.PositiveIntegerField(
choices=MATCHING_ALGORITHMS,
default=MATCH_ANY,
help_text=(
"Which algorithm you want to use when matching text to the OCR'd "
"PDF. Here, \"any\" looks for any occurrence of any word "
"provided in the PDF, while \"all\" requires that every word "
"provided appear in the PDF, albeit not in the order provided. A "
"\"literal\" match means that the text you enter must appear in "
"the PDF exactly as you've entered it, and \"regular expression\" "
"uses a regex to match the PDF. (If you don't know what a regex "
"is, you probably don't want this option.) Finally, a \"fuzzy "
"match\" looks for words or phrases that are mostly—but not "
"exactly—the same, which can be useful for matching against "
"documents containg imperfections that foil accurate OCR."
)
)
is_insensitive = models.BooleanField(default=True)
class Meta: class Meta:
abstract = True abstract = True
@ -64,87 +31,8 @@ class MatchingModel(models.Model):
def __str__(self): def __str__(self):
return self.name return self.name
@property
def conditions(self):
return "{}: \"{}\" ({})".format(
self.name, self.match, self.get_matching_algorithm_display())
@classmethod
def match_all(cls, text, tags=None):
if tags is None:
tags = cls.objects.all()
text = text.lower()
for tag in tags:
if tag.matches(text):
yield tag
def matches(self, text):
search_kwargs = {}
# Check that match is not empty
if self.match.strip() == "":
return False
if self.is_insensitive:
search_kwargs = {"flags": re.IGNORECASE}
if self.matching_algorithm == self.MATCH_ALL:
for word in self._split_match():
search_result = re.search(
r"\b{}\b".format(word), text, **search_kwargs)
if not search_result:
return False
return True
if self.matching_algorithm == self.MATCH_ANY:
for word in self._split_match():
if re.search(r"\b{}\b".format(word), text, **search_kwargs):
return True
return False
if self.matching_algorithm == self.MATCH_LITERAL:
return bool(re.search(
r"\b{}\b".format(self.match), text, **search_kwargs))
if self.matching_algorithm == self.MATCH_REGEX:
return bool(re.search(
re.compile(self.match, **search_kwargs), text))
if self.matching_algorithm == self.MATCH_FUZZY:
match = re.sub(r'[^\w\s]', '', self.match)
text = re.sub(r'[^\w\s]', '', text)
if self.is_insensitive:
match = match.lower()
text = text.lower()
return True if fuzz.partial_ratio(match, text) >= 90 else False
raise NotImplementedError("Unsupported matching algorithm")
def _split_match(self):
"""
Splits the match to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with\s+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
for t in findterms(self.match)
]
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
self.match = self.match.lower()
if not self.slug: if not self.slug:
self.slug = slugify(self.name) self.slug = slugify(self.name)

View File

@ -16,15 +16,17 @@ def logger(message, group):
logging.getLogger(__name__).debug(message, extra={"group": group}) logging.getLogger(__name__).debug(message, extra={"group": group})
classifier = None classifier = DocumentClassifier()
def classify_document(sender, document=None, logging_group=None, **kwargs): def classify_document(sender, document=None, logging_group=None, **kwargs):
global classifier global classifier
if classifier is None: try:
classifier = DocumentClassifier.load_classifier() classifier.reload()
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
except FileNotFoundError:
logging.getLogger(__name__).fatal("Cannot classify document, classifier model file was not found.")
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)