Implemented the classifier model, including automatic tagging of new documents

This commit is contained in:
Jonas Winkler
2018-09-04 14:39:55 +02:00
parent 3eecd67fc1
commit c50c517928
10 changed files with 240 additions and 339 deletions

View File

@@ -1,82 +0,0 @@
import sys

from django.core.management.base import BaseCommand

from documents.models import Correspondent, Document

from ...mixins import Renderable


class Command(Renderable, BaseCommand):

    help = """
        Using the current set of correspondent rules, apply said rules to all
        documents in the database, effectively allowing you to back-tag all
        previously indexed documents with correspondents created (or modified)
        after their initial import.
    """.replace("    ", "")

    TOO_MANY_CONTINUE = (
        "Detected {} potential correspondents for {}, so we've opted for {}")
    TOO_MANY_SKIP = (
        "Detected {} potential correspondents for {}, so we're skipping it")
    CHANGE_MESSAGE = (
        'Document {}: "{}" was given the correspondent id {}: "{}"')

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "--use-first",
            default=False,
            action="store_true",
            help="By default this command won't try to assign a correspondent "
                 "if more than one matches the document. Use this flag if "
                 "you'd rather it just pick the first one it finds."
        )

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        for document in Document.objects.filter(
                correspondent__isnull=True).exclude(
                tags__is_archived_tag=True):

            potential_correspondents = list(
                Correspondent.match_all(document.content))

            if not potential_correspondents:
                continue

            potential_count = len(potential_correspondents)
            correspondent = potential_correspondents[0]

            if potential_count > 1:
                if not options["use_first"]:
                    print(
                        self.TOO_MANY_SKIP.format(potential_count, document),
                        file=sys.stderr
                    )
                    continue
                print(
                    self.TOO_MANY_CONTINUE.format(
                        potential_count,
                        document,
                        correspondent
                    ),
                    file=sys.stderr
                )

            document.correspondent = correspondent
            document.save(update_fields=("correspondent",))

            print(
                self.CHANGE_MESSAGE.format(
                    document.pk,
                    document.title,
                    correspondent.pk,
                    correspondent.name
                ),
                file=sys.stderr
            )
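
The Correspondent.match_all helper used above lives in documents/models.py and is not part of this diff. A minimal sketch of what such a rule-matching helper could look like, assuming a per-object matches() predicate (that predicate is an assumption here, not shown in this commit):

from documents.models import Correspondent

def match_all(text):
    # Hedged sketch only: yield each correspondent whose matching rule
    # hits the given text. matches() is an assumed per-model predicate.
    for correspondent in Correspondent.objects.all():
        if correspondent.matches(text):
            yield correspondent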

View File

@@ -1,100 +1,84 @@
import logging
import os.path
import pickle

from django.core.management.base import BaseCommand

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

from documents.models import Document

from ...mixins import Renderable


def preprocess_content(content):
    content = content.lower()
    content = content.strip()
    content = content.replace("\n", " ")
    content = content.replace("\r", " ")
    while content.find("  ") > -1:
        content = content.replace("  ", " ")
    return content


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        data = list()
        labels_tags = list()
        labels_correspondent = list()
        labels_type = list()

        # Step 1: Extract and preprocess training data from the database.
        logging.getLogger(__name__).info("Gathering data from database...")
        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
            data.append(preprocess_content(doc.content))
            labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
            tags = [tag.name for tag in doc.tags.all()]
            labels_tags.append(tags)

        # Step 2: vectorize data
        logging.getLogger(__name__).info("Vectorizing data...")
        data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05)
        data_vectorized = data_vectorizer.fit_transform(data)

        tags_binarizer = MultiLabelBinarizer()
        labels_tags_vectorized = tags_binarizer.fit_transform(labels_tags)

        correspondent_binarizer = LabelEncoder()
        labels_correspondent_vectorized = correspondent_binarizer.fit_transform(labels_correspondent)

        type_binarizer = LabelEncoder()
        labels_type_vectorized = type_binarizer.fit_transform(labels_type)

        # Step 3: train the classifiers
        if len(tags_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training tags classifier")
            tags_classifier = OneVsRestClassifier(MultinomialNB())
            tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            tags_classifier = None
            logging.getLogger(__name__).info("There are no tags. Not training tags classifier.")

        if len(correspondent_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training correspondent classifier")
            correspondent_classifier = MultinomialNB()
            correspondent_classifier.fit(data_vectorized, labels_correspondent_vectorized)
        else:
            correspondent_classifier = None
            logging.getLogger(__name__).info("There are no correspondents. Not training correspondent classifier.")

        if len(type_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training document type classifier")
            type_classifier = MultinomialNB()
            type_classifier.fit(data_vectorized, labels_type_vectorized)
        else:
            type_classifier = None
            logging.getLogger(__name__).info("There are no document types. Not training document type classifier.")

        models_root = os.path.abspath(os.path.join(os.path.dirname(__name__), "..", "models", "models.pickle"))
        logging.getLogger(__name__).info("Saving models to " + models_root + "...")

        with open(models_root, "wb") as f:
            pickle.dump(data_vectorizer, f)
            pickle.dump(tags_binarizer, f)
            pickle.dump(correspondent_binarizer, f)
            pickle.dump(type_binarizer, f)
            pickle.dump(tags_classifier, f)
            pickle.dump(correspondent_classifier, f)
            pickle.dump(type_classifier, f)
import logging
import os.path
import pickle

from django.core.management.base import BaseCommand

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

from documents.classifier import preprocess_content, DocumentClassifier
from documents.models import Document
from paperless import settings

from ...mixins import Renderable


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        clf = DocumentClassifier()

        data = list()
        labels_tags = list()
        labels_correspondent = list()
        labels_type = list()

        # Step 1: Extract and preprocess training data from the database.
        logging.getLogger(__name__).info("Gathering data from database...")
        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
            data.append(preprocess_content(doc.content))
            labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
            tags = [tag.name for tag in doc.tags.all()]
            labels_tags.append(tags)

        # Step 2: vectorize data
        logging.getLogger(__name__).info("Vectorizing data...")
        clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05)
        data_vectorized = clf.data_vectorizer.fit_transform(data)

        clf.tags_binarizer = MultiLabelBinarizer()
        labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)

        clf.correspondent_binarizer = LabelEncoder()
        labels_correspondent_vectorized = clf.correspondent_binarizer.fit_transform(labels_correspondent)

        clf.type_binarizer = LabelEncoder()
        labels_type_vectorized = clf.type_binarizer.fit_transform(labels_type)

        # Step 3: train the classifiers
        if len(clf.tags_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training tags classifier")
            clf.tags_classifier = OneVsRestClassifier(MultinomialNB())
            clf.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            clf.tags_classifier = None
            logging.getLogger(__name__).info("There are no tags. Not training tags classifier.")

        if len(clf.correspondent_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training correspondent classifier")
            clf.correspondent_classifier = MultinomialNB()
            clf.correspondent_classifier.fit(data_vectorized, labels_correspondent_vectorized)
        else:
            clf.correspondent_classifier = None
            logging.getLogger(__name__).info("There are no correspondents. Not training correspondent classifier.")

        if len(clf.type_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training document type classifier")
            clf.type_classifier = MultinomialNB()
            clf.type_classifier.fit(data_vectorized, labels_type_vectorized)
        else:
            clf.type_classifier = None
            logging.getLogger(__name__).info("There are no document types. Not training document type classifier.")

        logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...")

        clf.save_classifier()
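
documents/classifier.py itself is not among the hunks shown in this view. From the way the command above populates clf and calls save_classifier(), and from the pickle sequence in the deleted version of this file, the new class plausibly looks something like the sketch below. This is a reconstruction from usage, not the committed file; only settings.MODEL_FILE and the attribute names appear in the diff itself.

import pickle

from paperless import settings  # assumed to define MODEL_FILE, as used above


class DocumentClassifier:
    # Sketch reconstructed from usage in this commit; the real file is
    # not shown in this diff view.

    def __init__(self):
        self.data_vectorizer = None
        self.tags_binarizer = None
        self.correspondent_binarizer = None
        self.type_binarizer = None
        self.tags_classifier = None
        self.correspondent_classifier = None
        self.type_classifier = None

    def save_classifier(self):
        # Pickle order must mirror the load order below.
        with open(settings.MODEL_FILE, "wb") as f:
            for obj in (self.data_vectorizer, self.tags_binarizer,
                        self.correspondent_binarizer, self.type_binarizer,
                        self.tags_classifier, self.correspondent_classifier,
                        self.type_classifier):
                pickle.dump(obj, f)

    @staticmethod
    def load_classifier():
        clf = DocumentClassifier()
        with open(settings.MODEL_FILE, "rb") as f:
            (clf.data_vectorizer, clf.tags_binarizer,
             clf.correspondent_binarizer, clf.type_binarizer,
             clf.tags_classifier, clf.correspondent_classifier,
             clf.type_classifier) = (pickle.load(f) for _ in range(7))
        return clf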

View File

@@ -1,49 +1,40 @@
from django.core.management.base import BaseCommand

from documents.models import Document

from ...mixins import Renderable


def preprocess_content(content):
    content = content.lower()
    content = content.strip()
    content = content.replace("\n", " ")
    content = content.replace("\r", " ")
    while content.find("  ") > -1:
        content = content.replace("  ", " ")
    return content


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        with open("dataset_tags.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                labels = []
                for tag in doc.tags.all():
                    labels.append(tag.name)
                f.write(",".join(labels))
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_types.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.document_type.name if doc.document_type is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_correspondents.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.correspondent.name if doc.correspondent is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")
from django.core.management.base import BaseCommand

from documents.classifier import preprocess_content
from documents.models import Document

from ...mixins import Renderable


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        with open("dataset_tags.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                labels = []
                for tag in doc.tags.all():
                    labels.append(tag.name)
                f.write(",".join(labels))
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_types.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.document_type.name if doc.document_type is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_correspondents.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.correspondent.name if doc.correspondent is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")
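
Each line of these exports is "labels;content": a comma-separated tag list (or a single correspondent/type name) followed by the preprocessed document body. A minimal reader for such a file might look like the sketch below; it splits on the first ";" only, since the body may itself contain semicolons, and the function name is an illustration rather than anything in this commit.

def read_dataset(path):
    # Parse one of the dataset_*.txt exports into (labels, content) pairs.
    samples = []
    with open(path) as f:
        for line in f:
            labels, _, content = line.rstrip("\n").partition(";")
            samples.append((labels.split(","), content))
    return samples

# samples = read_dataset("dataset_tags.txt")   # [(["tax", "insurance"], "..."), ...]
# For dataset_types.txt and dataset_correspondents.txt the label field holds a
# single name, so labels is a one-element list.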

View File

@@ -1,5 +1,8 @@
import logging

from django.core.management.base import BaseCommand

from documents.classifier import DocumentClassifier
from documents.models import Document, Tag

from ...mixins import Renderable
@@ -8,25 +11,44 @@ from ...mixins import Renderable
class Command(Renderable, BaseCommand):

    help = """
        Using the current set of tagging rules, apply said rules to all
        documents in the database, effectively allowing you to back-tag all
        previously indexed documents with tags created (or modified) after
        their initial import.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        for document in Document.objects.all().exclude(tags__is_archived_tag=True):

            tags = Tag.objects.exclude(
                pk__in=document.tags.values_list("pk", flat=True))

            for tag in Tag.match_all(document.content, tags):
                print('Tagging {} with "{}"'.format(document, tag))
                document.tags.add(tag)

class Command(Renderable, BaseCommand):

    help = """
        There is no help. #TODO
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-c", "--correspondent",
            action="store_true"
        )
        parser.add_argument(
            "-T", "--tags",
            action="store_true"
        )
        parser.add_argument(
            "-t", "--type",
            action="store_true"
        )
        parser.add_argument(
            "-i", "--inbox-only",
            action="store_true"
        )

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        if options['inbox_only']:
            documents = Document.objects.filter(tags__is_inbox_tag=True).distinct()
        else:
            documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()

        logging.getLogger(__name__).info("Loading classifier")
        clf = DocumentClassifier.load_classifier()

        for document in documents:
            logging.getLogger(__name__).info("Processing document {}".format(document.title))
            clf.classify_document(document, classify_type=options['type'], classify_tags=options['tags'], classify_correspondent=options['correspondent'])
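
classify_document is likewise defined in the unshown documents/classifier.py. Given the training code earlier in this commit, its core logic plausibly inverts the same transforms: vectorize the preprocessed content, predict, then map predictions back to object names. The sketch below is a reconstruction from usage, not the committed method; the "-" placeholder comes from the training command above, and the lookup-by-name strategy is an assumption.

from documents.classifier import preprocess_content
from documents.models import Correspondent, DocumentType, Tag


def classify_document(clf, document,
                      classify_correspondent=False,
                      classify_type=False,
                      classify_tags=False):
    # Reuse the exact preprocessing and vectorizer the models were trained with.
    X = clf.data_vectorizer.transform([preprocess_content(document.content)])

    if classify_correspondent and clf.correspondent_classifier is not None:
        name = clf.correspondent_binarizer.inverse_transform(
            clf.correspondent_classifier.predict(X))[0]
        if name != "-":  # "-" was the training placeholder for "no correspondent"
            document.correspondent = Correspondent.objects.filter(name=name).first()

    if classify_type and clf.type_classifier is not None:
        name = clf.type_binarizer.inverse_transform(
            clf.type_classifier.predict(X))[0]
        if name != "-":
            document.document_type = DocumentType.objects.filter(name=name).first()

    if classify_tags and clf.tags_classifier is not None:
        # MultiLabelBinarizer.inverse_transform returns one tuple of tag
        # names per sample.
        names = clf.tags_binarizer.inverse_transform(
            clf.tags_classifier.predict(X))[0]
        document.tags.add(*Tag.objects.filter(name__in=names))

    document.save()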