Implement automatic_classification field handling in the document classifier

2026-02-09 23:49:29 -06:00 · 2018-09-05 14:31:02 +02:00
parent 582e9c5cb4
commit bbba57dd4d
2 changed files with 7 additions and 5 deletions
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -43,7 +43,7 @@ class DocumentClassifier(object):

    def reload(self):
        if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
-            print("reloading classifier")
+            logging.getLogger(__name__).info("Reloading classifier models")
            with open(settings.MODEL_FILE, "rb") as f:
                self.data_vectorizer = pickle.load(f)
                self.tags_binarizer = pickle.load(f)
@@ -77,11 +77,14 @@ class DocumentClassifier(object):
        logging.getLogger(__name__).info("Gathering data from database...")
        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
            data.append(preprocess_content(doc.content))
-            labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
-            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
-            tags = [tag.name for tag in doc.tags.all()]
+            labels_type.append(doc.document_type.name if doc.document_type is not None and doc.document_type.automatic_classification else "-")
+            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None and doc.correspondent.automatic_classification else "-")
+            tags = [tag.name for tag in doc.tags.filter(automatic_classification=True)]
            labels_tags.append(tags)

+        labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
+        logging.getLogger(__name__).info("{} documents, {} tag(s) {}, {} correspondent(s) {}, {} type(s) {}.".format(len(data), len(labels_tags_unique), labels_tags_unique, len(set(labels_correspondent)), set(labels_correspondent), len(set(labels_type)), set(labels_type)))
+
        # Step 2: vectorize data
        logging.getLogger(__name__).info("Vectorizing data...")
        self.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
--- a/src/documents/management/commands/document_create_classifier.py
+++ b/src/documents/management/commands/document_create_classifier.py
@@ -22,7 +22,6 @@ class Command(Renderable, BaseCommand):

        clf.train()

-
        logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...")

        clf.save_classifier()