Implemented the classifier model, including automatic tagging of new documents

This commit is contained in:
Jonas Winkler
2018-09-04 14:39:55 +02:00
parent 3eecd67fc1
commit c50c517928
10 changed files with 240 additions and 339 deletions

View File

@@ -1,82 +0,0 @@
import sys

from django.core.management.base import BaseCommand

from documents.models import Correspondent, Document

from ...mixins import Renderable


class Command(Renderable, BaseCommand):

    help = """
        Using the current set of correspondent rules, apply said rules to all
        documents in the database, effectively allowing you to back-tag all
        previously indexed documents with correspondents created (or modified)
        after their initial import.
    """.replace("    ", "")

    TOO_MANY_CONTINUE = (
        "Detected {} potential correspondents for {}, so we've opted for {}")
    TOO_MANY_SKIP = (
        "Detected {} potential correspondents for {}, so we're skipping it")
    CHANGE_MESSAGE = (
        'Document {}: "{}" was given the correspondent id {}: "{}"')

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "--use-first",
            default=False,
            action="store_true",
            help="By default this command won't try to assign a correspondent "
                 "if more than one matches the document. Use this flag if "
                 "you'd rather it just pick the first one it finds."
        )

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        for document in Document.objects.filter(
                correspondent__isnull=True).exclude(
                tags__is_archived_tag=True):

            potential_correspondents = list(
                Correspondent.match_all(document.content))

            if not potential_correspondents:
                continue

            potential_count = len(potential_correspondents)
            correspondent = potential_correspondents[0]

            if potential_count > 1:
                if not options["use_first"]:
                    print(
                        self.TOO_MANY_SKIP.format(potential_count, document),
                        file=sys.stderr
                    )
                    continue
                print(
                    self.TOO_MANY_CONTINUE.format(
                        potential_count,
                        document,
                        correspondent
                    ),
                    file=sys.stderr
                )

            document.correspondent = correspondent
            document.save(update_fields=("correspondent",))

            print(
                self.CHANGE_MESSAGE.format(
                    document.pk,
                    document.title,
                    correspondent.pk,
                    correspondent.name
                ),
                file=sys.stderr
            )
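
The Correspondent.match_all helper used above lives in documents/models.py and is not part of this diff. A minimal sketch of what such a rule-matching helper could look like, assuming a per-object matches() predicate (that predicate is an assumption here, not shown in this commit):

from documents.models import Correspondent

def match_all(text):
    # Hedged sketch only: yield each correspondent whose matching rule
    # hits the given text. matches() is an assumed per-model predicate.
    for correspondent in Correspondent.objects.all():
        if correspondent.matches(text):
            yield correspondent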

View File

@@ -1,100 +1,84 @@
import logging
import os.path
import pickle

from django.core.management.base import BaseCommand

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

from documents.models import Document

from ...mixins import Renderable


def preprocess_content(content):
    content = content.lower()
    content = content.strip()
    content = content.replace("\n", " ")
    content = content.replace("\r", " ")
    while content.find("  ") > -1:
        content = content.replace("  ", " ")
    return content


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        data = list()
        labels_tags = list()
        labels_correspondent = list()
        labels_type = list()

        # Step 1: Extract and preprocess training data from the database.
        logging.getLogger(__name__).info("Gathering data from database...")
        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
            data.append(preprocess_content(doc.content))
            labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
            tags = [tag.name for tag in doc.tags.all()]
            labels_tags.append(tags)

        # Step 2: vectorize data
        logging.getLogger(__name__).info("Vectorizing data...")
        data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05)
        data_vectorized = data_vectorizer.fit_transform(data)

        tags_binarizer = MultiLabelBinarizer()
        labels_tags_vectorized = tags_binarizer.fit_transform(labels_tags)

        correspondent_binarizer = LabelEncoder()
        labels_correspondent_vectorized = correspondent_binarizer.fit_transform(labels_correspondent)

        type_binarizer = LabelEncoder()
        labels_type_vectorized = type_binarizer.fit_transform(labels_type)

        # Step 3: train the classifiers
        if len(tags_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training tags classifier")
            tags_classifier = OneVsRestClassifier(MultinomialNB())
            tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            tags_classifier = None
            logging.getLogger(__name__).info("There are no tags. Not training tags classifier.")

        if len(correspondent_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training correspondent classifier")
            correspondent_classifier = MultinomialNB()
            correspondent_classifier.fit(data_vectorized, labels_correspondent_vectorized)
        else:
            correspondent_classifier = None
            logging.getLogger(__name__).info("There are no correspondents. Not training correspondent classifier.")

        if len(type_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training document type classifier")
            type_classifier = MultinomialNB()
            type_classifier.fit(data_vectorized, labels_type_vectorized)
        else:
            type_classifier = None
            logging.getLogger(__name__).info("There are no document types. Not training document type classifier.")

        models_root = os.path.abspath(os.path.join(os.path.dirname(__name__), "..", "models", "models.pickle"))
        logging.getLogger(__name__).info("Saving models to " + models_root + "...")

        with open(models_root, "wb") as f:
            pickle.dump(data_vectorizer, f)
            pickle.dump(tags_binarizer, f)
            pickle.dump(correspondent_binarizer, f)
            pickle.dump(type_binarizer, f)
            pickle.dump(tags_classifier, f)
            pickle.dump(correspondent_classifier, f)
            pickle.dump(type_classifier, f)
import logging
import os.path
import pickle

from django.core.management.base import BaseCommand

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

from documents.classifier import preprocess_content, DocumentClassifier
from documents.models import Document
from paperless import settings

from ...mixins import Renderable


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        clf = DocumentClassifier()

        data = list()
        labels_tags = list()
        labels_correspondent = list()
        labels_type = list()

        # Step 1: Extract and preprocess training data from the database.
        logging.getLogger(__name__).info("Gathering data from database...")
        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
            data.append(preprocess_content(doc.content))
            labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
            tags = [tag.name for tag in doc.tags.all()]
            labels_tags.append(tags)

        # Step 2: vectorize data
        logging.getLogger(__name__).info("Vectorizing data...")
        clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05)
        data_vectorized = clf.data_vectorizer.fit_transform(data)

        clf.tags_binarizer = MultiLabelBinarizer()
        labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)

        clf.correspondent_binarizer = LabelEncoder()
        labels_correspondent_vectorized = clf.correspondent_binarizer.fit_transform(labels_correspondent)

        clf.type_binarizer = LabelEncoder()
        labels_type_vectorized = clf.type_binarizer.fit_transform(labels_type)

        # Step 3: train the classifiers
        if len(clf.tags_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training tags classifier")
            clf.tags_classifier = OneVsRestClassifier(MultinomialNB())
            clf.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            clf.tags_classifier = None
            logging.getLogger(__name__).info("There are no tags. Not training tags classifier.")

        if len(clf.correspondent_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training correspondent classifier")
            clf.correspondent_classifier = MultinomialNB()
            clf.correspondent_classifier.fit(data_vectorized, labels_correspondent_vectorized)
        else:
            clf.correspondent_classifier = None
            logging.getLogger(__name__).info("There are no correspondents. Not training correspondent classifier.")

        if len(clf.type_binarizer.classes_) > 0:
            logging.getLogger(__name__).info("Training document type classifier")
            clf.type_classifier = MultinomialNB()
            clf.type_classifier.fit(data_vectorized, labels_type_vectorized)
        else:
            clf.type_classifier = None
            logging.getLogger(__name__).info("There are no document types. Not training document type classifier.")

        logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...")

        clf.save_classifier()
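
documents/classifier.py itself is not among the hunks shown in this view. From the way the command above populates clf and calls save_classifier(), and from the pickle sequence in the deleted version of this file, the new class plausibly looks something like the sketch below. This is a reconstruction from usage, not the committed file; only settings.MODEL_FILE and the attribute names appear in the diff itself.

import pickle

from paperless import settings  # assumed to define MODEL_FILE, as used above


class DocumentClassifier:
    # Sketch reconstructed from usage in this commit; the real file is
    # not shown in this diff view.

    def __init__(self):
        self.data_vectorizer = None
        self.tags_binarizer = None
        self.correspondent_binarizer = None
        self.type_binarizer = None
        self.tags_classifier = None
        self.correspondent_classifier = None
        self.type_classifier = None

    def save_classifier(self):
        # Pickle order must mirror the load order below.
        with open(settings.MODEL_FILE, "wb") as f:
            for obj in (self.data_vectorizer, self.tags_binarizer,
                        self.correspondent_binarizer, self.type_binarizer,
                        self.tags_classifier, self.correspondent_classifier,
                        self.type_classifier):
                pickle.dump(obj, f)

    @staticmethod
    def load_classifier():
        clf = DocumentClassifier()
        with open(settings.MODEL_FILE, "rb") as f:
            (clf.data_vectorizer, clf.tags_binarizer,
             clf.correspondent_binarizer, clf.type_binarizer,
             clf.tags_classifier, clf.correspondent_classifier,
             clf.type_classifier) = (pickle.load(f) for _ in range(7))
        return clf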

View File

@@ -1,49 +1,40 @@
from django.core.management.base import BaseCommand

from documents.models import Document

from ...mixins import Renderable


def preprocess_content(content):
    content = content.lower()
    content = content.strip()
    content = content.replace("\n", " ")
    content = content.replace("\r", " ")
    while content.find("  ") > -1:
        content = content.replace("  ", " ")
    return content


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        with open("dataset_tags.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                labels = []
                for tag in doc.tags.all():
                    labels.append(tag.name)
                f.write(",".join(labels))
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_types.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.document_type.name if doc.document_type is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_correspondents.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.correspondent.name if doc.correspondent is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")
from django.core.management.base import BaseCommand

from documents.classifier import preprocess_content
from documents.models import Document

from ...mixins import Renderable


class Command(Renderable, BaseCommand):

    help = """
        There is no help.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):
        with open("dataset_tags.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                labels = []
                for tag in doc.tags.all():
                    labels.append(tag.name)
                f.write(",".join(labels))
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_types.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.document_type.name if doc.document_type is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")

        with open("dataset_correspondents.txt", "w") as f:
            for doc in Document.objects.exclude(tags__is_inbox_tag=True):
                f.write(doc.correspondent.name if doc.correspondent is not None else "None")
                f.write(";")
                f.write(preprocess_content(doc.content))
                f.write("\n")
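
Each line of these exports is "labels;content": a comma-separated tag list (or a single correspondent/type name) followed by the preprocessed document body. A minimal reader for such a file might look like the sketch below; it splits on the first ";" only, since the body may itself contain semicolons, and the function name is an illustration rather than anything in this commit.

def read_dataset(path):
    # Parse one of the dataset_*.txt exports into (labels, content) pairs.
    samples = []
    with open(path) as f:
        for line in f:
            labels, _, content = line.rstrip("\n").partition(";")
            samples.append((labels.split(","), content))
    return samples

# samples = read_dataset("dataset_tags.txt")   # [(["tax", "insurance"], "..."), ...]
# For dataset_types.txt and dataset_correspondents.txt the label field holds a
# single name, so labels is a one-element list.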

View File

@@ -1,5 +1,8 @@
import logging

from django.core.management.base import BaseCommand

from documents.classifier import DocumentClassifier
from documents.models import Document, Tag

from ...mixins import Renderable
@@ -8,25 +11,44 @@ from ...mixins import Renderable
class Command(Renderable, BaseCommand):

    help = """
        Using the current set of tagging rules, apply said rules to all
        documents in the database, effectively allowing you to back-tag all
        previously indexed documents with tags created (or modified) after
        their initial import.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        for document in Document.objects.all().exclude(tags__is_archived_tag=True):

            tags = Tag.objects.exclude(
                pk__in=document.tags.values_list("pk", flat=True))

            for tag in Tag.match_all(document.content, tags):
                print('Tagging {} with "{}"'.format(document, tag))
                document.tags.add(tag)

class Command(Renderable, BaseCommand):

    help = """
        There is no help. #TODO
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-c", "--correspondent",
            action="store_true"
        )
        parser.add_argument(
            "-T", "--tags",
            action="store_true"
        )
        parser.add_argument(
            "-t", "--type",
            action="store_true"
        )
        parser.add_argument(
            "-i", "--inbox-only",
            action="store_true"
        )

    def handle(self, *args, **options):

        self.verbosity = options["verbosity"]

        if options['inbox_only']:
            documents = Document.objects.filter(tags__is_inbox_tag=True).distinct()
        else:
            documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()

        logging.getLogger(__name__).info("Loading classifier")
        clf = DocumentClassifier.load_classifier()

        for document in documents:
            logging.getLogger(__name__).info("Processing document {}".format(document.title))
            clf.classify_document(document, classify_type=options['type'], classify_tags=options['tags'], classify_correspondent=options['correspondent'])
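
classify_document is likewise defined in the unshown documents/classifier.py. Given the training code earlier in this commit, its core logic plausibly inverts the same transforms: vectorize the preprocessed content, predict, then map predictions back to object names. The sketch below is a reconstruction from usage, not the committed method; the "-" placeholder comes from the training command above, and the lookup-by-name strategy is an assumption.

from documents.classifier import preprocess_content
from documents.models import Correspondent, DocumentType, Tag


def classify_document(clf, document,
                      classify_correspondent=False,
                      classify_type=False,
                      classify_tags=False):
    # Reuse the exact preprocessing and vectorizer the models were trained with.
    X = clf.data_vectorizer.transform([preprocess_content(document.content)])

    if classify_correspondent and clf.correspondent_classifier is not None:
        name = clf.correspondent_binarizer.inverse_transform(
            clf.correspondent_classifier.predict(X))[0]
        if name != "-":  # "-" was the training placeholder for "no correspondent"
            document.correspondent = Correspondent.objects.filter(name=name).first()

    if classify_type and clf.type_classifier is not None:
        name = clf.type_binarizer.inverse_transform(
            clf.type_classifier.predict(X))[0]
        if name != "-":
            document.document_type = DocumentType.objects.filter(name=name).first()

    if classify_tags and clf.tags_classifier is not None:
        # MultiLabelBinarizer.inverse_transform returns one tuple of tag
        # names per sample.
        names = clf.tags_binarizer.inverse_transform(
            clf.tags_classifier.predict(X))[0]
        document.tags.add(*Tag.objects.filter(name__in=names))

    document.save()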