diff --git a/src/documents/admin.py b/src/documents/admin.py index 7c79ebaa0..277158459 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -102,9 +102,8 @@ class CommonAdmin(admin.ModelAdmin): class CorrespondentAdmin(CommonAdmin): - list_display = ("name", "match", "matching_algorithm", "document_count", "last_correspondence") - list_filter = ("matching_algorithm",) - list_editable = ("match", "matching_algorithm") + list_display = ("name", "automatic_classification", "document_count", "last_correspondence") + list_editable = ("automatic_classification",) def get_queryset(self, request): qs = super(CorrespondentAdmin, self).get_queryset(request) @@ -122,10 +121,9 @@ class CorrespondentAdmin(CommonAdmin): class TagAdmin(CommonAdmin): - list_display = ("name", "colour", "match", "matching_algorithm", - "document_count") - list_filter = ("colour", "matching_algorithm") - list_editable = ("colour", "match", "matching_algorithm") + list_display = ("name", "colour", "automatic_classification", "document_count") + list_filter = ("colour",) + list_editable = ("colour", "automatic_classification") def get_queryset(self, request): qs = super(TagAdmin, self).get_queryset(request) @@ -139,9 +137,8 @@ class TagAdmin(CommonAdmin): class DocumentTypeAdmin(CommonAdmin): - list_display = ("name", "match", "matching_algorithm", "document_count") - list_filter = ("matching_algorithm",) - list_editable = ("match", "matching_algorithm") + list_display = ("name", "automatic_classification", "document_count") + list_editable = ("automatic_classification",) def get_queryset(self, request): qs = super(DocumentTypeAdmin, self).get_queryset(request) diff --git a/src/documents/classifier.py b/src/documents/classifier.py index dd8e48f49..d925a73a9 100755 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -1,3 +1,4 @@ +import os import pickle from documents.models import Correspondent, DocumentType, Tag @@ -16,6 +17,18 @@ def preprocess_content(content): class DocumentClassifier(object): + classifier_version = None + + data_vectorizer = None + + tags_binarizer = None + correspondent_binarizer = None + type_binarizer = None + + tags_classifier = None + correspondent_classifier = None + type_classifier = None + @staticmethod def load_classifier(): clf = DocumentClassifier() @@ -23,15 +36,18 @@ class DocumentClassifier(object): return clf def reload(self): - with open(settings.MODEL_FILE, "rb") as f: - self.data_vectorizer = pickle.load(f) - self.tags_binarizer = pickle.load(f) - self.correspondent_binarizer = pickle.load(f) - self.type_binarizer = pickle.load(f) + if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version: + print("reloading classifier") + with open(settings.MODEL_FILE, "rb") as f: + self.data_vectorizer = pickle.load(f) + self.tags_binarizer = pickle.load(f) + self.correspondent_binarizer = pickle.load(f) + self.type_binarizer = pickle.load(f) - self.tags_classifier = pickle.load(f) - self.correspondent_classifier = pickle.load(f) - self.type_classifier = pickle.load(f) + self.tags_classifier = pickle.load(f) + self.correspondent_classifier = pickle.load(f) + self.type_classifier = pickle.load(f) + self.classifier_version = os.path.getmtime(settings.MODEL_FILE) def save_classifier(self): with open(settings.MODEL_FILE, "wb") as f: diff --git a/src/documents/consumer.py b/src/documents/consumer.py old mode 100644 new mode 100755 index 28fc28f9e..927f92e3e --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -221,12 +221,6 @@ class Consumer: storage_type=self.storage_type ) - relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags)) - if relevant_tags: - tag_names = ", ".join([t.slug for t in relevant_tags]) - self.log("debug", "Tagging with {}".format(tag_names)) - document.tags.add(*relevant_tags) - self._write(document, doc, document.source_path) self._write(document, thumbnail, document.thumbnail_path) diff --git a/src/documents/management/commands/document_create_classifier.py b/src/documents/management/commands/document_create_classifier.py index 0549709dd..66aca0d60 100755 --- a/src/documents/management/commands/document_create_classifier.py +++ b/src/documents/management/commands/document_create_classifier.py @@ -42,9 +42,14 @@ class Command(Renderable, BaseCommand): # Step 2: vectorize data logging.getLogger(__name__).info("Vectorizing data...") - clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05) + clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1) data_vectorized = clf.data_vectorizer.fit_transform(data) + print(clf.data_vectorizer.vocabulary_) + + logging.getLogger(__name__).info("Shape of vectorized data: {}".format(data_vectorized.shape)) + + clf.tags_binarizer = MultiLabelBinarizer() labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags) diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index 7367f8057..5366cd193 100755 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -46,7 +46,11 @@ class Command(Renderable, BaseCommand): documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct() logging.getLogger(__name__).info("Loading classifier") - clf = DocumentClassifier.load_classifier() + try: + clf = DocumentClassifier.load_classifier() + except FileNotFoundError: + logging.getLogger(__name__).fatal("Cannot classify documents, classifier model file was not found.") + return for document in documents: diff --git a/src/documents/migrations/0024_auto_20180904_1425.py b/src/documents/migrations/0024_auto_20180904_1425.py new file mode 100755 index 000000000..07191d8cf --- /dev/null +++ b/src/documents/migrations/0024_auto_20180904_1425.py @@ -0,0 +1,77 @@ +# Generated by Django 2.0.8 on 2018-09-04 14:25 + +from django.db import migrations, models + + +def transfer_automatic_classification(apps, schema_editor): + for model_name in ["Tag", "Correspondent", "DocumentType"]: + model_class = apps.get_model("documents", model_name) + for o in model_class.objects.all(): + o.automatic_classification = o.match is not None and len(o.match) > 0 + o.save() + + +def reverse_automatic_classification(apps, schema_editor): + pass + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0023_auto_20180823_1155'), + ] + + operations = [ + migrations.AddField( + model_name='correspondent', + name='automatic_classification', + field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'), + ), + migrations.AddField( + model_name='documenttype', + name='automatic_classification', + field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'), + ), + migrations.AddField( + model_name='tag', + name='automatic_classification', + field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'), + ), + migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification), + migrations.RemoveField( + model_name='correspondent', + name='is_insensitive', + ), + migrations.RemoveField( + model_name='correspondent', + name='match', + ), + migrations.RemoveField( + model_name='correspondent', + name='matching_algorithm', + ), + migrations.RemoveField( + model_name='documenttype', + name='is_insensitive', + ), + migrations.RemoveField( + model_name='documenttype', + name='match', + ), + migrations.RemoveField( + model_name='documenttype', + name='matching_algorithm', + ), + migrations.RemoveField( + model_name='tag', + name='is_insensitive', + ), + migrations.RemoveField( + model_name='tag', + name='match', + ), + migrations.RemoveField( + model_name='tag', + name='matching_algorithm', + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index da30ce4bd..594813b54 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -15,48 +15,15 @@ from django.db import models from django.template.defaultfilters import slugify from django.utils import timezone -from reminders.models import Reminder from .managers import LogManager class MatchingModel(models.Model): - MATCH_ANY = 1 - MATCH_ALL = 2 - MATCH_LITERAL = 3 - MATCH_REGEX = 4 - MATCH_FUZZY = 5 - MATCHING_ALGORITHMS = ( - (MATCH_ANY, "Any"), - (MATCH_ALL, "All"), - (MATCH_LITERAL, "Literal"), - (MATCH_REGEX, "Regular Expression"), - (MATCH_FUZZY, "Fuzzy Match"), - ) - name = models.CharField(max_length=128, unique=True) slug = models.SlugField(blank=True) - match = models.CharField(max_length=256, blank=True) - matching_algorithm = models.PositiveIntegerField( - choices=MATCHING_ALGORITHMS, - default=MATCH_ANY, - help_text=( - "Which algorithm you want to use when matching text to the OCR'd " - "PDF. Here, \"any\" looks for any occurrence of any word " - "provided in the PDF, while \"all\" requires that every word " - "provided appear in the PDF, albeit not in the order provided. A " - "\"literal\" match means that the text you enter must appear in " - "the PDF exactly as you've entered it, and \"regular expression\" " - "uses a regex to match the PDF. (If you don't know what a regex " - "is, you probably don't want this option.) Finally, a \"fuzzy " - "match\" looks for words or phrases that are mostly—but not " - "exactly—the same, which can be useful for matching against " - "documents containg imperfections that foil accurate OCR." - ) - ) - - is_insensitive = models.BooleanField(default=True) + automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.') class Meta: abstract = True @@ -64,87 +31,8 @@ class MatchingModel(models.Model): def __str__(self): return self.name - @property - def conditions(self): - return "{}: \"{}\" ({})".format( - self.name, self.match, self.get_matching_algorithm_display()) - - @classmethod - def match_all(cls, text, tags=None): - - if tags is None: - tags = cls.objects.all() - - text = text.lower() - for tag in tags: - if tag.matches(text): - yield tag - - def matches(self, text): - - search_kwargs = {} - - # Check that match is not empty - if self.match.strip() == "": - return False - - if self.is_insensitive: - search_kwargs = {"flags": re.IGNORECASE} - - if self.matching_algorithm == self.MATCH_ALL: - for word in self._split_match(): - search_result = re.search( - r"\b{}\b".format(word), text, **search_kwargs) - if not search_result: - return False - return True - - if self.matching_algorithm == self.MATCH_ANY: - for word in self._split_match(): - if re.search(r"\b{}\b".format(word), text, **search_kwargs): - return True - return False - - if self.matching_algorithm == self.MATCH_LITERAL: - return bool(re.search( - r"\b{}\b".format(self.match), text, **search_kwargs)) - - if self.matching_algorithm == self.MATCH_REGEX: - return bool(re.search( - re.compile(self.match, **search_kwargs), text)) - - if self.matching_algorithm == self.MATCH_FUZZY: - match = re.sub(r'[^\w\s]', '', self.match) - text = re.sub(r'[^\w\s]', '', text) - if self.is_insensitive: - match = match.lower() - text = text.lower() - - return True if fuzz.partial_ratio(match, text) >= 90 else False - - raise NotImplementedError("Unsupported matching algorithm") - - def _split_match(self): - """ - Splits the match to individual keywords, getting rid of unnecessary - spaces and grouping quoted words together. - - Example: - ' some random words "with quotes " and spaces' - ==> - ["some", "random", "words", "with\s+quotes", "and", "spaces"] - """ - findterms = re.compile(r'"([^"]+)"|(\S+)').findall - normspace = re.compile(r"\s+").sub - return [ - normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+") - for t in findterms(self.match) - ] - def save(self, *args, **kwargs): - self.match = self.match.lower() - if not self.slug: self.slug = slugify(self.name) diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 48c6db952..449e02200 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -16,15 +16,17 @@ def logger(message, group): logging.getLogger(__name__).debug(message, extra={"group": group}) -classifier = None +classifier = DocumentClassifier() def classify_document(sender, document=None, logging_group=None, **kwargs): global classifier - if classifier is None: - classifier = DocumentClassifier.load_classifier() + try: + classifier.reload() + classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True) + except FileNotFoundError: + logging.getLogger(__name__).fatal("Cannot classify document, classifier model file was not found.") - classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)