Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-04-02 13:45:10 -05:00
removed matching model fields, added automatic classifier reloading, added automatic_classification field to matching model
This commit is contained in:
parent 30134034e2
commit 70bd05450a
src/documents/admin.py
@@ -102,9 +102,8 @@ class CommonAdmin(admin.ModelAdmin):

 class CorrespondentAdmin(CommonAdmin):

-    list_display = ("name", "match", "matching_algorithm", "document_count", "last_correspondence")
-    list_filter = ("matching_algorithm",)
-    list_editable = ("match", "matching_algorithm")
+    list_display = ("name", "automatic_classification", "document_count", "last_correspondence")
+    list_editable = ("automatic_classification",)

     def get_queryset(self, request):
         qs = super(CorrespondentAdmin, self).get_queryset(request)
@@ -122,10 +121,9 @@ class CorrespondentAdmin(CommonAdmin):

 class TagAdmin(CommonAdmin):

-    list_display = ("name", "colour", "match", "matching_algorithm",
-                    "document_count")
-    list_filter = ("colour", "matching_algorithm")
-    list_editable = ("colour", "match", "matching_algorithm")
+    list_display = ("name", "colour", "automatic_classification", "document_count")
+    list_filter = ("colour",)
+    list_editable = ("colour", "automatic_classification")

     def get_queryset(self, request):
         qs = super(TagAdmin, self).get_queryset(request)
@@ -139,9 +137,8 @@ class TagAdmin(CommonAdmin):

 class DocumentTypeAdmin(CommonAdmin):

-    list_display = ("name", "match", "matching_algorithm", "document_count")
-    list_filter = ("matching_algorithm",)
-    list_editable = ("match", "matching_algorithm")
+    list_display = ("name", "automatic_classification", "document_count")
+    list_editable = ("automatic_classification",)

     def get_queryset(self, request):
         qs = super(DocumentTypeAdmin, self).get_queryset(request)
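A Django admin constraint these three hunks rely on: every entry in list_editable must also appear in list_display, and the first list_display column cannot be editable by default because it links to the change form. A minimal sketch of the resulting shape (ExampleAdmin is a hypothetical class, not from this commit):

    from django.contrib import admin

    class ExampleAdmin(admin.ModelAdmin):
        # every list_editable entry must also be in list_display,
        # and the first column stays read-only (it links to the change form)
        list_display = ("name", "automatic_classification")
        list_editable = ("automatic_classification",)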
src/documents/classifier.py
@@ -1,3 +1,4 @@
+import os
 import pickle

 from documents.models import Correspondent, DocumentType, Tag
@@ -16,6 +17,18 @@ def preprocess_content(content):

 class DocumentClassifier(object):

+    classifier_version = None
+
+    data_vectorizer = None
+
+    tags_binarizer = None
+    correspondent_binarizer = None
+    type_binarizer = None
+
+    tags_classifier = None
+    correspondent_classifier = None
+    type_classifier = None
+
     @staticmethod
     def load_classifier():
         clf = DocumentClassifier()
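These class-level attributes list the artifacts that travel through the pickled model file; the reload() hunk below restores them with sequential pickle.load calls, which only round-trips if save_classifier() dumps them in the same order. A toy illustration of that sequential-pickle contract (in-memory buffer and placeholder strings, not the real model file):

    import io
    import pickle

    buf = io.BytesIO()
    for part in ["vectorizer", "binarizer", "classifier"]:  # dump order
        pickle.dump(part, buf)

    buf.seek(0)
    # loads must happen in exactly the dump order
    assert pickle.load(buf) == "vectorizer"
    assert pickle.load(buf) == "binarizer"
    assert pickle.load(buf) == "classifier"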
@@ -23,15 +36,18 @@ class DocumentClassifier(object):
         return clf

     def reload(self):
-        with open(settings.MODEL_FILE, "rb") as f:
-            self.data_vectorizer = pickle.load(f)
-            self.tags_binarizer = pickle.load(f)
-            self.correspondent_binarizer = pickle.load(f)
-            self.type_binarizer = pickle.load(f)
-
-            self.tags_classifier = pickle.load(f)
-            self.correspondent_classifier = pickle.load(f)
-            self.type_classifier = pickle.load(f)
+        if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
+            print("reloading classifier")
+            with open(settings.MODEL_FILE, "rb") as f:
+                self.data_vectorizer = pickle.load(f)
+                self.tags_binarizer = pickle.load(f)
+                self.correspondent_binarizer = pickle.load(f)
+                self.type_binarizer = pickle.load(f)
+
+                self.tags_classifier = pickle.load(f)
+                self.correspondent_classifier = pickle.load(f)
+                self.type_classifier = pickle.load(f)
+            self.classifier_version = os.path.getmtime(settings.MODEL_FILE)

     def save_classifier(self):
         with open(settings.MODEL_FILE, "wb") as f:
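The rewritten reload() only touches the disk when the model file's modification time is newer than what was last loaded, so a long-running process picks up a retrained model without a restart. A minimal standalone sketch of the same pattern (the MODEL_FILE path and payload are hypothetical, standing in for settings.MODEL_FILE and the pickled classifier):

    import os
    import pickle

    MODEL_FILE = "/tmp/classifier.pickle"  # hypothetical path for illustration

    class CachedModel:
        def __init__(self):
            self.version = None   # mtime of the file we last loaded
            self.payload = None

        def reload(self):
            # getmtime raises FileNotFoundError when no model has been trained yet
            mtime = os.path.getmtime(MODEL_FILE)
            if self.version is None or mtime > self.version:
                with open(MODEL_FILE, "rb") as f:
                    self.payload = pickle.load(f)
                self.version = mtime  # skip the disk next time unless the file changes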
src/documents/consumer.py (6 changes; Normal file → Executable file)
@@ -221,12 +221,6 @@ class Consumer:
             storage_type=self.storage_type
         )

-        relevant_tags = set(list(Tag.match_all(text)) + list(file_info.tags))
-        if relevant_tags:
-            tag_names = ", ".join([t.slug for t in relevant_tags])
-            self.log("debug", "Tagging with {}".format(tag_names))
-            document.tags.add(*relevant_tags)
-
         self._write(document, doc, document.source_path)
         self._write(document, thumbnail, document.thumbnail_path)
@@ -42,9 +42,14 @@ class Command(Renderable, BaseCommand):

         # Step 2: vectorize data
         logging.getLogger(__name__).info("Vectorizing data...")
-        clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), min_df=0.05)
+        clf.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
         data_vectorized = clf.data_vectorizer.fit_transform(data)

+        print(clf.data_vectorizer.vocabulary_)
+
+        logging.getLogger(__name__).info("Shape of vectorized data: {}".format(data_vectorized.shape))
+
+
         clf.tags_binarizer = MultiLabelBinarizer()
         labels_tags_vectorized = clf.tags_binarizer.fit_transform(labels_tags)
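The vectorizer change above moves from 1-5 character n-grams with min_df=0.05 to 2-6 character n-grams with min_df=0.1: single characters drop out of the feature space, and any n-gram appearing in fewer than 10% of the training documents is discarded. A quick way to inspect the effect (toy corpus, not paperless data):

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["invoice acme 2018", "invoice acme gmbh", "warranty letter"]  # made-up texts
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
    matrix = vectorizer.fit_transform(docs)
    print(matrix.shape)                        # (n_documents, n_kept_ngrams)
    print(sorted(vectorizer.vocabulary_)[:8])  # sample of surviving character n-grams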
@@ -46,7 +46,11 @@ class Command(Renderable, BaseCommand):
         documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct()

         logging.getLogger(__name__).info("Loading classifier")
-        clf = DocumentClassifier.load_classifier()
+        try:
+            clf = DocumentClassifier.load_classifier()
+        except FileNotFoundError:
+            logging.getLogger(__name__).fatal("Cannot classify documents, classifier model file was not found.")
+            return

         for document in documents:
src/documents/migrations/0024_auto_20180904_1425.py (new Executable file, 77 lines)
@@ -0,0 +1,77 @@
+# Generated by Django 2.0.8 on 2018-09-04 14:25
+
+from django.db import migrations, models
+
+
+def transfer_automatic_classification(apps, schema_editor):
+    for model_name in ["Tag", "Correspondent", "DocumentType"]:
+        model_class = apps.get_model("documents", model_name)
+        for o in model_class.objects.all():
+            o.automatic_classification = o.match is not None and len(o.match) > 0
+            o.save()
+
+
+def reverse_automatic_classification(apps, schema_editor):
+    pass
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0023_auto_20180823_1155'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='correspondent',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.AddField(
+            model_name='documenttype',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.AddField(
+            model_name='tag',
+            name='automatic_classification',
+            field=models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.'),
+        ),
+        migrations.RunPython(transfer_automatic_classification, reverse_automatic_classification),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='correspondent',
+            name='matching_algorithm',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='documenttype',
+            name='matching_algorithm',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='is_insensitive',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='match',
+        ),
+        migrations.RemoveField(
+            model_name='tag',
+            name='matching_algorithm',
+        ),
+    ]
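Ordering matters inside operations: the RunPython step still reads the old match column, so it has to run after the AddField entries create the new column but before the RemoveField entries drop match. The transfer itself flags any object whose match was ever set to a non-empty string; restated as a standalone predicate (illustration only, not part of the migration):

    def should_autoclassify(match):
        # mirrors: o.match is not None and len(o.match) > 0
        return match is not None and len(match) > 0

    assert should_autoclassify("invoice")   # configured match -> classify automatically
    assert not should_autoclassify("")      # blank match -> leave disabled
    assert not should_autoclassify(None)    # never set -> leave disabled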
src/documents/models.py
@@ -15,48 +15,15 @@ from django.db import models
 from django.template.defaultfilters import slugify
 from django.utils import timezone

-from reminders.models import Reminder
 from .managers import LogManager


 class MatchingModel(models.Model):

-    MATCH_ANY = 1
-    MATCH_ALL = 2
-    MATCH_LITERAL = 3
-    MATCH_REGEX = 4
-    MATCH_FUZZY = 5
-    MATCHING_ALGORITHMS = (
-        (MATCH_ANY, "Any"),
-        (MATCH_ALL, "All"),
-        (MATCH_LITERAL, "Literal"),
-        (MATCH_REGEX, "Regular Expression"),
-        (MATCH_FUZZY, "Fuzzy Match"),
-    )
-
     name = models.CharField(max_length=128, unique=True)
     slug = models.SlugField(blank=True)

-    match = models.CharField(max_length=256, blank=True)
-    matching_algorithm = models.PositiveIntegerField(
-        choices=MATCHING_ALGORITHMS,
-        default=MATCH_ANY,
-        help_text=(
-            "Which algorithm you want to use when matching text to the OCR'd "
-            "PDF. Here, \"any\" looks for any occurrence of any word "
-            "provided in the PDF, while \"all\" requires that every word "
-            "provided appear in the PDF, albeit not in the order provided. A "
-            "\"literal\" match means that the text you enter must appear in "
-            "the PDF exactly as you've entered it, and \"regular expression\" "
-            "uses a regex to match the PDF. (If you don't know what a regex "
-            "is, you probably don't want this option.) Finally, a \"fuzzy "
-            "match\" looks for words or phrases that are mostly—but not "
-            "exactly—the same, which can be useful for matching against "
-            "documents containg imperfections that foil accurate OCR."
-        )
-    )
+    automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.')

-    is_insensitive = models.BooleanField(default=True)
-
     class Meta:
         abstract = True
@@ -64,87 +31,8 @@ class MatchingModel(models.Model):
     def __str__(self):
         return self.name

-    @property
-    def conditions(self):
-        return "{}: \"{}\" ({})".format(
-            self.name, self.match, self.get_matching_algorithm_display())
-
-    @classmethod
-    def match_all(cls, text, tags=None):
-
-        if tags is None:
-            tags = cls.objects.all()
-
-        text = text.lower()
-        for tag in tags:
-            if tag.matches(text):
-                yield tag
-
-    def matches(self, text):
-
-        search_kwargs = {}
-
-        # Check that match is not empty
-        if self.match.strip() == "":
-            return False
-
-        if self.is_insensitive:
-            search_kwargs = {"flags": re.IGNORECASE}
-
-        if self.matching_algorithm == self.MATCH_ALL:
-            for word in self._split_match():
-                search_result = re.search(
-                    r"\b{}\b".format(word), text, **search_kwargs)
-                if not search_result:
-                    return False
-            return True
-
-        if self.matching_algorithm == self.MATCH_ANY:
-            for word in self._split_match():
-                if re.search(r"\b{}\b".format(word), text, **search_kwargs):
-                    return True
-            return False
-
-        if self.matching_algorithm == self.MATCH_LITERAL:
-            return bool(re.search(
-                r"\b{}\b".format(self.match), text, **search_kwargs))
-
-        if self.matching_algorithm == self.MATCH_REGEX:
-            return bool(re.search(
-                re.compile(self.match, **search_kwargs), text))
-
-        if self.matching_algorithm == self.MATCH_FUZZY:
-            match = re.sub(r'[^\w\s]', '', self.match)
-            text = re.sub(r'[^\w\s]', '', text)
-            if self.is_insensitive:
-                match = match.lower()
-                text = text.lower()
-
-            return True if fuzz.partial_ratio(match, text) >= 90 else False
-
-        raise NotImplementedError("Unsupported matching algorithm")
-
-    def _split_match(self):
-        """
-        Splits the match to individual keywords, getting rid of unnecessary
-        spaces and grouping quoted words together.
-
-        Example:
-            ' some random words "with quotes " and spaces'
-            ==>
-            ["some", "random", "words", "with\s+quotes", "and", "spaces"]
-        """
-        findterms = re.compile(r'"([^"]+)"|(\S+)').findall
-        normspace = re.compile(r"\s+").sub
-        return [
-            normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
-            for t in findterms(self.match)
-        ]
-
     def save(self, *args, **kwargs):

-        self.match = self.match.lower()
-
         if not self.slug:
             self.slug = slugify(self.name)
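After this hunk, MatchingModel keeps only name, slug, and the new boolean; opting an object into classification no longer involves match strings or algorithms. A hypothetical usage sketch (assumes a configured Django project; Tag is one of the concrete subclasses):

    from documents.models import Tag

    tag = Tag.objects.create(name="Invoices")  # save() fills in the slug
    tag.automatic_classification = True        # hand this tag to the classifier
    tag.save()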
src/documents/signals/handlers.py
@@ -16,15 +16,17 @@ def logger(message, group):
     logging.getLogger(__name__).debug(message, extra={"group": group})


-classifier = None
+classifier = DocumentClassifier()


 def classify_document(sender, document=None, logging_group=None, **kwargs):
     global classifier
-    if classifier is None:
-        classifier = DocumentClassifier.load_classifier()
-
-    classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
+    try:
+        classifier.reload()
+        classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
+    except FileNotFoundError:
+        logging.getLogger(__name__).fatal("Cannot classify document, classifier model file was not found.")
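classify_document keeps the standard Django receiver signature (a sender plus keyword arguments), so whatever consumption signal it is connected to elsewhere in the app keeps working; only its body now funnels through the shared module-level classifier. A sketch of how such a receiver is driven (the signal object and receiver below are stand-ins, not the ones paperless defines):

    import django.dispatch

    # stand-in for the app's consumption-finished signal
    document_ready = django.dispatch.Signal()

    def receiver(sender, document=None, logging_group=None, **kwargs):
        print("would classify", document)  # classify_document has this same signature

    document_ready.connect(receiver)
    document_ready.send(sender=None, document="doc-stub", logging_group=None)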