mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
implemented automatic classification field functionality
This commit is contained in:
parent
82bc0e3368
commit
cea880f245
@ -43,7 +43,7 @@ class DocumentClassifier(object):
|
|||||||
|
|
||||||
def reload(self):
|
def reload(self):
|
||||||
if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
|
if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
|
||||||
print("reloading classifier")
|
logging.getLogger(__name__).info("Reloading classifier models")
|
||||||
with open(settings.MODEL_FILE, "rb") as f:
|
with open(settings.MODEL_FILE, "rb") as f:
|
||||||
self.data_vectorizer = pickle.load(f)
|
self.data_vectorizer = pickle.load(f)
|
||||||
self.tags_binarizer = pickle.load(f)
|
self.tags_binarizer = pickle.load(f)
|
||||||
@ -77,11 +77,14 @@ class DocumentClassifier(object):
|
|||||||
logging.getLogger(__name__).info("Gathering data from database...")
|
logging.getLogger(__name__).info("Gathering data from database...")
|
||||||
for doc in Document.objects.exclude(tags__is_inbox_tag=True):
|
for doc in Document.objects.exclude(tags__is_inbox_tag=True):
|
||||||
data.append(preprocess_content(doc.content))
|
data.append(preprocess_content(doc.content))
|
||||||
labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
|
labels_type.append(doc.document_type.name if doc.document_type is not None and doc.document_type.automatic_classification else "-")
|
||||||
labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
|
labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None and doc.correspondent.automatic_classification else "-")
|
||||||
tags = [tag.name for tag in doc.tags.all()]
|
tags = [tag.name for tag in doc.tags.filter(automatic_classification=True)]
|
||||||
labels_tags.append(tags)
|
labels_tags.append(tags)
|
||||||
|
|
||||||
|
labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
|
||||||
|
logging.getLogger(__name__).info("{} documents, {} tag(s) {}, {} correspondent(s) {}, {} type(s) {}.".format(len(data), len(labels_tags_unique), labels_tags_unique, len(set(labels_correspondent)), set(labels_correspondent), len(set(labels_type)), set(labels_type)))
|
||||||
|
|
||||||
# Step 2: vectorize data
|
# Step 2: vectorize data
|
||||||
logging.getLogger(__name__).info("Vectorizing data...")
|
logging.getLogger(__name__).info("Vectorizing data...")
|
||||||
self.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
|
self.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
|
||||||
|
@ -22,7 +22,6 @@ class Command(Renderable, BaseCommand):
|
|||||||
|
|
||||||
clf.train()
|
clf.train()
|
||||||
|
|
||||||
|
|
||||||
logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...")
|
logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...")
|
||||||
|
|
||||||
clf.save_classifier()
|
clf.save_classifier()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user