mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
implemented automatic classification field functionality
This commit is contained in:
parent
82bc0e3368
commit
cea880f245
@ -43,7 +43,7 @@ class DocumentClassifier(object):
|
||||
|
||||
def reload(self):
|
||||
if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
|
||||
print("reloading classifier")
|
||||
logging.getLogger(__name__).info("Reloading classifier models")
|
||||
with open(settings.MODEL_FILE, "rb") as f:
|
||||
self.data_vectorizer = pickle.load(f)
|
||||
self.tags_binarizer = pickle.load(f)
|
||||
@ -77,11 +77,14 @@ class DocumentClassifier(object):
|
||||
logging.getLogger(__name__).info("Gathering data from database...")
|
||||
for doc in Document.objects.exclude(tags__is_inbox_tag=True):
|
||||
data.append(preprocess_content(doc.content))
|
||||
labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
|
||||
labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
|
||||
tags = [tag.name for tag in doc.tags.all()]
|
||||
labels_type.append(doc.document_type.name if doc.document_type is not None and doc.document_type.automatic_classification else "-")
|
||||
labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None and doc.correspondent.automatic_classification else "-")
|
||||
tags = [tag.name for tag in doc.tags.filter(automatic_classification=True)]
|
||||
labels_tags.append(tags)
|
||||
|
||||
labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
|
||||
logging.getLogger(__name__).info("{} documents, {} tag(s) {}, {} correspondent(s) {}, {} type(s) {}.".format(len(data), len(labels_tags_unique), labels_tags_unique, len(set(labels_correspondent)), set(labels_correspondent), len(set(labels_type)), set(labels_type)))
|
||||
|
||||
# Step 2: vectorize data
|
||||
logging.getLogger(__name__).info("Vectorizing data...")
|
||||
self.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
|
||||
|
@ -22,7 +22,6 @@ class Command(Renderable, BaseCommand):
|
||||
|
||||
clf.train()
|
||||
|
||||
|
||||
logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...")
|
||||
|
||||
clf.save_classifier()
|
||||
|
Loading…
x
Reference in New Issue
Block a user