mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	implemented automatic classification field functionality
This commit is contained in:
		@@ -43,7 +43,7 @@ class DocumentClassifier(object):
 | 
			
		||||
 | 
			
		||||
    def reload(self):
 | 
			
		||||
        if self.classifier_version is None or os.path.getmtime(settings.MODEL_FILE) > self.classifier_version:
 | 
			
		||||
            print("reloading classifier")
 | 
			
		||||
            logging.getLogger(__name__).info("Reloading classifier models")
 | 
			
		||||
            with open(settings.MODEL_FILE, "rb") as f:
 | 
			
		||||
                self.data_vectorizer = pickle.load(f)
 | 
			
		||||
                self.tags_binarizer = pickle.load(f)
 | 
			
		||||
@@ -77,11 +77,14 @@ class DocumentClassifier(object):
 | 
			
		||||
        logging.getLogger(__name__).info("Gathering data from database...")
 | 
			
		||||
        for doc in Document.objects.exclude(tags__is_inbox_tag=True):
 | 
			
		||||
            data.append(preprocess_content(doc.content))
 | 
			
		||||
            labels_type.append(doc.document_type.name if doc.document_type is not None else "-")
 | 
			
		||||
            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None else "-")
 | 
			
		||||
            tags = [tag.name for tag in doc.tags.all()]
 | 
			
		||||
            labels_type.append(doc.document_type.name if doc.document_type is not None and doc.document_type.automatic_classification else "-")
 | 
			
		||||
            labels_correspondent.append(doc.correspondent.name if doc.correspondent is not None and doc.correspondent.automatic_classification else "-")
 | 
			
		||||
            tags = [tag.name for tag in doc.tags.filter(automatic_classification=True)]
 | 
			
		||||
            labels_tags.append(tags)
 | 
			
		||||
 | 
			
		||||
        labels_tags_unique = set([tag for tags in labels_tags for tag in tags])
 | 
			
		||||
        logging.getLogger(__name__).info("{} documents, {} tag(s) {}, {} correspondent(s) {}, {} type(s) {}.".format(len(data), len(labels_tags_unique), labels_tags_unique, len(set(labels_correspondent)), set(labels_correspondent), len(set(labels_type)), set(labels_type)))
 | 
			
		||||
 | 
			
		||||
        # Step 2: vectorize data
 | 
			
		||||
        logging.getLogger(__name__).info("Vectorizing data...")
 | 
			
		||||
        self.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 6), min_df=0.1)
 | 
			
		||||
 
 | 
			
		||||
@@ -22,7 +22,6 @@ class Command(Renderable, BaseCommand):
 | 
			
		||||
 | 
			
		||||
        clf.train()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...")
 | 
			
		||||
 | 
			
		||||
        clf.save_classifier()
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user