diff --git a/src/documents/classifier.py b/src/documents/classifier.py index b427264c8..3051b82a5 100755 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -5,10 +5,6 @@ import pickle import re from django.conf import settings -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.neural_network import MLPClassifier -from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer -from sklearn.utils.multiclass import type_of_target from documents.models import Document, MatchingModel @@ -109,6 +105,10 @@ class DocumentClassifier(object): pickle.dump(self.document_type_classifier, f) def train(self): + from sklearn.feature_extraction.text import CountVectorizer + from sklearn.neural_network import MLPClassifier + from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer + data = list() labels_tags = list() labels_correspondent = list() @@ -265,6 +265,8 @@ class DocumentClassifier(object): return None def predict_tags(self, content): + from sklearn.utils.multiclass import type_of_target + if self.tags_classifier: X = self.data_vectorizer.transform([preprocess_content(content)]) y = self.tags_classifier.predict(X) diff --git a/src/documents/tests/data/model.pickle b/src/documents/tests/data/model.pickle new file mode 100644 index 000000000..db303ec80 Binary files /dev/null and b/src/documents/tests/data/model.pickle differ diff --git a/src/documents/tests/test_classifier.py b/src/documents/tests/test_classifier.py index 43c38b691..14673ae65 100644 --- a/src/documents/tests/test_classifier.py +++ b/src/documents/tests/test_classifier.py @@ -130,6 +130,15 @@ class TestClassifier(DirectoriesMixin, TestCase): new_classifier.reload() self.assertFalse(new_classifier.train()) + @override_settings(MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle")) + def test_load_and_classify(self): + self.generate_test_data() + + new_classifier = DocumentClassifier() + new_classifier.reload() + + self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12]) + def test_one_correspondent_predict(self): c1 = Correspondent.objects.create(name="c1", matching_algorithm=Correspondent.MATCH_AUTO) doc1 = Document.objects.create(title="doc1", content="this is a document from c1", correspondent=c1, checksum="A") diff --git a/src/paperless/settings.py b/src/paperless/settings.py index b6d01ba53..56e91695e 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -4,7 +4,6 @@ import multiprocessing import os import re -import dateparser from dotenv import load_dotenv from django.utils.translation import gettext_lazy as _ @@ -491,7 +490,11 @@ if PAPERLESS_TIKA_ENABLED: # List dates that should be ignored when trying to parse date from document text IGNORE_DATES = set() -for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): - d = dateparser.parse(s) - if d: - IGNORE_DATES.add(d.date()) + +if os.getenv("PAPERLESS_IGNORE_DATES", ""): + import dateparser + + for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): + d = dateparser.parse(s) + if d: + IGNORE_DATES.add(d.date())