mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Updates the classifier to catch warnings from scikit-learn and rebuild the model file when this happens
This commit is contained in:
parent
1aeb95396b
commit
77fbbe95ff
@ -4,6 +4,8 @@ import os
|
||||
import pickle
|
||||
import re
|
||||
import shutil
|
||||
import warnings
|
||||
from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
from documents.models import Document
|
||||
@ -21,13 +23,13 @@ class ClassifierModelCorruptError(Exception):
|
||||
logger = logging.getLogger("paperless.classifier")
|
||||
|
||||
|
||||
def preprocess_content(content):
|
||||
def preprocess_content(content: str) -> str:
|
||||
content = content.lower().strip()
|
||||
content = re.sub(r"\s+", " ", content)
|
||||
return content
|
||||
|
||||
|
||||
def load_classifier():
|
||||
def load_classifier() -> Optional["DocumentClassifier"]:
|
||||
if not os.path.isfile(settings.MODEL_FILE):
|
||||
logger.debug(
|
||||
"Document classification model does not exist (yet), not "
|
||||
@ -39,7 +41,11 @@ def load_classifier():
|
||||
try:
|
||||
classifier.load()
|
||||
|
||||
except (ClassifierModelCorruptError, IncompatibleClassifierVersionError):
|
||||
except IncompatibleClassifierVersionError:
|
||||
logger.info("Classifier version updated, will re-train")
|
||||
os.unlink(settings.MODEL_FILE)
|
||||
classifier = None
|
||||
except ClassifierModelCorruptError:
|
||||
# there's something wrong with the model file.
|
||||
logger.exception(
|
||||
"Unrecoverable error while loading document "
|
||||
@ -59,13 +65,14 @@ def load_classifier():
|
||||
|
||||
class DocumentClassifier:
|
||||
|
||||
# v7 - Updated scikit-learn package version
|
||||
# v8 - Added storage path classifier
|
||||
FORMAT_VERSION = 8
|
||||
|
||||
def __init__(self):
|
||||
# hash of the training data. used to prevent re-training when the
|
||||
# training data has not changed.
|
||||
self.data_hash = None
|
||||
self.data_hash: Optional[bytes] = None
|
||||
|
||||
self.data_vectorizer = None
|
||||
self.tags_binarizer = None
|
||||
@ -75,25 +82,41 @@ class DocumentClassifier:
|
||||
self.storage_path_classifier = None
|
||||
|
||||
def load(self):
|
||||
with open(settings.MODEL_FILE, "rb") as f:
|
||||
schema_version = pickle.load(f)
|
||||
# Catch warnings for processing
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
with open(settings.MODEL_FILE, "rb") as f:
|
||||
schema_version = pickle.load(f)
|
||||
|
||||
if schema_version != self.FORMAT_VERSION:
|
||||
raise IncompatibleClassifierVersionError(
|
||||
"Cannot load classifier, incompatible versions.",
|
||||
if schema_version != self.FORMAT_VERSION:
|
||||
raise IncompatibleClassifierVersionError(
|
||||
"Cannot load classifier, incompatible versions.",
|
||||
)
|
||||
else:
|
||||
try:
|
||||
self.data_hash = pickle.load(f)
|
||||
self.data_vectorizer = pickle.load(f)
|
||||
self.tags_binarizer = pickle.load(f)
|
||||
|
||||
self.tags_classifier = pickle.load(f)
|
||||
self.correspondent_classifier = pickle.load(f)
|
||||
self.document_type_classifier = pickle.load(f)
|
||||
self.storage_path_classifier = pickle.load(f)
|
||||
except Exception:
|
||||
raise ClassifierModelCorruptError()
|
||||
|
||||
# Check for the warning about unpickling from differing versions
|
||||
# and consider it incompatible
|
||||
if len(w) > 0:
|
||||
sk_learn_warning_url = (
|
||||
"https://scikit-learn.org/stable/"
|
||||
"model_persistence.html"
|
||||
"#security-maintainability-limitations"
|
||||
)
|
||||
else:
|
||||
try:
|
||||
self.data_hash = pickle.load(f)
|
||||
self.data_vectorizer = pickle.load(f)
|
||||
self.tags_binarizer = pickle.load(f)
|
||||
|
||||
self.tags_classifier = pickle.load(f)
|
||||
self.correspondent_classifier = pickle.load(f)
|
||||
self.document_type_classifier = pickle.load(f)
|
||||
self.storage_path_classifier = pickle.load(f)
|
||||
except Exception:
|
||||
raise ClassifierModelCorruptError()
|
||||
for warning in w:
|
||||
if issubclass(warning.category, UserWarning):
|
||||
w_msg = str(warning.message)
|
||||
if sk_learn_warning_url in w_msg:
|
||||
raise IncompatibleClassifierVersionError()
|
||||
|
||||
def save(self):
|
||||
target_file = settings.MODEL_FILE
|
||||
|
Binary file not shown.
BIN
src/documents/tests/data/v1.0.2.model.pickle
Normal file
BIN
src/documents/tests/data/v1.0.2.model.pickle
Normal file
Binary file not shown.
@ -3,10 +3,12 @@ import tempfile
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import documents
|
||||
import pytest
|
||||
from django.conf import settings
|
||||
from django.test import override_settings
|
||||
from django.test import TestCase
|
||||
from documents.classifier import ClassifierModelCorruptError
|
||||
from documents.classifier import DocumentClassifier
|
||||
from documents.classifier import IncompatibleClassifierVersionError
|
||||
from documents.classifier import load_classifier
|
||||
@ -216,6 +218,45 @@ class TestClassifier(DirectoriesMixin, TestCase):
|
||||
|
||||
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])
|
||||
|
||||
@override_settings(
|
||||
MODEL_FILE=os.path.join(os.path.dirname(__file__), "data", "model.pickle"),
|
||||
)
|
||||
@mock.patch("documents.classifier.pickle.load")
|
||||
def test_load_corrupt_file(self, patched_pickle_load):
|
||||
"""
|
||||
GIVEN:
|
||||
- Corrupted classifier pickle file
|
||||
WHEN:
|
||||
- An attempt is made to load the classifier
|
||||
THEN:
|
||||
- The ClassifierModelCorruptError is raised
|
||||
"""
|
||||
# First load is the schema version
|
||||
patched_pickle_load.side_effect = [DocumentClassifier.FORMAT_VERSION, OSError()]
|
||||
|
||||
with self.assertRaises(ClassifierModelCorruptError):
|
||||
self.classifier.load()
|
||||
|
||||
@override_settings(
|
||||
MODEL_FILE=os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"data",
|
||||
"v1.0.2.model.pickle",
|
||||
),
|
||||
)
|
||||
def test_load_new_scikit_learn_version(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- classifier pickle file created with a different scikit-learn version
|
||||
WHEN:
|
||||
- An attempt is made to load the classifier
|
||||
THEN:
|
||||
- The classifier reports the warning was captured and processed
|
||||
"""
|
||||
|
||||
with self.assertRaises(IncompatibleClassifierVersionError):
|
||||
self.classifier.load()
|
||||
|
||||
def test_one_correspondent_predict(self):
|
||||
c1 = Correspondent.objects.create(
|
||||
name="c1",
|
||||
|
Loading…
x
Reference in New Issue
Block a user