paperless-ngx/src/documents/classifier.py

import logging
import os
import pickle
import re
import warnings
from collections.abc import Iterator
from datetime import datetime
from hashlib import sha256
from pathlib import Path
from typing import Optional

from django.conf import settings
from sklearn.exceptions import InconsistentVersionWarning

from documents.models import Document
from documents.models import MatchingModel

logger = logging.getLogger("paperless.classifier")


class IncompatibleClassifierVersionError(Exception):
    def __init__(self, message: str, *args: object) -> None:
        self.message = message
        super().__init__(*args)


class ClassifierModelCorruptError(Exception):
    pass


def load_classifier() -> Optional["DocumentClassifier"]:
    if not os.path.isfile(settings.MODEL_FILE):
        logger.debug(
            "Document classification model does not exist (yet), not "
            "performing automatic matching.",
        )
        return None

    classifier = DocumentClassifier()
    try:
        classifier.load()

    except IncompatibleClassifierVersionError as e:
        logger.info(f"Classifier version incompatible: {e.message}, will re-train")
        os.unlink(settings.MODEL_FILE)
        classifier = None
    except ClassifierModelCorruptError:
        # there's something wrong with the model file.
        logger.exception(
            "Unrecoverable error while loading document "
            "classification model, deleting model file.",
        )
        os.unlink(settings.MODEL_FILE)
        classifier = None
    except OSError:
        logger.exception("IO error while loading document classification model")
        classifier = None
    except Exception:  # pragma: no cover
        logger.exception("Unknown error while loading document classification model")
        classifier = None

    return classifier


class DocumentClassifier:
    # v7 - Updated scikit-learn package version
    # v8 - Added storage path classifier
    # v9 - Changed from hashing to time/ids for re-train check
    FORMAT_VERSION = 9

    def __init__(self):
        # last time a document changed and therefore training might be required
        self.last_doc_change_time: Optional[datetime] = None
        # Hash of primary keys of AUTO matching values last used in training
        self.last_auto_type_hash: Optional[bytes] = None

        self.data_vectorizer = None
        self.tags_binarizer = None
        self.tags_classifier = None
        self.correspondent_classifier = None
        self.document_type_classifier = None
        self.storage_path_classifier = None

        self._stemmer = None
        self._stop_words = None

    def load(self) -> None:
        # Catch warnings for processing
        with warnings.catch_warnings(record=True) as w:
            with open(settings.MODEL_FILE, "rb") as f:
                schema_version = pickle.load(f)

                if schema_version != self.FORMAT_VERSION:
                    raise IncompatibleClassifierVersionError(
                        "Cannot load classifier, incompatible versions.",
                    )
                else:
                    try:
                        self.last_doc_change_time = pickle.load(f)
                        self.last_auto_type_hash = pickle.load(f)

                        self.data_vectorizer = pickle.load(f)
                        self.tags_binarizer = pickle.load(f)

                        self.tags_classifier = pickle.load(f)
                        self.correspondent_classifier = pickle.load(f)
                        self.document_type_classifier = pickle.load(f)
                        self.storage_path_classifier = pickle.load(f)
                    except Exception as err:
                        raise ClassifierModelCorruptError from err

            # Check for the warning about unpickling from differing versions
            # and consider it incompatible
            sk_learn_warning_url = (
                "https://scikit-learn.org/stable/"
                "model_persistence.html"
                "#security-maintainability-limitations"
            )
            for warning in w:
                # The warning is inconsistent, the MLPClassifier is a specific warning, others have not updated yet
                if issubclass(warning.category, InconsistentVersionWarning) or (
                    issubclass(warning.category, UserWarning)
                    and sk_learn_warning_url in str(warning.message)
                ):
                    raise IncompatibleClassifierVersionError("sklearn version update")

    def save(self):
        target_file: Path = settings.MODEL_FILE
        target_file_temp = target_file.with_suffix(".pickle.part")

        with open(target_file_temp, "wb") as f:
            pickle.dump(self.FORMAT_VERSION, f)

            pickle.dump(self.last_doc_change_time, f)
            pickle.dump(self.last_auto_type_hash, f)

            pickle.dump(self.data_vectorizer, f)

            pickle.dump(self.tags_binarizer, f)
            pickle.dump(self.tags_classifier, f)

            pickle.dump(self.correspondent_classifier, f)
            pickle.dump(self.document_type_classifier, f)
            pickle.dump(self.storage_path_classifier, f)

        target_file_temp.rename(target_file)

    def train(self):
        # Get non-inbox documents
        docs_queryset = Document.objects.exclude(
            tags__is_inbox_tag=True,
        )

        # No documents exit to train against
        if docs_queryset.count() == 0:
            raise ValueError("No training data available.")

        labels_tags = []
        labels_correspondent = []
        labels_document_type = []
        labels_storage_path = []

        # Step 1: Extract and preprocess training data from the database.
        logger.debug("Gathering data from database...")
        hasher = sha256()
        for doc in docs_queryset:
            y = -1
            dt = doc.document_type
            if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = dt.pk
            hasher.update(y.to_bytes(4, "little", signed=True))
            labels_document_type.append(y)

            y = -1
            cor = doc.correspondent
            if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = cor.pk
            hasher.update(y.to_bytes(4, "little", signed=True))
            labels_correspondent.append(y)

            tags = sorted(
                tag.pk
                for tag in doc.tags.filter(
                    matching_algorithm=MatchingModel.MATCH_AUTO,
                )
            )
            for tag in tags:
                hasher.update(tag.to_bytes(4, "little", signed=True))
            labels_tags.append(tags)

            y = -1
            sp = doc.storage_path
            if sp and sp.matching_algorithm == MatchingModel.MATCH_AUTO:
                y = sp.pk
            hasher.update(y.to_bytes(4, "little", signed=True))
            labels_storage_path.append(y)

        labels_tags_unique = {tag for tags in labels_tags for tag in tags}

        num_tags = len(labels_tags_unique)

        # Check if retraining is actually required.
        # A document has been updated since the classifier was trained
        # New auto tags, types, correspondent, storage paths exist
        latest_doc_change = docs_queryset.latest("modified").modified
        if (
            self.last_doc_change_time is not None
            and self.last_doc_change_time >= latest_doc_change
        ) and self.last_auto_type_hash == hasher.digest():
            logger.info("No updates since last training")
            return False

        # subtract 1 since -1 (null) is also part of the classes.

        # union with {-1} accounts for cases where all documents have
        # correspondents and types assigned, so -1 isn't part of labels_x, which
        # it usually is.
        num_correspondents = len(set(labels_correspondent) | {-1}) - 1
        num_document_types = len(set(labels_document_type) | {-1}) - 1
        num_storage_paths = len(set(labels_storage_path) | {-1}) - 1

        logger.debug(
            "{} documents, {} tag(s), {} correspondent(s), "
            "{} document type(s). {} storage path(es)".format(
                docs_queryset.count(),
                num_tags,
                num_correspondents,
                num_document_types,
                num_storage_paths,
            ),
        )

        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.neural_network import MLPClassifier
        from sklearn.preprocessing import LabelBinarizer
        from sklearn.preprocessing import MultiLabelBinarizer

        # Step 2: vectorize data
        logger.debug("Vectorizing data...")

        def content_generator() -> Iterator[str]:
            """
            Generates the content for documents, but once at a time
            """
            for doc in docs_queryset:
                yield self.preprocess_content(doc.content)

        self.data_vectorizer = CountVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            min_df=0.01,
        )

        data_vectorized = self.data_vectorizer.fit_transform(content_generator())

        # See the notes here:
        # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
        # This attribute isn't needed to function and can be large
        self.data_vectorizer.stop_words_ = None

        # Step 3: train the classifiers
        if num_tags > 0:
            logger.debug("Training tags classifier...")

            if num_tags == 1:
                # Special case where only one tag has auto:
                # Fallback to binary classification.
                labels_tags = [
                    label[0] if len(label) == 1 else -1 for label in labels_tags
                ]
                self.tags_binarizer = LabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(
                    labels_tags,
                ).ravel()
            else:
                self.tags_binarizer = MultiLabelBinarizer()
                labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)

            self.tags_classifier = MLPClassifier(tol=0.01)
            self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
        else:
            self.tags_classifier = None
            logger.debug("There are no tags. Not training tags classifier.")

        if num_correspondents > 0:
            logger.debug("Training correspondent classifier...")
            self.correspondent_classifier = MLPClassifier(tol=0.01)
            self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
        else:
            self.correspondent_classifier = None
            logger.debug(
                "There are no correspondents. Not training correspondent "
                "classifier.",
            )

        if num_document_types > 0:
            logger.debug("Training document type classifier...")
            self.document_type_classifier = MLPClassifier(tol=0.01)
            self.document_type_classifier.fit(data_vectorized, labels_document_type)
        else:
            self.document_type_classifier = None
            logger.debug(
                "There are no document types. Not training document type "
                "classifier.",
            )

        if num_storage_paths > 0:
            logger.debug(
                "Training storage paths classifier...",
            )
            self.storage_path_classifier = MLPClassifier(tol=0.01)
            self.storage_path_classifier.fit(
                data_vectorized,
                labels_storage_path,
            )
        else:
            self.storage_path_classifier = None
            logger.debug(
                "There are no storage paths. Not training storage path classifier.",
            )

        self.last_doc_change_time = latest_doc_change
        self.last_auto_type_hash = hasher.digest()

        return True

    def preprocess_content(self, content: str) -> str:  # pragma: no cover
        """
        Process to contents of a document, distilling it down into
        words which are meaningful to the content
        """

        # Lower case the document
        content = content.lower().strip()
        # Reduce spaces
        content = re.sub(r"\s+", " ", content)
        # Get only the letters
        content = re.sub(r"[^\w\s]", " ", content)

        # If the NLTK language is supported, do further processing
        if settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED:
            import nltk
            from nltk.corpus import stopwords
            from nltk.stem import SnowballStemmer
            from nltk.tokenize import word_tokenize

            # Not really hacky, since it isn't private and is documented, but
            # set the search path for NLTK data to the single location it should be in
            nltk.data.path = [settings.NLTK_DIR]

            try:
                # Preload the corpus early, to force the lazy loader to transform
                stopwords.ensure_loaded()

                # Do some one time setup
                # Sometimes, somehow, there's multiple threads loading the corpus
                # and it's not thread safe, raising an AttributeError
                if self._stemmer is None:
                    self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
                if self._stop_words is None:
                    self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))

                # Tokenize
                # This splits the content into tokens, roughly words
                words: list[str] = word_tokenize(
                    content,
                    language=settings.NLTK_LANGUAGE,
                )

                meaningful_words = []
                for word in words:
                    # Skip stop words
                    # These are words like "a", "and", "the" which add little meaning
                    if word in self._stop_words:
                        continue
                    # Stem the words
                    # This reduces the words to their stems.
                    # "amazement" returns "amaz"
                    # "amaze" returns "amaz
                    # "amazed" returns "amaz"
                    meaningful_words.append(self._stemmer.stem(word))

                return " ".join(meaningful_words)

            except AttributeError:
                return content

        return content

    def predict_correspondent(self, content: str) -> Optional[int]:
        if self.correspondent_classifier:
            X = self.data_vectorizer.transform([self.preprocess_content(content)])
            correspondent_id = self.correspondent_classifier.predict(X)
            if correspondent_id != -1:
                return correspondent_id
            else:
                return None
        else:
            return None

    def predict_document_type(self, content: str) -> Optional[int]:
        if self.document_type_classifier:
            X = self.data_vectorizer.transform([self.preprocess_content(content)])
            document_type_id = self.document_type_classifier.predict(X)
            if document_type_id != -1:
                return document_type_id
            else:
                return None
        else:
            return None

    def predict_tags(self, content: str) -> list[int]:
        from sklearn.utils.multiclass import type_of_target

        if self.tags_classifier:
            X = self.data_vectorizer.transform([self.preprocess_content(content)])
            y = self.tags_classifier.predict(X)
            tags_ids = self.tags_binarizer.inverse_transform(y)[0]
            if type_of_target(y).startswith("multilabel"):
                # the usual case when there are multiple tags.
                return list(tags_ids)
            elif type_of_target(y) == "binary" and tags_ids != -1:
                # This is for when we have binary classification with only one
                # tag and the result is to assign this tag.
                return [tags_ids]
            else:
                # Usually binary as well with -1 as the result, but we're
                # going to catch everything else here as well.
                return []
        else:
            return []

    def predict_storage_path(self, content: str) -> Optional[int]:
        if self.storage_path_classifier:
            X = self.data_vectorizer.transform([self.preprocess_content(content)])
            storage_path_id = self.storage_path_classifier.predict(X)
            if storage_path_id != -1:
                return storage_path_id
            else:
                return None
        else:
            return None