Format Python code with black

kpj
2022-02-27 15:26:41 +01:00
parent 13885968e3
commit fc695896dd
136 changed files with 6142 additions and 3811 deletions
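Note (not part of the commit): black rewrites each file deterministically around its default 88-character line limit, which is why short wrapped calls in the hunks below collapse onto one line while overlong statements get exploded instead. A minimal sketch of that behaviour using black's own format_str API, fed one of the calls touched in this diff:

import black

# Pre-black form of one logger call from this diff (illustrative input only).
src = (
    "logger.exception(\n"
    '    f"IO error while loading document classification model"\n'
    ")\n"
)

# black.format_str() applies the same rules as the black CLI; since the joined
# call fits within the default 88-character limit, black folds it onto one line.
print(black.format_str(src, mode=black.Mode()))
# -> logger.exception(f"IO error while loading document classification model")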


@@ -39,8 +39,7 @@ def load_classifier():
     try:
         classifier.load()
-    except (ClassifierModelCorruptError,
-            IncompatibleClassifierVersionError):
+    except (ClassifierModelCorruptError, IncompatibleClassifierVersionError):
         # there's something wrong with the model file.
         logger.exception(
             f"Unrecoverable error while loading document "
@@ -49,14 +48,10 @@ def load_classifier():
         os.unlink(settings.MODEL_FILE)
         classifier = None
     except OSError:
-        logger.exception(
-            f"IO error while loading document classification model"
-        )
+        logger.exception(f"IO error while loading document classification model")
         classifier = None
     except Exception:
-        logger.exception(
-            f"Unknown error while loading document classification model"
-        )
+        logger.exception(f"Unknown error while loading document classification model")
         classifier = None
     return classifier
@@ -83,7 +78,8 @@ class DocumentClassifier(object):
             if schema_version != self.FORMAT_VERSION:
                 raise IncompatibleClassifierVersionError(
-                    "Cannor load classifier, incompatible versions.")
+                    "Cannor load classifier, incompatible versions."
+                )
             else:
                 try:
                     self.data_hash = pickle.load(f)
@@ -125,30 +121,37 @@ class DocumentClassifier(object):
         # Step 1: Extract and preprocess training data from the database.
         logger.debug("Gathering data from database...")
         m = hashlib.sha1()
-        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):  # NOQA: E501
+        for doc in Document.objects.order_by("pk").exclude(
+            tags__is_inbox_tag=True
+        ):  # NOQA: E501
             preprocessed_content = preprocess_content(doc.content)
-            m.update(preprocessed_content.encode('utf-8'))
+            m.update(preprocessed_content.encode("utf-8"))
             data.append(preprocessed_content)
             y = -1
             dt = doc.document_type
             if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
                 y = dt.pk
-            m.update(y.to_bytes(4, 'little', signed=True))
+            m.update(y.to_bytes(4, "little", signed=True))
             labels_document_type.append(y)
             y = -1
             cor = doc.correspondent
             if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
                 y = cor.pk
-            m.update(y.to_bytes(4, 'little', signed=True))
+            m.update(y.to_bytes(4, "little", signed=True))
             labels_correspondent.append(y)
-            tags = sorted([tag.pk for tag in doc.tags.filter(
-                matching_algorithm=MatchingModel.MATCH_AUTO
-            )])
+            tags = sorted(
+                [
+                    tag.pk
+                    for tag in doc.tags.filter(
+                        matching_algorithm=MatchingModel.MATCH_AUTO
+                    )
+                ]
+            )
             for tag in tags:
-                m.update(tag.to_bytes(4, 'little', signed=True))
+                m.update(tag.to_bytes(4, "little", signed=True))
             labels_tags.append(tags)
         if not data:
@@ -174,10 +177,7 @@ class DocumentClassifier(object):
         logger.debug(
             "{} documents, {} tag(s), {} correspondent(s), "
             "{} document type(s).".format(
-                len(data),
-                num_tags,
-                num_correspondents,
-                num_document_types
+                len(data), num_tags, num_correspondents, num_document_types
             )
         )
@@ -188,9 +188,7 @@ class DocumentClassifier(object):
         # Step 2: vectorize data
         logger.debug("Vectorizing data...")
         self.data_vectorizer = CountVectorizer(
-            analyzer="word",
-            ngram_range=(1, 2),
-            min_df=0.01
+            analyzer="word", ngram_range=(1, 2), min_df=0.01
         )
         data_vectorized = self.data_vectorizer.fit_transform(data)
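Aside (not from the commit, illustrative only): the CountVectorizer arguments reformatted above mean that both single words and two-word phrases are counted, and terms that occur in fewer than 1% of documents are dropped. A throwaway example:

from sklearn.feature_extraction.text import CountVectorizer

# analyzer="word" + ngram_range=(1, 2): count unigrams and bigrams;
# min_df=0.01: ignore terms present in less than 1% of the documents.
docs = ["invoice from acme", "acme invoice 2022", "tax statement 2022"]
vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0.01)
matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # unigram and bigram vocabulary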
@@ -201,54 +199,41 @@ class DocumentClassifier(object):
             if num_tags == 1:
                 # Special case where only one tag has auto:
                 # Fallback to binary classification.
-                labels_tags = [label[0] if len(label) == 1 else -1
-                               for label in labels_tags]
+                labels_tags = [
+                    label[0] if len(label) == 1 else -1 for label in labels_tags
+                ]
                 self.tags_binarizer = LabelBinarizer()
                 labels_tags_vectorized = self.tags_binarizer.fit_transform(
-                    labels_tags).ravel()
+                    labels_tags
+                ).ravel()
             else:
                 self.tags_binarizer = MultiLabelBinarizer()
-                labels_tags_vectorized = self.tags_binarizer.fit_transform(
-                    labels_tags)
+                labels_tags_vectorized = self.tags_binarizer.fit_transform(labels_tags)
             self.tags_classifier = MLPClassifier(tol=0.01)
             self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
         else:
             self.tags_classifier = None
-            logger.debug(
-                "There are no tags. Not training tags classifier."
-            )
+            logger.debug("There are no tags. Not training tags classifier.")
         if num_correspondents > 0:
-            logger.debug(
-                "Training correspondent classifier..."
-            )
+            logger.debug("Training correspondent classifier...")
             self.correspondent_classifier = MLPClassifier(tol=0.01)
-            self.correspondent_classifier.fit(
-                data_vectorized,
-                labels_correspondent
-            )
+            self.correspondent_classifier.fit(data_vectorized, labels_correspondent)
         else:
             self.correspondent_classifier = None
             logger.debug(
-                "There are no correspondents. Not training correspondent "
-                "classifier."
+                "There are no correspondents. Not training correspondent " "classifier."
             )
         if num_document_types > 0:
-            logger.debug(
-                "Training document type classifier..."
-            )
+            logger.debug("Training document type classifier...")
             self.document_type_classifier = MLPClassifier(tol=0.01)
-            self.document_type_classifier.fit(
-                data_vectorized,
-                labels_document_type
-            )
+            self.document_type_classifier.fit(data_vectorized, labels_document_type)
         else:
             self.document_type_classifier = None
             logger.debug(
-                "There are no document types. Not training document type "
-                "classifier."
+                "There are no document types. Not training document type " "classifier."
             )
         self.data_hash = new_data_hash
@@ -284,10 +269,10 @@ class DocumentClassifier(object):
             X = self.data_vectorizer.transform([preprocess_content(content)])
             y = self.tags_classifier.predict(X)
             tags_ids = self.tags_binarizer.inverse_transform(y)[0]
-            if type_of_target(y).startswith('multilabel'):
+            if type_of_target(y).startswith("multilabel"):
                 # the usual case when there are multiple tags.
                 return list(tags_ids)
-            elif type_of_target(y) == 'binary' and tags_ids != -1:
+            elif type_of_target(y) == "binary" and tags_ids != -1:
                 # This is for when we have binary classification with only one
                 # tag and the result is to assign this tag.
                 return [tags_ids]
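Aside (not from the commit): the two branches reformatted in this last hunk exist because the shape of the prediction depends on how the tags were binarized at training time: several auto-matched tags yield a multilabel indicator matrix, while the single-tag fallback produces a plain binary vector. A small sketch with scikit-learn, using made-up tag primary keys:

from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.utils.multiclass import type_of_target

# Several auto tags: MultiLabelBinarizer builds one indicator column per tag,
# so predictions report as "multilabel-indicator" and inverse_transform
# returns a tuple of tag pks per document.
y_multi = MultiLabelBinarizer().fit_transform([[3, 7], [7], []])
print(type_of_target(y_multi))   # multilabel-indicator

# Exactly one auto tag: the training code falls back to LabelBinarizer plus
# ravel(), so predictions are a flat 0/1 vector and type_of_target() reports
# "binary"; -1 stands for "no tag" in that fallback.
y_single = LabelBinarizer().fit_transform([3, -1, 3]).ravel()
print(type_of_target(y_single))  # binary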