mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-08-16 00:36:22 +00:00)
Feature: Dynamic document storage paths (#916)
* Added devcontainer
* Add feature storage paths
* Exclude tests and add versioning
* Check escaping
* Check escaping
* Check quoting
* Echo
* Escape
* Escape :
* Double escape \
* Escaping
* Remove if
* Escape colon
* Missing \
* Escape :
* Escape all
* test
* Remove sed
* Fix exclude
* Remove SED command
* Add LD_LIBRARY_PATH
* Adjusted to v1.7
* Updated test-cases
* Remove devcontainer
* Removed internal build-file
* Run pre-commit
* Corrected flake8 error
* Adjusted to v1.7
* Updated test-cases
* Corrected flake8 error
* Adjusted to new plural translations
* Small adjustments due to code-review backend
* Adjusted line-break
* Removed PAPERLESS prefix from settings variables
* Corrected style change due to search+replace
* First documentation draft
* Revert changes to Pipfile
* Add sphinx-autobuild with keep-outdated
* Revert merge error that resulted in the wrong storage path being evaluated
* Adjust styles of generated files ...
* Adds additional testing to cover dynamic storage path functionality
* Remove unnecessary condition
* Add hint to edit storage path dialog
* Correct spelling of pathes to paths
* Minor documentation tweaks
* Minor typo
* Improve wrapping of filter editor buttons with new storage path button
* Update .gitignore
* Fix select border radius in non input-groups
* Better storage path edit hint
* Add note to edit storage path dialog re document_renamer
* Add note to bulk edit storage path re document_renamer
* Rename FILTER_STORAGE_DIRECTORY to PATH
* Fix broken filter rule parsing
* Show default storage if unspecified
* Remove note re storage path on bulk edit
* Add basic validation of filename variables

Co-authored-by: Markus Kling <markus@markus-kling.net>
Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com>
Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Co-authored-by: Quinn Casey <quinn@quinncasey.com>
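The diff below changes the document classifier (the hunk headers point at paperless-ngx's src/documents/classifier.py) so that storage paths can be auto-matched the same way correspondents and document types already are. As context for the last bullet above: a storage path is a filename-format-style template, and its placeholders need checking. The following is a minimal sketch of what "basic validation of filename variables" could look like; the variable set is an illustrative subset and validate_storage_path is a hypothetical helper, not paperless-ngx's real implementation:

import string

# Illustrative subset of placeholder names; paperless-ngx supports more.
KNOWN_VARIABLES = {"title", "correspondent", "document_type", "created", "created_year"}

def validate_storage_path(template: str) -> None:
    # Hypothetical helper: walk the template's replacement fields and
    # reject any placeholder that is not a known filename variable.
    for _, field, _, _ in string.Formatter().parse(template):
        if field is not None and field not in KNOWN_VARIABLES:
            raise ValueError(f"unknown filename variable {{{field}}}")

validate_storage_path("{created_year}/{correspondent}/{title}")  # passes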
@@ -59,8 +59,8 @@ def load_classifier():
 
 class DocumentClassifier:
 
-    # v7 - Updated scikit-learn package version
-    FORMAT_VERSION = 7
+    # v8 - Added storage path classifier
+    FORMAT_VERSION = 8
 
     def __init__(self):
         # hash of the training data. used to prevent re-training when the
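Bumping FORMAT_VERSION matters because the classifier is persisted with pickle and the version is written at the front of the model file: a model saved before this commit contains no storage path classifier, so load() must be able to refuse it. A minimal sketch of the version-gate pattern (not paperless-ngx's exact load()):

import pickle

FORMAT_VERSION = 8

def read_model_version(path: str) -> int:
    # The first pickled value in the model file is its format version,
    # so it can be checked before the rest of the file is trusted.
    with open(path, "rb") as f:
        return pickle.load(f)

# e.g. refuse models written before the storage path field existed:
# if read_model_version(model_file) != FORMAT_VERSION:
#     raise RuntimeError("incompatible classifier model version")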
@@ -72,6 +72,7 @@ class DocumentClassifier:
         self.tags_classifier = None
         self.correspondent_classifier = None
         self.document_type_classifier = None
+        self.storage_path_classifier = None
 
     def load(self):
         with open(settings.MODEL_FILE, "rb") as f:
@@ -90,6 +91,7 @@ class DocumentClassifier:
                 self.tags_classifier = pickle.load(f)
                 self.correspondent_classifier = pickle.load(f)
                 self.document_type_classifier = pickle.load(f)
+                self.storage_path_classifier = pickle.load(f)
             except Exception:
                 raise ClassifierModelCorruptError()
 
@@ -107,6 +109,7 @@ class DocumentClassifier:
             pickle.dump(self.tags_classifier, f)
             pickle.dump(self.correspondent_classifier, f)
             pickle.dump(self.document_type_classifier, f)
+            pickle.dump(self.storage_path_classifier, f)
 
         if os.path.isfile(target_file):
             os.unlink(target_file)
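This hunk and the previous one are mirror images: save() pickles the classifiers positionally, and load() unpickles them in the same order, so the new field has to be appended at the end of both sequences to keep the existing fields in place. A self-contained sketch of that positional round-trip:

import pickle
import tempfile

# Order of dumps must equal order of loads; appending the new
# storage path entry last preserves the existing layout.
fields = ["tags", "correspondent", "document_type", "storage_path"]

with tempfile.NamedTemporaryFile(delete=False) as f:
    for value in fields:
        pickle.dump(value, f)
    path = f.name

with open(path, "rb") as f:
    restored = [pickle.load(f) for _ in fields]

assert restored == fields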
@@ -118,6 +121,7 @@ class DocumentClassifier:
         labels_tags = list()
         labels_correspondent = list()
         labels_document_type = list()
+        labels_storage_path = list()
 
         # Step 1: Extract and preprocess training data from the database.
         logger.debug("Gathering data from database...")
@@ -153,6 +157,13 @@ class DocumentClassifier:
                 m.update(tag.to_bytes(4, "little", signed=True))
             labels_tags.append(tags)
 
+            y = -1
+            sd = doc.storage_path
+            if sd and sd.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = sd.pk
+            m.update(y.to_bytes(4, "little", signed=True))
+            labels_storage_path.append(y)
+
         if not data:
             raise ValueError("No training data available.")
 
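Every document gets exactly one storage path label: the path's primary key when that path uses automatic matching, and the sentinel -1 otherwise. The label is also fed into the training-data checksum m, so reassigning a document's storage path changes the hash and forces a retrain. A standalone restatement of the rule; MATCH_AUTO's numeric value and the hash function are assumptions here:

import hashlib

MATCH_AUTO = 6  # assumption: the value of MatchingModel.MATCH_AUTO

def storage_path_label(doc) -> int:
    # pk of an auto-matched storage path, else the -1 "no path" sentinel.
    sp = doc.storage_path
    if sp is not None and sp.matching_algorithm == MATCH_AUTO:
        return sp.pk
    return -1

m = hashlib.md5()  # md5 for illustration; train() uses whatever it already uses
m.update((-1).to_bytes(4, "little", signed=True))  # labels feed the data hash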
@@ -172,14 +183,16 @@ class DocumentClassifier:
         # it usually is.
         num_correspondents = len(set(labels_correspondent) | {-1}) - 1
         num_document_types = len(set(labels_document_type) | {-1}) - 1
+        num_storage_paths = len(set(labels_storage_path) | {-1}) - 1
 
         logger.debug(
             "{} documents, {} tag(s), {} correspondent(s), "
-            "{} document type(s).".format(
+            "{} document type(s). {} storage path(es)".format(
                 len(data),
                 num_tags,
                 num_correspondents,
                 num_document_types,
+                num_storage_paths,
             ),
         )
 
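The counting idiom len(set(labels) | {-1}) - 1 returns the number of distinct real labels regardless of whether the -1 sentinel actually occurs: the union forces the sentinel into the set, and the subtraction removes it again. For example:

# Two real storage paths (3 and 7); the sentinel does not affect the count.
labels_storage_path = [3, -1, 3, 7, -1]
assert len(set(labels_storage_path) | {-1}) - 1 == 2

# Same result when no document carried the sentinel:
assert len(set([3, 7]) | {-1}) - 1 == 2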
@@ -242,6 +255,21 @@ class DocumentClassifier:
                 "classifier.",
             )
 
+        if num_storage_paths > 0:
+            logger.debug(
+                "Training storage paths classifier...",
+            )
+            self.storage_path_classifier = MLPClassifier(tol=0.01)
+            self.storage_path_classifier.fit(
+                data_vectorized,
+                labels_storage_path,
+            )
+        else:
+            self.storage_path_classifier = None
+            logger.debug(
+                "There are no storage paths. Not training storage path classifier.",
+            )
+
         self.data_hash = new_data_hash
 
         return True
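The new branch trains the storage path model the same way the correspondent and document type branches above it do: a scikit-learn MLPClassifier with a loose tolerance, fitted on the already-vectorized document texts. A self-contained sketch with made-up data; the vectorizer settings are illustrative, paperless-ngx configures its own:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

# Made-up corpus and the storage path pk each document was labelled with.
texts = ["invoice acme corp", "acme corp invoice overdue", "tax statement 2021"]
labels_storage_path = [3, 3, 7]

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(texts)

classifier = MLPClassifier(tol=0.01)  # same tolerance as the diff
classifier.fit(data_vectorized, labels_storage_path)

print(classifier.predict(vectorizer.transform(["acme corp invoice"])))  # typically [3]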
@@ -288,3 +316,14 @@ class DocumentClassifier:
                 return []
         else:
             return []
+
+    def predict_storage_path(self, content):
+        if self.storage_path_classifier:
+            X = self.data_vectorizer.transform([preprocess_content(content)])
+            storage_path_id = self.storage_path_classifier.predict(X)
+            if storage_path_id != -1:
+                return storage_path_id
+            else:
+                return None
+        else:
+            return None
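For completeness, a hypothetical caller in the style of the existing predict_* consumers; it assumes a configured paperless-ngx environment and an existing Document instance named document, so it is a sketch rather than a standalone script:

from documents.classifier import load_classifier

classifier = load_classifier()
if classifier is not None:
    # "document" stands for any documents.models.Document instance.
    storage_path_pk = classifier.predict_storage_path(document.content)
    if storage_path_pk is not None:
        print(f"auto-matched storage path pk: {storage_path_pk}")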