Feature: Dynamic document storage pathes (#916)

* Added devcontainer

* Add feature storage pathes

* Exclude tests and add versioning

* Check escaping

* Check escaping

* Check quoting

* Echo

* Escape

* Escape :

* Double escape \

* Escaping

* Remove if

* Escape colon

* Missing \

* Esacpe :

* Escape all

* test

* Remove sed

* Fix exclude

* Remove SED command

* Add LD_LIBRARY_PATH

* Adjusted to v1.7

* Updated test-cases

* Remove devcontainer

* Removed internal build-file

* Run pre-commit

* Corrected flak8 error

* Adjusted to v1.7

* Updated test-cases

* Corrected flak8 error

* Adjusted to new plural translations

* Small adjustments due to code-review backend

* Adjusted line-break

* Removed PAPERLESS prefix from settings variables

* Corrected style change due to search+replace

* First documentation draft

* Revert changes to Pipfile

* Add sphinx-autobuild with keep-outdated

* Revert merge error that results in wrong storage path is evaluated

* Adjust styles of generated files ...

* Adds additional testing to cover dynamic storage path functionality

* Remove unnecessary condition

* Add hint to edit storage path dialog

* Correct spelling of pathes to paths

* Minor documentation tweaks

* Minor typo

* improving wrapping of filter editor buttons with new storage path button

* Update .gitignore

* Fix select border radius in non input-groups

* Better storage path edit hint

* Add note to edit storage path dialog re document_renamer

* Add note to bulk edit storage path re document_renamer

* Rename FILTER_STORAGE_DIRECTORY to PATH

* Fix broken filter rule parsing

* Show default storage if unspecified

* Remove note re storage path on bulk edit

* Add basic validation of filename variables

Co-authored-by: Markus Kling <markus@markus-kling.net>
Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com>
Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Co-authored-by: Quinn Casey <quinn@quinncasey.com>
This commit is contained in:
Markus
2022-05-19 23:42:25 +02:00
committed by GitHub
parent c3997c9f26
commit 69ef26dab0
67 changed files with 1427 additions and 203 deletions

View File

@@ -59,8 +59,8 @@ def load_classifier():
class DocumentClassifier:
# v7 - Updated scikit-learn package version
FORMAT_VERSION = 7
# v8 - Added storage path classifier
FORMAT_VERSION = 8
def __init__(self):
# hash of the training data. used to prevent re-training when the
@@ -72,6 +72,7 @@ class DocumentClassifier:
self.tags_classifier = None
self.correspondent_classifier = None
self.document_type_classifier = None
self.storage_path_classifier = None
def load(self):
with open(settings.MODEL_FILE, "rb") as f:
@@ -90,6 +91,7 @@ class DocumentClassifier:
self.tags_classifier = pickle.load(f)
self.correspondent_classifier = pickle.load(f)
self.document_type_classifier = pickle.load(f)
self.storage_path_classifier = pickle.load(f)
except Exception:
raise ClassifierModelCorruptError()
@@ -107,6 +109,7 @@ class DocumentClassifier:
pickle.dump(self.tags_classifier, f)
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
pickle.dump(self.storage_path_classifier, f)
if os.path.isfile(target_file):
os.unlink(target_file)
@@ -118,6 +121,7 @@ class DocumentClassifier:
labels_tags = list()
labels_correspondent = list()
labels_document_type = list()
labels_storage_path = list()
# Step 1: Extract and preprocess training data from the database.
logger.debug("Gathering data from database...")
@@ -153,6 +157,13 @@ class DocumentClassifier:
m.update(tag.to_bytes(4, "little", signed=True))
labels_tags.append(tags)
y = -1
sd = doc.storage_path
if sd and sd.matching_algorithm == MatchingModel.MATCH_AUTO:
y = sd.pk
m.update(y.to_bytes(4, "little", signed=True))
labels_storage_path.append(y)
if not data:
raise ValueError("No training data available.")
@@ -172,14 +183,16 @@ class DocumentClassifier:
# it usually is.
num_correspondents = len(set(labels_correspondent) | {-1}) - 1
num_document_types = len(set(labels_document_type) | {-1}) - 1
num_storage_paths = len(set(labels_storage_path) | {-1}) - 1
logger.debug(
"{} documents, {} tag(s), {} correspondent(s), "
"{} document type(s).".format(
"{} document type(s). {} storage path(es)".format(
len(data),
num_tags,
num_correspondents,
num_document_types,
num_storage_paths,
),
)
@@ -242,6 +255,21 @@ class DocumentClassifier:
"classifier.",
)
if num_storage_paths > 0:
logger.debug(
"Training storage paths classifier...",
)
self.storage_path_classifier = MLPClassifier(tol=0.01)
self.storage_path_classifier.fit(
data_vectorized,
labels_storage_path,
)
else:
self.storage_path_classifier = None
logger.debug(
"There are no storage paths. Not training storage path classifier.",
)
self.data_hash = new_data_hash
return True
@@ -288,3 +316,14 @@ class DocumentClassifier:
return []
else:
return []
def predict_storage_path(self, content):
if self.storage_path_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)
if storage_path_id != -1:
return storage_path_id
else:
return None
else:
return None