mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-08-16 00:36:22 +00:00)
Feature: Dynamic document storage paths (#916)
* Added devcontainer
* Add feature storage paths
* Exclude tests and add versioning
* Check escaping
* Check escaping
* Check quoting
* Echo
* Escape
* Escape :
* Double escape \
* Escaping
* Remove if
* Escape colon
* Missing \
* Escape :
* Escape all
* test
* Remove sed
* Fix exclude
* Remove SED command
* Add LD_LIBRARY_PATH
* Adjusted to v1.7
* Updated test-cases
* Remove devcontainer
* Removed internal build-file
* Run pre-commit
* Corrected flake8 error
* Adjusted to v1.7
* Updated test-cases
* Corrected flake8 error
* Adjusted to new plural translations
* Small adjustments due to code-review backend
* Adjusted line-break
* Removed PAPERLESS prefix from settings variables
* Corrected style change due to search+replace
* First documentation draft
* Revert changes to Pipfile
* Add sphinx-autobuild with keep-outdated
* Revert merge error that resulted in the wrong storage path being evaluated
* Adjust styles of generated files ...
* Adds additional testing to cover dynamic storage path functionality
* Remove unnecessary condition
* Add hint to edit storage path dialog
* Correct spelling of pathes to paths
* Minor documentation tweaks
* Minor typo
* Improve wrapping of filter editor buttons with new storage path button
* Update .gitignore
* Fix select border radius in non input-groups
* Better storage path edit hint
* Add note to edit storage path dialog re document_renamer
* Add note to bulk edit storage path re document_renamer
* Rename FILTER_STORAGE_DIRECTORY to PATH
* Fix broken filter rule parsing
* Show default storage if unspecified
* Remove note re storage path on bulk edit
* Add basic validation of filename variables

Co-authored-by: Markus Kling <markus@markus-kling.net>
Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com>
Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Co-authored-by: Quinn Casey <quinn@quinncasey.com>
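The diff below changes the document classifier (the hunk headers point at paperless-ngx's src/documents/classifier.py) so that storage paths can be auto-matched the same way correspondents and document types already are. As context for the last bullet above: a storage path is a filename-format-style template, and its placeholders need checking. The following is a minimal sketch of what "basic validation of filename variables" could look like; the variable set is an illustrative subset and validate_storage_path is a hypothetical helper, not paperless-ngx's real implementation:

import string

# Illustrative subset of placeholder names; paperless-ngx supports more.
KNOWN_VARIABLES = {"title", "correspondent", "document_type", "created", "created_year"}

def validate_storage_path(template: str) -> None:
    # Hypothetical helper: walk the template's replacement fields and
    # reject any placeholder that is not a known filename variable.
    for _, field, _, _ in string.Formatter().parse(template):
        if field is not None and field not in KNOWN_VARIABLES:
            raise ValueError(f"unknown filename variable {{{field}}}")

validate_storage_path("{created_year}/{correspondent}/{title}")  # passes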
@@ -59,8 +59,8 @@ def load_classifier():
 
 class DocumentClassifier:
 
-    # v7 - Updated scikit-learn package version
-    FORMAT_VERSION = 7
+    # v8 - Added storage path classifier
+    FORMAT_VERSION = 8
 
     def __init__(self):
         # hash of the training data. used to prevent re-training when the
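Bumping FORMAT_VERSION matters because the classifier is persisted with pickle and the version is written at the front of the model file: a model saved before this commit contains no storage path classifier, so load() must be able to refuse it. A minimal sketch of the version-gate pattern (not paperless-ngx's exact load()):

import pickle

FORMAT_VERSION = 8

def read_model_version(path: str) -> int:
    # The first pickled value in the model file is its format version,
    # so it can be checked before the rest of the file is trusted.
    with open(path, "rb") as f:
        return pickle.load(f)

# e.g. refuse models written before the storage path field existed:
# if read_model_version(model_file) != FORMAT_VERSION:
#     raise RuntimeError("incompatible classifier model version")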
@@ -72,6 +72,7 @@ class DocumentClassifier:
         self.tags_classifier = None
         self.correspondent_classifier = None
         self.document_type_classifier = None
+        self.storage_path_classifier = None
 
     def load(self):
         with open(settings.MODEL_FILE, "rb") as f:
@@ -90,6 +91,7 @@ class DocumentClassifier:
                 self.tags_classifier = pickle.load(f)
                 self.correspondent_classifier = pickle.load(f)
                 self.document_type_classifier = pickle.load(f)
+                self.storage_path_classifier = pickle.load(f)
             except Exception:
                 raise ClassifierModelCorruptError()
 
@@ -107,6 +109,7 @@ class DocumentClassifier:
             pickle.dump(self.tags_classifier, f)
             pickle.dump(self.correspondent_classifier, f)
             pickle.dump(self.document_type_classifier, f)
+            pickle.dump(self.storage_path_classifier, f)
 
         if os.path.isfile(target_file):
             os.unlink(target_file)
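This hunk and the previous one are mirror images: save() pickles the classifiers positionally, and load() unpickles them in the same order, so the new field has to be appended at the end of both sequences to keep the existing fields in place. A self-contained sketch of that positional round-trip:

import pickle
import tempfile

# Order of dumps must equal order of loads; appending the new
# storage path entry last preserves the existing layout.
fields = ["tags", "correspondent", "document_type", "storage_path"]

with tempfile.NamedTemporaryFile(delete=False) as f:
    for value in fields:
        pickle.dump(value, f)
    path = f.name

with open(path, "rb") as f:
    restored = [pickle.load(f) for _ in fields]

assert restored == fields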
@@ -118,6 +121,7 @@ class DocumentClassifier:
         labels_tags = list()
         labels_correspondent = list()
         labels_document_type = list()
+        labels_storage_path = list()
 
         # Step 1: Extract and preprocess training data from the database.
         logger.debug("Gathering data from database...")
@@ -153,6 +157,13 @@ class DocumentClassifier:
                 m.update(tag.to_bytes(4, "little", signed=True))
             labels_tags.append(tags)
 
+            y = -1
+            sd = doc.storage_path
+            if sd and sd.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = sd.pk
+            m.update(y.to_bytes(4, "little", signed=True))
+            labels_storage_path.append(y)
+
         if not data:
             raise ValueError("No training data available.")
 
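Every document gets exactly one storage path label: the path's primary key when that path uses automatic matching, and the sentinel -1 otherwise. The label is also fed into the training-data checksum m, so reassigning a document's storage path changes the hash and forces a retrain. A standalone restatement of the rule; MATCH_AUTO's numeric value and the hash function are assumptions here:

import hashlib

MATCH_AUTO = 6  # assumption: the value of MatchingModel.MATCH_AUTO

def storage_path_label(doc) -> int:
    # pk of an auto-matched storage path, else the -1 "no path" sentinel.
    sp = doc.storage_path
    if sp is not None and sp.matching_algorithm == MATCH_AUTO:
        return sp.pk
    return -1

m = hashlib.md5()  # md5 for illustration; train() uses whatever it already uses
m.update((-1).to_bytes(4, "little", signed=True))  # labels feed the data hash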
@@ -172,14 +183,16 @@ class DocumentClassifier:
         # it usually is.
         num_correspondents = len(set(labels_correspondent) | {-1}) - 1
         num_document_types = len(set(labels_document_type) | {-1}) - 1
+        num_storage_paths = len(set(labels_storage_path) | {-1}) - 1
 
         logger.debug(
             "{} documents, {} tag(s), {} correspondent(s), "
-            "{} document type(s).".format(
+            "{} document type(s). {} storage path(es)".format(
                 len(data),
                 num_tags,
                 num_correspondents,
                 num_document_types,
+                num_storage_paths,
             ),
         )
 
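The counting idiom len(set(labels) | {-1}) - 1 returns the number of distinct real labels regardless of whether the -1 sentinel actually occurs: the union forces the sentinel into the set, and the subtraction removes it again. For example:

# Two real storage paths (3 and 7); the sentinel does not affect the count.
labels_storage_path = [3, -1, 3, 7, -1]
assert len(set(labels_storage_path) | {-1}) - 1 == 2

# Same result when no document carried the sentinel:
assert len(set([3, 7]) | {-1}) - 1 == 2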
@@ -242,6 +255,21 @@ class DocumentClassifier:
                 "classifier.",
             )
 
+        if num_storage_paths > 0:
+            logger.debug(
+                "Training storage paths classifier...",
+            )
+            self.storage_path_classifier = MLPClassifier(tol=0.01)
+            self.storage_path_classifier.fit(
+                data_vectorized,
+                labels_storage_path,
+            )
+        else:
+            self.storage_path_classifier = None
+            logger.debug(
+                "There are no storage paths. Not training storage path classifier.",
+            )
+
         self.data_hash = new_data_hash
 
         return True
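The new branch trains the storage path model the same way the correspondent and document type branches above it do: a scikit-learn MLPClassifier with a loose tolerance, fitted on the already-vectorized document texts. A self-contained sketch with made-up data; the vectorizer settings are illustrative, paperless-ngx configures its own:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

# Made-up corpus and the storage path pk each document was labelled with.
texts = ["invoice acme corp", "acme corp invoice overdue", "tax statement 2021"]
labels_storage_path = [3, 3, 7]

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(texts)

classifier = MLPClassifier(tol=0.01)  # same tolerance as the diff
classifier.fit(data_vectorized, labels_storage_path)

print(classifier.predict(vectorizer.transform(["acme corp invoice"])))  # typically [3]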
@@ -288,3 +316,14 @@ class DocumentClassifier:
                 return []
         else:
             return []
+
+    def predict_storage_path(self, content):
+        if self.storage_path_classifier:
+            X = self.data_vectorizer.transform([preprocess_content(content)])
+            storage_path_id = self.storage_path_classifier.predict(X)
+            if storage_path_id != -1:
+                return storage_path_id
+            else:
+                return None
+        else:
+            return None
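For completeness, a hypothetical caller in the style of the existing predict_* consumers; it assumes a configured paperless-ngx environment and an existing Document instance named document, so it is a sketch rather than a standalone script:

from documents.classifier import load_classifier

classifier = load_classifier()
if classifier is not None:
    # "document" stands for any documents.models.Document instance.
    storage_path_pk = classifier.predict_storage_path(document.content)
    if storage_path_pk is not None:
        print(f"auto-matched storage path pk: {storage_path_pk}")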