Markus 69ef26dab0
Feature: Dynamic document storage paths (#916)
* Added devcontainer

* Add feature storage paths

* Exclude tests and add versioning

* Check escaping

* Check escaping

* Check quoting

* Echo

* Escape

* Escape :

* Double escape \

* Escaping

* Remove if

* Escape colon

* Missing \

* Escape :

* Escape all

* test

* Remove sed

* Fix exclude

* Remove SED command

* Add LD_LIBRARY_PATH

* Adjusted to v1.7

* Updated test-cases

* Remove devcontainer

* Removed internal build-file

* Run pre-commit

* Corrected flake8 error

* Adjusted to v1.7

* Updated test-cases

* Corrected flake8 error

* Adjusted to new plural translations

* Small adjustments due to code-review backend

* Adjusted line-break

* Removed PAPERLESS prefix from settings variables

* Corrected style change due to search+replace

* First documentation draft

* Revert changes to Pipfile

* Add sphinx-autobuild with keep-outdated

* Revert merge error that resulted in the wrong storage path being evaluated

* Adjust styles of generated files ...

* Adds additional testing to cover dynamic storage path functionality

* Remove unnecessary condition

* Add hint to edit storage path dialog

* Correct spelling of pathes to paths

* Minor documentation tweaks

* Minor typo

* improving wrapping of filter editor buttons with new storage path button

* Update .gitignore

* Fix select border radius in non input-groups

* Better storage path edit hint

* Add note to edit storage path dialog re document_renamer

* Add note to bulk edit storage path re document_renamer

* Rename FILTER_STORAGE_DIRECTORY to PATH

* Fix broken filter rule parsing

* Show default storage if unspecified

* Remove note re storage path on bulk edit

* Add basic validation of filename variables

Co-authored-by: Markus Kling <markus@markus-kling.net>
Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com>
Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com>
Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 14:42:25 -07:00

189 lines
5.5 KiB
Python

import logging
import re
from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
from documents.models import Tag
# Module-level logger; the functions below report match decisions and
# regular-expression errors on the "paperless.matching" channel.
logger = logging.getLogger("paperless.matching")
def log_reason(matching_model, document, reason):
    """
    Writes a debug log entry explaining why ``matching_model`` matched
    ``document``.

    The entry names the class of the matching model, the model's ``name``,
    the document and the human readable ``reason``.
    """
    message = (
        f"{type(matching_model).__name__} {matching_model.name} matched on document "
        f"{document} because {reason}"
    )
    logger.debug(message)
def match_correspondents(document, classifier):
    """
    Returns the list of correspondents that apply to ``document``.

    A correspondent is selected when its own matching rule accepts the
    document (see ``matches``) or when its primary key equals the
    classifier's prediction for the document content. A falsy ``classifier``
    means there is no prediction.
    """
    predicted_pk = (
        classifier.predict_correspondent(document.content) if classifier else None
    )
    return [
        candidate
        for candidate in Correspondent.objects.all()
        if matches(candidate, document) or candidate.pk == predicted_pk
    ]
def match_document_types(document, classifier):
    """
    Returns the list of document types that apply to ``document``.

    A document type is selected when its own matching rule accepts the
    document (see ``matches``) or when its primary key equals the
    classifier's prediction for the document content. A falsy ``classifier``
    means there is no prediction.
    """
    predicted_pk = (
        classifier.predict_document_type(document.content) if classifier else None
    )
    return [
        candidate
        for candidate in DocumentType.objects.all()
        if matches(candidate, document) or candidate.pk == predicted_pk
    ]
def match_tags(document, classifier):
    """
    Returns the list of tags that apply to ``document``.

    A tag is selected when its own matching rule accepts the document (see
    ``matches``) or when its primary key is among the tag ids predicted by
    the classifier for the document content. A falsy ``classifier`` means
    no tags are predicted.
    """
    predicted_pks = classifier.predict_tags(document.content) if classifier else []
    return [
        candidate
        for candidate in Tag.objects.all()
        if matches(candidate, document) or candidate.pk in predicted_pks
    ]
def match_storage_paths(document, classifier):
    """
    Returns the list of storage paths that apply to ``document``.

    A storage path is selected when its own matching rule accepts the
    document (see ``matches``) or when its primary key equals the
    classifier's prediction for the document content. A falsy ``classifier``
    means there is no prediction.
    """
    predicted_pk = (
        classifier.predict_storage_path(document.content) if classifier else None
    )
    return [
        candidate
        for candidate in StoragePath.objects.all()
        if matches(candidate, document) or candidate.pk == predicted_pk
    ]
def matches(matching_model, document):
    """
    Tells whether ``document`` satisfies the matching rule stored on
    ``matching_model``.

    The rule text (``matching_model.match``) is interpreted according to
    ``matching_model.matching_algorithm``:

    * ``MATCH_ALL``: every keyword from ``_split_match`` occurs as a whole
      word in the document content.
    * ``MATCH_ANY``: at least one keyword occurs as a whole word.
    * ``MATCH_LITERAL``: the rule text occurs verbatim, on word boundaries.
    * ``MATCH_REGEX``: the rule text, used as a regular expression, is found.
      An invalid expression is logged as an error and counts as no match.
    * ``MATCH_FUZZY``: with punctuation removed, the fuzzywuzzy partial ratio
      between rule text and content is at least 90.
    * ``MATCH_AUTO``: always False here, this is decided elsewhere.

    ``matching_model.is_insensitive`` makes the comparison ignore case. An
    empty or whitespace-only rule never matches, and neither does a keyword
    rule that contains no usable keyword. Successful matches are reported
    through ``log_reason``.

    Raises NotImplementedError for any other algorithm value.
    """
    document_content = document.content

    # Check that match is not empty
    if matching_model.match.strip() == "":
        return False

    search_kwargs = {"flags": re.IGNORECASE} if matching_model.is_insensitive else {}
    algorithm = matching_model.matching_algorithm

    if algorithm == MatchingModel.MATCH_ALL or algorithm == MatchingModel.MATCH_ANY:
        # A quoted phrase made only of whitespace (e.g. `" "`) yields an empty
        # keyword. The resulting pattern `\b\b` is found at any word boundary,
        # i.e. in nearly every document, so such keywords are ignored and a
        # rule without any usable keyword is treated like an empty rule.
        words = [word for word in _split_match(matching_model) if word]
        if not words:
            return False

        if algorithm == MatchingModel.MATCH_ALL:
            if not all(
                re.search(rf"\b{word}\b", document_content, **search_kwargs)
                for word in words
            ):
                return False
            log_reason(
                matching_model,
                document,
                f"it contains all of these words: {matching_model.match}",
            )
            return True

        for word in words:
            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                log_reason(matching_model, document, f"it contains this word: {word}")
                return True
        return False

    elif algorithm == MatchingModel.MATCH_LITERAL:
        # The rule text is escaped so that it is searched for verbatim.
        result = bool(
            re.search(
                rf"\b{re.escape(matching_model.match)}\b",
                document_content,
                **search_kwargs,
            ),
        )
        if result:
            log_reason(
                matching_model,
                document,
                f'it contains this string: "{matching_model.match}"',
            )
        return result

    elif algorithm == MatchingModel.MATCH_REGEX:
        try:
            match = re.search(
                re.compile(matching_model.match, **search_kwargs),
                document_content,
            )
        except re.error:
            # The rule text is user supplied and may not be a valid pattern.
            logger.error(
                f"Error while processing regular expression {matching_model.match}",
            )
            return False
        if match:
            log_reason(
                matching_model,
                document,
                f"the string {match.group()} matches the regular expression "
                f"{matching_model.match}",
            )
        return bool(match)

    elif algorithm == MatchingModel.MATCH_FUZZY:
        from fuzzywuzzy import fuzz

        # Drop everything that is neither a word character nor whitespace
        # before comparing.
        match = re.sub(r"[^\w\s]", "", matching_model.match)
        text = re.sub(r"[^\w\s]", "", document_content)
        if matching_model.is_insensitive:
            match = match.lower()
            text = text.lower()
        if fuzz.partial_ratio(match, text) >= 90:
            # TODO: make this better
            log_reason(
                matching_model,
                document,
                f"parts of the document content somehow match the string "
                f"{matching_model.match}",
            )
            return True
        return False

    elif algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
        return False

    else:
        raise NotImplementedError("Unsupported matching algorithm")
def _split_match(matching_model):
"""
Splits the match to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
for t in findterms(matching_model.match)
]