mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-03 01:56:16 +00:00
Merge matching
This commit is contained in:
@@ -1,435 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from fnmatch import fnmatch
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.permissions import get_objects_for_user_owner_aware
|
||||
from paperless.models import Correspondent
|
||||
from paperless.models import Document
|
||||
from paperless.models import DocumentType
|
||||
from paperless.models import MatchingModel
|
||||
from paperless.models import StoragePath
|
||||
from paperless.models import Tag
|
||||
from paperless.models import Workflow
|
||||
from paperless.models import WorkflowTrigger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from documents.classifier import DocumentClassifier
|
||||
|
||||
logger = logging.getLogger("paperless.matching")
|
||||
|
||||
|
||||
def log_reason(
    matching_model: MatchingModel | WorkflowTrigger,
    document: Document,
    reason: str,
):
    """
    Write a debug log line explaining why ``matching_model`` matched.

    The line contains the concrete class name of ``matching_model``, its
    ``name`` attribute (or ``str(matching_model)`` when it has no such
    attribute), the document and the supplied ``reason`` text.
    """
    if hasattr(matching_model, "name"):
        name = matching_model.name
    else:
        # Objects without a name are identified by their string form instead.
        name = str(matching_model)
    class_name = type(matching_model).__name__

    logger.debug(
        f"{class_name} {name} matched on document {document} because {reason}",
    )
|
||||
|
||||
|
||||
def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
    """
    Return the list of correspondents that match ``document``.

    A correspondent is kept when ``matches()`` accepts it, or when it uses
    ``MatchingModel.MATCH_AUTO`` and its primary key equals the one predicted
    by ``classifier`` (no prediction is made when ``classifier`` is falsy).

    When ``user`` is not given, the document owner is used. With a user, the
    candidates come from ``get_objects_for_user_owner_aware`` for the
    ``documents.view_correspondent`` permission; without one, all
    correspondents are considered.
    """
    predicted_pk = None
    if classifier:
        predicted_pk = classifier.predict_correspondent(document.content)

    effective_user = user
    if effective_user is None and document.owner is not None:
        effective_user = document.owner

    if effective_user is None:
        candidates = Correspondent.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            effective_user,
            "documents.view_correspondent",
            Correspondent,
        )

    return [
        candidate
        for candidate in candidates
        if matches(candidate, document)
        or (
            candidate.pk == predicted_pk
            and candidate.matching_algorithm == MatchingModel.MATCH_AUTO
        )
    ]
|
||||
|
||||
|
||||
def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
    """
    Return the list of document types that match ``document``.

    A document type is kept when ``matches()`` accepts it, or when it uses
    ``MatchingModel.MATCH_AUTO`` and its primary key equals the one predicted
    by ``classifier`` (no prediction is made when ``classifier`` is falsy).

    When ``user`` is not given, the document owner is used. With a user, the
    candidates come from ``get_objects_for_user_owner_aware`` for the
    ``documents.view_documenttype`` permission; without one, all document
    types are considered.
    """
    predicted_pk = None
    if classifier:
        predicted_pk = classifier.predict_document_type(document.content)

    effective_user = user
    if effective_user is None and document.owner is not None:
        effective_user = document.owner

    if effective_user is None:
        candidates = DocumentType.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            effective_user,
            "documents.view_documenttype",
            DocumentType,
        )

    return [
        candidate
        for candidate in candidates
        if matches(candidate, document)
        or (
            candidate.pk == predicted_pk
            and candidate.matching_algorithm == MatchingModel.MATCH_AUTO
        )
    ]
|
||||
|
||||
|
||||
def match_tags(document: Document, classifier: DocumentClassifier, user=None):
    """
    Return the list of tags that match ``document``.

    A tag is kept when ``matches()`` accepts it, or when it uses
    ``MatchingModel.MATCH_AUTO`` and its primary key is among the tag ids
    predicted by ``classifier`` (an empty prediction is used when
    ``classifier`` is falsy).

    When ``user`` is not given, the document owner is used. With a user, the
    candidates come from ``get_objects_for_user_owner_aware`` for the
    ``documents.view_tag`` permission; without one, all tags are considered.
    """
    predicted_tag_ids = []
    if classifier:
        predicted_tag_ids = classifier.predict_tags(document.content)

    effective_user = user
    if effective_user is None and document.owner is not None:
        effective_user = document.owner

    if effective_user is None:
        candidates = Tag.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            effective_user,
            "documents.view_tag",
            Tag,
        )

    return [
        tag
        for tag in candidates
        if matches(tag, document)
        or (
            tag.matching_algorithm == MatchingModel.MATCH_AUTO
            and tag.pk in predicted_tag_ids
        )
    ]
|
||||
|
||||
|
||||
def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
    """
    Return the list of storage paths that match ``document``.

    A storage path is kept when ``matches()`` accepts it, or when it uses
    ``MatchingModel.MATCH_AUTO`` and its primary key equals the one predicted
    by ``classifier`` (no prediction is made when ``classifier`` is falsy).

    When ``user`` is not given, the document owner is used. With a user, the
    candidates come from ``get_objects_for_user_owner_aware`` for the
    ``documents.view_storagepath`` permission; without one, all storage paths
    are considered.
    """
    predicted_pk = None
    if classifier:
        predicted_pk = classifier.predict_storage_path(document.content)

    effective_user = user
    if effective_user is None and document.owner is not None:
        effective_user = document.owner

    if effective_user is None:
        candidates = StoragePath.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            effective_user,
            "documents.view_storagepath",
            StoragePath,
        )

    return [
        candidate
        for candidate in candidates
        if matches(candidate, document)
        or (
            candidate.pk == predicted_pk
            and candidate.matching_algorithm == MatchingModel.MATCH_AUTO
        )
    ]
|
||||
|
||||
|
||||
def matches(matching_model: MatchingModel, document: Document):
    """
    Tell whether ``document.content`` satisfies the rule of ``matching_model``.

    An empty or whitespace-only ``match`` string never matches. Otherwise the
    outcome depends on ``matching_model.matching_algorithm``:

    * ``MATCH_NONE`` / ``MATCH_AUTO``: always ``False`` here.
    * ``MATCH_ALL``: every keyword from ``_split_match`` is found as a whole
      word.
    * ``MATCH_ANY``: at least one keyword is found as a whole word.
    * ``MATCH_LITERAL``: the escaped ``match`` string is found between word
      boundaries.
    * ``MATCH_REGEX``: ``match`` is searched as a regular expression; an
      invalid expression is logged as an error and counts as no match.
    * ``MATCH_FUZZY``: punctuation is removed from both strings and
      ``rapidfuzz.fuzz.partial_ratio`` is called with a score cutoff of 90.

    ``is_insensitive`` makes the regex based searches case-insensitive and
    lower-cases both sides of the fuzzy comparison. Successful matches are
    reported through ``log_reason``.

    Raises ``NotImplementedError`` for any other algorithm value.
    """
    document_content = document.content

    # Check that match is not empty
    if not matching_model.match.strip():
        return False

    flags = re.IGNORECASE if matching_model.is_insensitive else 0
    algorithm = matching_model.matching_algorithm

    if algorithm == MatchingModel.MATCH_NONE:
        return False

    if algorithm == MatchingModel.MATCH_ALL:
        found_all = all(
            re.search(rf"\b{word}\b", document_content, flags=flags)
            for word in _split_match(matching_model)
        )
        if not found_all:
            return False
        log_reason(
            matching_model,
            document,
            f"it contains all of these words: {matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_ANY:
        for word in _split_match(matching_model):
            if not re.search(rf"\b{word}\b", document_content, flags=flags):
                continue
            log_reason(matching_model, document, f"it contains this word: {word}")
            return True
        return False

    if algorithm == MatchingModel.MATCH_LITERAL:
        literal_pattern = rf"\b{re.escape(matching_model.match)}\b"
        found = re.search(literal_pattern, document_content, flags=flags)
        if found is None:
            return False
        log_reason(
            matching_model,
            document,
            f'it contains this string: "{matching_model.match}"',
        )
        return True

    if algorithm == MatchingModel.MATCH_REGEX:
        try:
            match = re.compile(matching_model.match, flags=flags).search(
                document_content,
            )
        except re.error:
            # A broken user supplied expression must not abort matching.
            logger.error(
                f"Error while processing regular expression {matching_model.match}",
            )
            return False
        if match is None:
            return False
        log_reason(
            matching_model,
            document,
            f"the string {match.group()} matches the regular expression "
            f"{matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_FUZZY:
        from rapidfuzz import fuzz

        # Drop everything that is neither a word character nor whitespace.
        needle = re.sub(r"[^\w\s]", "", matching_model.match)
        haystack = re.sub(r"[^\w\s]", "", document_content)
        if matching_model.is_insensitive:
            needle = needle.lower()
            haystack = haystack.lower()
        if not fuzz.partial_ratio(needle, haystack, score_cutoff=90):
            return False
        # TODO: make this better
        log_reason(
            matching_model,
            document,
            f"parts of the document content somehow match the string "
            f"{matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
        return False

    raise NotImplementedError("Unsupported matching algorithm")
|
||||
|
||||
|
||||
def _split_match(matching_model):
|
||||
"""
|
||||
Splits the match to individual keywords, getting rid of unnecessary
|
||||
spaces and grouping quoted words together.
|
||||
|
||||
Example:
|
||||
' some random words "with quotes " and spaces'
|
||||
==>
|
||||
["some", "random", "words", "with+quotes", "and", "spaces"]
|
||||
"""
|
||||
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
|
||||
normspace = re.compile(r"\s+").sub
|
||||
return [
|
||||
# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
|
||||
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
|
||||
for t in findterms(matching_model.match)
|
||||
]
|
||||
|
||||
|
||||
def consumable_document_matches_workflow(
    document: ConsumableDocument,
    trigger: WorkflowTrigger,
) -> tuple[bool, str]:
    """
    Check a ConsumableDocument against every filter of a workflow trigger.

    The filters checked are, in order: document source, mail rule, file name
    (case-insensitive glob) and file path (glob). A filter that is unset or
    empty on the trigger is skipped.

    Returns a ``(matched, reason)`` tuple. ``matched`` is True only when no
    filter rejected the document. ``reason`` is always a string: empty on
    success, otherwise the message of the last filter that failed.
    """

    trigger_matched = True
    reason = ""

    # Document source vs trigger source
    if len(trigger.sources) > 0 and document.source not in [
        int(x) for x in trigger.sources
    ]:
        # NOTE: no trailing comma inside the parentheses, otherwise the
        # message silently becomes a 1-tuple instead of a str.
        reason = (
            f"Document source {document.source.name} not in"
            f" {[DocumentSource(int(x)).name for x in trigger.sources]}"
        )
        trigger_matched = False

    # Document mail rule vs trigger mail rule
    if (
        trigger.filter_mailrule is not None
        and document.mailrule_id != trigger.filter_mailrule.pk
    ):
        reason = (
            f"Document mail rule {document.mailrule_id}"
            f" != {trigger.filter_mailrule.pk}"
        )
        trigger_matched = False

    # Document filename vs trigger filename
    if (
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
        and not fnmatch(
            document.original_file.name.lower(),
            trigger.filter_filename.lower(),
        )
    ):
        reason = (
            f"Document filename {document.original_file.name} does not match"
            f" {trigger.filter_filename.lower()}"
        )
        trigger_matched = False

    # Document path vs trigger path
    if (
        trigger.filter_path is not None
        and len(trigger.filter_path) > 0
        and not fnmatch(
            document.original_file,
            trigger.filter_path,
        )
    ):
        reason = (
            f"Document path {document.original_file}"
            f" does not match {trigger.filter_path}"
        )
        trigger_matched = False

    return (trigger_matched, reason)
|
||||
|
||||
|
||||
def existing_document_matches_workflow(
    document: Document,
    trigger: WorkflowTrigger,
) -> tuple[bool, str]:
    """
    Check a stored Document against every filter of a workflow trigger.

    The checks are, in order: content matching via ``matches()`` (only when
    the trigger's algorithm is greater than ``MatchingModel.MATCH_NONE``),
    required tags (the document needs at least one of them), correspondent,
    document type and original file name (case-insensitive glob). A filter
    that is unset or empty on the trigger is skipped, and the file name check
    is also skipped when the document has no original file name.

    Returns a ``(matched, reason)`` tuple. ``matched`` is True only when no
    check rejected the document. ``reason`` is always a string: empty on
    success, otherwise the message of the last check that failed.
    """

    trigger_matched = True
    reason = ""

    if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
        trigger,
        document,
    ):
        # NOTE: reasons are plain strings; a trailing comma inside the
        # parentheses would turn them into 1-tuples.
        reason = (
            f"Document content matching settings for algorithm "
            f"'{trigger.matching_algorithm}' did not match"
        )
        trigger_matched = False

    # Document tags vs trigger has_tags
    if (
        trigger.filter_has_tags.all().count() > 0
        and document.tags.filter(
            id__in=trigger.filter_has_tags.all().values_list("id"),
        ).count()
        == 0
    ):
        reason = (
            f"Document tags {document.tags.all()} do not include"
            f" {trigger.filter_has_tags.all()}"
        )
        trigger_matched = False

    # Document correspondent vs trigger has_correspondent
    if (
        trigger.filter_has_correspondent is not None
        and document.correspondent != trigger.filter_has_correspondent
    ):
        reason = (
            f"Document correspondent {document.correspondent} does not match "
            f"{trigger.filter_has_correspondent}"
        )
        trigger_matched = False

    # Document document_type vs trigger has_document_type
    if (
        trigger.filter_has_document_type is not None
        and document.document_type != trigger.filter_has_document_type
    ):
        reason = (
            f"Document doc type {document.document_type} does not match "
            f"{trigger.filter_has_document_type}"
        )
        trigger_matched = False

    # Document original_filename vs trigger filename
    if (
        trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
        and document.original_filename is not None
        and not fnmatch(
            document.original_filename.lower(),
            trigger.filter_filename.lower(),
        )
    ):
        reason = (
            f"Document filename {document.original_filename} does not match"
            f" {trigger.filter_filename.lower()}"
        )
        trigger_matched = False

    return (trigger_matched, reason)
|
||||
|
||||
|
||||
def document_matches_workflow(
    document: ConsumableDocument | Document,
    workflow: Workflow,
    trigger_type: WorkflowTrigger.WorkflowTriggerType,
) -> bool:
    """
    Tell whether ``document`` satisfies any trigger of ``workflow`` that has
    the given ``trigger_type``.

    Consumption triggers are evaluated with
    ``consumable_document_matches_workflow``; document added, document updated
    and scheduled triggers with ``existing_document_matches_workflow``. The
    first trigger that matches ends the search with True. A workflow without
    triggers of the requested type never matches.

    Raises ``Exception`` for a trigger type that is not handled here.
    """
    if workflow.triggers.filter(type=trigger_type).count() == 0:
        logger.info(f"Document did not match {workflow}")
        logger.debug(f"No matching triggers with type {trigger_type} found")
        return False

    trigger_types = WorkflowTrigger.WorkflowTriggerType
    trigger_matched = True

    for trigger in workflow.triggers.filter(type=trigger_type):
        if trigger_type == trigger_types.CONSUMPTION:
            checker = consumable_document_matches_workflow
        elif trigger_type in (
            trigger_types.DOCUMENT_ADDED,
            trigger_types.DOCUMENT_UPDATED,
            trigger_types.SCHEDULED,
        ):
            checker = existing_document_matches_workflow
        else:
            # New trigger types need to be explicitly checked above
            raise Exception(f"Trigger type {trigger_type} not yet supported")

        trigger_matched, reason = checker(document, trigger)

        if trigger_matched:
            logger.info(f"Document matched {trigger} from {workflow}")
            # matched, bail early
            return True

        logger.info(f"Document did not match {workflow}")
        logger.debug(reason)

    return trigger_matched
|
@@ -23,7 +23,6 @@ from django.utils import timezone
|
||||
from filelock import FileLock
|
||||
from guardian.shortcuts import remove_perm
|
||||
|
||||
from documents import matching
|
||||
from documents.caching import clear_document_caches
|
||||
from documents.file_handling import create_source_path_directory
|
||||
from documents.file_handling import delete_empty_directories
|
||||
@@ -32,6 +31,7 @@ from documents.mail import send_email
|
||||
from documents.permissions import get_objects_for_user_owner_aware
|
||||
from documents.permissions import set_permissions_for_object
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from paperless import matching
|
||||
from paperless.models import Correspondent
|
||||
from paperless.models import CustomField
|
||||
from paperless.models import CustomFieldInstance
|
||||
|
@@ -8,8 +8,8 @@ from django.contrib.auth.models import User
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents import matching
|
||||
from documents.signals import document_consumption_finished
|
||||
from paperless import matching
|
||||
from paperless.models import Correspondent
|
||||
from paperless.models import Document
|
||||
from paperless.models import DocumentType
|
||||
|
@@ -24,12 +24,12 @@ if TYPE_CHECKING:
|
||||
from documents import tasks
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.matching import document_matches_workflow
|
||||
from documents.signals import document_consumption_finished
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import DummyProgressManager
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from documents.tests.utils import SampleDirMixin
|
||||
from paperless.matching import document_matches_workflow
|
||||
from paperless.models import Correspondent
|
||||
from paperless.models import CustomField
|
||||
from paperless.models import CustomFieldInstance
|
||||
|
Reference in New Issue
Block a user