mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-10-12 02:26:09 -05:00
Change how workflow matching is handled, with the goal of reducing the number of queries, making the remaining queries cheaper, and reducing function complexity
This commit is contained in:
@@ -345,157 +345,147 @@ def consumable_document_matches_workflow(
|
|||||||
def existing_document_matches_workflow(
|
def existing_document_matches_workflow(
|
||||||
document: Document,
|
document: Document,
|
||||||
trigger: WorkflowTrigger,
|
trigger: WorkflowTrigger,
|
||||||
) -> tuple[bool, str]:
|
) -> tuple[bool, str | None]:
|
||||||
"""
|
"""
|
||||||
Returns True if the Document matches all filters from the workflow trigger,
|
Returns True if the Document matches all filters from the workflow trigger,
|
||||||
False otherwise. Includes a reason if doesn't match
|
False otherwise. Includes a reason if doesn't match
|
||||||
"""
|
"""
|
||||||
|
|
||||||
trigger_matched = True
|
# Check content matching algorithm
|
||||||
reason = ""
|
|
||||||
|
|
||||||
if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
|
if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
|
||||||
trigger,
|
trigger,
|
||||||
document,
|
document,
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
|
False,
|
||||||
f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match",
|
f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
# Check if any tag filters exist to determine if we need to load document tags
|
||||||
|
trigger_has_tags_qs = trigger.filter_has_tags.all()
|
||||||
|
trigger_has_all_tags_qs = trigger.filter_has_all_tags.all()
|
||||||
|
trigger_has_not_tags_qs = trigger.filter_has_not_tags.all()
|
||||||
|
|
||||||
|
has_tags_filter = trigger_has_tags_qs.exists()
|
||||||
|
has_all_tags_filter = trigger_has_all_tags_qs.exists()
|
||||||
|
has_not_tags_filter = trigger_has_not_tags_qs.exists()
|
||||||
|
|
||||||
|
# Load document tags once if any tag filters exist
|
||||||
|
document_tag_ids = None
|
||||||
|
if has_tags_filter or has_all_tags_filter or has_not_tags_filter:
|
||||||
|
document_tag_ids = set(document.tags.values_list("id", flat=True))
|
||||||
|
|
||||||
# Document tags vs trigger has_tags (any of)
|
# Document tags vs trigger has_tags (any of)
|
||||||
if trigger.filter_has_tags.all().count() > 0 and (
|
if has_tags_filter:
|
||||||
document.tags.filter(
|
trigger_has_tag_ids = set(trigger_has_tags_qs.values_list("id", flat=True))
|
||||||
id__in=trigger.filter_has_tags.all().values_list("id"),
|
if not (document_tag_ids & trigger_has_tag_ids):
|
||||||
).count()
|
# For error message, load the actual tag objects
|
||||||
== 0
|
return (
|
||||||
):
|
False,
|
||||||
reason = (
|
f"Document tags {list(document.tags.all())} do not include {list(trigger_has_tags_qs)}",
|
||||||
f"Document tags {document.tags.all()} do not include"
|
|
||||||
f" {trigger.filter_has_tags.all()}",
|
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
# Document tags vs trigger has_all_tags (all of)
|
# Document tags vs trigger has_all_tags (all of)
|
||||||
if trigger.filter_has_all_tags.all().count() > 0 and trigger_matched:
|
if has_all_tags_filter:
|
||||||
required_tag_ids = set(
|
required_tag_ids = set(trigger_has_all_tags_qs.values_list("id", flat=True))
|
||||||
trigger.filter_has_all_tags.all().values_list("id", flat=True),
|
if not required_tag_ids.issubset(document_tag_ids):
|
||||||
|
return (
|
||||||
|
False,
|
||||||
|
f"Document tags {list(document.tags.all())} do not contain all of {list(trigger_has_all_tags_qs)}",
|
||||||
)
|
)
|
||||||
document_tag_ids = set(
|
|
||||||
document.tags.all().values_list("id", flat=True),
|
|
||||||
)
|
|
||||||
missing_tags = required_tag_ids - document_tag_ids
|
|
||||||
if missing_tags:
|
|
||||||
reason = (
|
|
||||||
f"Document tags {document.tags.all()} do not contain all of"
|
|
||||||
f" {trigger.filter_has_all_tags.all()}",
|
|
||||||
)
|
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
# Document tags vs trigger has_not_tags (none of)
|
# Document tags vs trigger has_not_tags (none of)
|
||||||
if (
|
if has_not_tags_filter:
|
||||||
trigger.filter_has_not_tags.all().count() > 0
|
excluded_tag_ids = set(trigger_has_not_tags_qs.values_list("id", flat=True))
|
||||||
and trigger_matched
|
if document_tag_ids & excluded_tag_ids:
|
||||||
and document.tags.filter(
|
return (
|
||||||
id__in=trigger.filter_has_not_tags.all().values_list("id"),
|
False,
|
||||||
).exists()
|
f"Document tags {list(document.tags.all())} include excluded tags {list(trigger_has_not_tags_qs)}",
|
||||||
):
|
|
||||||
reason = (
|
|
||||||
f"Document tags {document.tags.all()} include excluded tags"
|
|
||||||
f" {trigger.filter_has_not_tags.all()}",
|
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
# Document correspondent vs trigger has_correspondent
|
# Document correspondent vs trigger has_correspondent
|
||||||
if trigger_matched:
|
|
||||||
if (
|
if (
|
||||||
trigger.filter_has_correspondent is not None
|
trigger.filter_has_correspondent_id is not None
|
||||||
and document.correspondent != trigger.filter_has_correspondent
|
and document.correspondent_id != trigger.filter_has_correspondent_id
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
|
False,
|
||||||
f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}",
|
f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
trigger.filter_has_not_correspondents.all().count() > 0
|
document.correspondent_id
|
||||||
and document.correspondent
|
|
||||||
and trigger.filter_has_not_correspondents.filter(
|
and trigger.filter_has_not_correspondents.filter(
|
||||||
id=document.correspondent_id,
|
id=document.correspondent_id,
|
||||||
).exists()
|
).exists()
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
f"Document correspondent {document.correspondent} is excluded by"
|
False,
|
||||||
f" {trigger.filter_has_not_correspondents.all()}",
|
f"Document correspondent {document.correspondent} is excluded by {list(trigger.filter_has_not_correspondents.all())}",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
# Document document_type vs trigger has_document_type
|
# Document document_type vs trigger has_document_type
|
||||||
if trigger_matched:
|
|
||||||
if (
|
if (
|
||||||
trigger.filter_has_document_type is not None
|
trigger.filter_has_document_type_id is not None
|
||||||
and document.document_type != trigger.filter_has_document_type
|
and document.document_type_id != trigger.filter_has_document_type_id
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
|
False,
|
||||||
f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}",
|
f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
trigger.filter_has_not_document_types.all().count() > 0
|
document.document_type_id
|
||||||
and document.document_type
|
|
||||||
and trigger.filter_has_not_document_types.filter(
|
and trigger.filter_has_not_document_types.filter(
|
||||||
id=document.document_type_id,
|
id=document.document_type_id,
|
||||||
).exists()
|
).exists()
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
f"Document doc type {document.document_type} is excluded by"
|
False,
|
||||||
f" {trigger.filter_has_not_document_types.all()}",
|
f"Document doc type {document.document_type} is excluded by {list(trigger.filter_has_not_document_types.all())}",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
# Document storage_path vs trigger has_storage_path
|
# Document storage_path vs trigger has_storage_path
|
||||||
if trigger_matched:
|
|
||||||
if (
|
if (
|
||||||
trigger.filter_has_storage_path is not None
|
trigger.filter_has_storage_path_id is not None
|
||||||
and document.storage_path != trigger.filter_has_storage_path
|
and document.storage_path_id != trigger.filter_has_storage_path_id
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
|
False,
|
||||||
f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}",
|
f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
trigger.filter_has_not_storage_paths.all().count() > 0
|
document.storage_path_id
|
||||||
and document.storage_path
|
|
||||||
and trigger.filter_has_not_storage_paths.filter(
|
and trigger.filter_has_not_storage_paths.filter(
|
||||||
id=document.storage_path_id,
|
id=document.storage_path_id,
|
||||||
).exists()
|
).exists()
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
f"Document storage path {document.storage_path} is excluded by"
|
False,
|
||||||
f" {trigger.filter_has_not_storage_paths.all()}",
|
f"Document storage path {document.storage_path} is excluded by {list(trigger.filter_has_not_storage_paths.all())}",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
if trigger_matched and trigger.filter_custom_field_query:
|
# Custom field query check
|
||||||
|
if trigger.filter_custom_field_query:
|
||||||
parser = CustomFieldQueryParser("filter_custom_field_query")
|
parser = CustomFieldQueryParser("filter_custom_field_query")
|
||||||
try:
|
try:
|
||||||
custom_field_q, annotations = parser.parse(
|
custom_field_q, annotations = parser.parse(
|
||||||
trigger.filter_custom_field_query,
|
trigger.filter_custom_field_query,
|
||||||
)
|
)
|
||||||
except serializers.ValidationError:
|
except serializers.ValidationError:
|
||||||
reason = "Invalid custom field query configuration"
|
return (False, "Invalid custom field query configuration")
|
||||||
trigger_matched = False
|
|
||||||
else:
|
|
||||||
qs = (
|
qs = (
|
||||||
Document.objects.filter(id=document.id)
|
Document.objects.filter(id=document.id)
|
||||||
.annotate(**annotations)
|
.annotate(**annotations)
|
||||||
.filter(custom_field_q)
|
.filter(custom_field_q)
|
||||||
)
|
)
|
||||||
if not qs.exists():
|
if not qs.exists():
|
||||||
reason = "Document custom fields do not match the configured custom field query"
|
return (
|
||||||
trigger_matched = False
|
False,
|
||||||
|
"Document custom fields do not match the configured custom field query",
|
||||||
|
)
|
||||||
|
|
||||||
# Document original_filename vs trigger filename
|
# Document original_filename vs trigger filename
|
||||||
if (
|
if (
|
||||||
@@ -507,13 +497,12 @@ def existing_document_matches_workflow(
|
|||||||
trigger.filter_filename.lower(),
|
trigger.filter_filename.lower(),
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
reason = (
|
return (
|
||||||
f"Document filename {document.original_filename} does not match"
|
False,
|
||||||
f" {trigger.filter_filename.lower()}",
|
f"Document filename {document.original_filename} does not match {trigger.filter_filename.lower()}",
|
||||||
)
|
)
|
||||||
trigger_matched = False
|
|
||||||
|
|
||||||
return (trigger_matched, reason)
|
return (True, None)
|
||||||
|
|
||||||
|
|
||||||
def prefilter_documents_by_workflowtrigger(
|
def prefilter_documents_by_workflowtrigger(
|
||||||
@@ -526,27 +515,28 @@ def prefilter_documents_by_workflowtrigger(
|
|||||||
document_matches_workflow in run_workflows
|
document_matches_workflow in run_workflows
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if trigger.filter_has_tags.all().count() > 0:
|
# Filter for documents that have AT LEAST ONE of the specified tags.
|
||||||
documents = documents.filter(
|
if trigger.filter_has_tags.exists():
|
||||||
tags__in=trigger.filter_has_tags.all(),
|
documents = documents.filter(tags__in=trigger.filter_has_tags.all()).distinct()
|
||||||
).distinct()
|
|
||||||
|
|
||||||
if trigger.filter_has_all_tags.all().count() > 0:
|
# Filter for documents that have ALL of the specified tags.
|
||||||
for tag_id in trigger.filter_has_all_tags.all().values_list("id", flat=True):
|
if trigger.filter_has_all_tags.exists():
|
||||||
documents = documents.filter(tags__id=tag_id)
|
for tag in trigger.filter_has_all_tags.all():
|
||||||
|
documents = documents.filter(tags=tag)
|
||||||
|
# Multiple JOINs can create duplicate results.
|
||||||
documents = documents.distinct()
|
documents = documents.distinct()
|
||||||
|
|
||||||
if trigger.filter_has_not_tags.all().count() > 0:
|
# Exclude documents that have ANY of the specified tags.
|
||||||
documents = documents.exclude(
|
if trigger.filter_has_not_tags.exists():
|
||||||
tags__in=trigger.filter_has_not_tags.all(),
|
documents = documents.exclude(tags__in=trigger.filter_has_not_tags.all())
|
||||||
).distinct()
|
|
||||||
|
# Correspondent, DocumentType, etc. filtering
|
||||||
|
|
||||||
if trigger.filter_has_correspondent is not None:
|
if trigger.filter_has_correspondent is not None:
|
||||||
documents = documents.filter(
|
documents = documents.filter(
|
||||||
correspondent=trigger.filter_has_correspondent,
|
correspondent=trigger.filter_has_correspondent,
|
||||||
)
|
)
|
||||||
|
if trigger.filter_has_not_correspondents.exists():
|
||||||
if trigger.filter_has_not_correspondents.all().count() > 0:
|
|
||||||
documents = documents.exclude(
|
documents = documents.exclude(
|
||||||
correspondent__in=trigger.filter_has_not_correspondents.all(),
|
correspondent__in=trigger.filter_has_not_correspondents.all(),
|
||||||
)
|
)
|
||||||
@@ -555,8 +545,7 @@ def prefilter_documents_by_workflowtrigger(
|
|||||||
documents = documents.filter(
|
documents = documents.filter(
|
||||||
document_type=trigger.filter_has_document_type,
|
document_type=trigger.filter_has_document_type,
|
||||||
)
|
)
|
||||||
|
if trigger.filter_has_not_document_types.exists():
|
||||||
if trigger.filter_has_not_document_types.all().count() > 0:
|
|
||||||
documents = documents.exclude(
|
documents = documents.exclude(
|
||||||
document_type__in=trigger.filter_has_not_document_types.all(),
|
document_type__in=trigger.filter_has_not_document_types.all(),
|
||||||
)
|
)
|
||||||
@@ -565,12 +554,13 @@ def prefilter_documents_by_workflowtrigger(
|
|||||||
documents = documents.filter(
|
documents = documents.filter(
|
||||||
storage_path=trigger.filter_has_storage_path,
|
storage_path=trigger.filter_has_storage_path,
|
||||||
)
|
)
|
||||||
|
if trigger.filter_has_not_storage_paths.exists():
|
||||||
if trigger.filter_has_not_storage_paths.all().count() > 0:
|
|
||||||
documents = documents.exclude(
|
documents = documents.exclude(
|
||||||
storage_path__in=trigger.filter_has_not_storage_paths.all(),
|
storage_path__in=trigger.filter_has_not_storage_paths.all(),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Custom Field & Filename Filtering
|
||||||
|
|
||||||
if trigger.filter_custom_field_query:
|
if trigger.filter_custom_field_query:
|
||||||
parser = CustomFieldQueryParser("filter_custom_field_query")
|
parser = CustomFieldQueryParser("filter_custom_field_query")
|
||||||
try:
|
try:
|
||||||
@@ -582,11 +572,9 @@ def prefilter_documents_by_workflowtrigger(
|
|||||||
|
|
||||||
documents = documents.annotate(**annotations).filter(custom_field_q)
|
documents = documents.annotate(**annotations).filter(custom_field_q)
|
||||||
|
|
||||||
if trigger.filter_filename is not None and len(trigger.filter_filename) > 0:
|
if trigger.filter_filename:
|
||||||
# the true fnmatch will actually run later so we just want a loose filter here
|
|
||||||
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
|
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
|
||||||
regex = f"(?i){regex}"
|
documents = documents.filter(original_filename__iregex=regex)
|
||||||
documents = documents.filter(original_filename__regex=regex)
|
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
@@ -1083,7 +1083,7 @@ class TestWorkflows(
|
|||||||
)
|
)
|
||||||
expected_str = f"Document did not match {w}"
|
expected_str = f"Document did not match {w}"
|
||||||
self.assertIn(expected_str, cm.output[0])
|
self.assertIn(expected_str, cm.output[0])
|
||||||
expected_str = f"Document tags {doc.tags.all()} do not include {trigger.filter_has_tags.all()}"
|
expected_str = f"Document tags {list(doc.tags.all())} do not include {list(trigger.filter_has_tags.all())}"
|
||||||
self.assertIn(expected_str, cm.output[1])
|
self.assertIn(expected_str, cm.output[1])
|
||||||
|
|
||||||
def test_document_added_no_match_all_tags(self):
|
def test_document_added_no_match_all_tags(self):
|
||||||
@@ -1119,8 +1119,8 @@ class TestWorkflows(
|
|||||||
expected_str = f"Document did not match {w}"
|
expected_str = f"Document did not match {w}"
|
||||||
self.assertIn(expected_str, cm.output[0])
|
self.assertIn(expected_str, cm.output[0])
|
||||||
expected_str = (
|
expected_str = (
|
||||||
f"Document tags {doc.tags.all()} do not contain all of"
|
f"Document tags {list(doc.tags.all())} do not contain all of"
|
||||||
f" {trigger.filter_has_all_tags.all()}"
|
f" {list(trigger.filter_has_all_tags.all())}"
|
||||||
)
|
)
|
||||||
self.assertIn(expected_str, cm.output[1])
|
self.assertIn(expected_str, cm.output[1])
|
||||||
|
|
||||||
@@ -1157,8 +1157,8 @@ class TestWorkflows(
|
|||||||
expected_str = f"Document did not match {w}"
|
expected_str = f"Document did not match {w}"
|
||||||
self.assertIn(expected_str, cm.output[0])
|
self.assertIn(expected_str, cm.output[0])
|
||||||
expected_str = (
|
expected_str = (
|
||||||
f"Document tags {doc.tags.all()} include excluded tags"
|
f"Document tags {list(doc.tags.all())} include excluded tags"
|
||||||
f" {trigger.filter_has_not_tags.all()}"
|
f" {list(trigger.filter_has_not_tags.all())}"
|
||||||
)
|
)
|
||||||
self.assertIn(expected_str, cm.output[1])
|
self.assertIn(expected_str, cm.output[1])
|
||||||
|
|
||||||
@@ -1194,7 +1194,7 @@ class TestWorkflows(
|
|||||||
self.assertIn(expected_str, cm.output[0])
|
self.assertIn(expected_str, cm.output[0])
|
||||||
expected_str = (
|
expected_str = (
|
||||||
f"Document correspondent {doc.correspondent} is excluded by"
|
f"Document correspondent {doc.correspondent} is excluded by"
|
||||||
f" {trigger.filter_has_not_correspondents.all()}"
|
f" {list(trigger.filter_has_not_correspondents.all())}"
|
||||||
)
|
)
|
||||||
self.assertIn(expected_str, cm.output[1])
|
self.assertIn(expected_str, cm.output[1])
|
||||||
|
|
||||||
@@ -1230,7 +1230,7 @@ class TestWorkflows(
|
|||||||
self.assertIn(expected_str, cm.output[0])
|
self.assertIn(expected_str, cm.output[0])
|
||||||
expected_str = (
|
expected_str = (
|
||||||
f"Document doc type {doc.document_type} is excluded by"
|
f"Document doc type {doc.document_type} is excluded by"
|
||||||
f" {trigger.filter_has_not_document_types.all()}"
|
f" {list(trigger.filter_has_not_document_types.all())}"
|
||||||
)
|
)
|
||||||
self.assertIn(expected_str, cm.output[1])
|
self.assertIn(expected_str, cm.output[1])
|
||||||
|
|
||||||
@@ -1266,7 +1266,7 @@ class TestWorkflows(
|
|||||||
self.assertIn(expected_str, cm.output[0])
|
self.assertIn(expected_str, cm.output[0])
|
||||||
expected_str = (
|
expected_str = (
|
||||||
f"Document storage path {doc.storage_path} is excluded by"
|
f"Document storage path {doc.storage_path} is excluded by"
|
||||||
f" {trigger.filter_has_not_storage_paths.all()}"
|
f" {list(trigger.filter_has_not_storage_paths.all())}"
|
||||||
)
|
)
|
||||||
self.assertIn(expected_str, cm.output[1])
|
self.assertIn(expected_str, cm.output[1])
|
||||||
|
|
||||||
@@ -1335,7 +1335,7 @@ class TestWorkflows(
|
|||||||
|
|
||||||
matched, reason = existing_document_matches_workflow(doc, trigger)
|
matched, reason = existing_document_matches_workflow(doc, trigger)
|
||||||
self.assertTrue(matched)
|
self.assertTrue(matched)
|
||||||
self.assertEqual(reason, "")
|
self.assertIsNone(reason)
|
||||||
|
|
||||||
def test_prefilter_documents_custom_field_query(self):
|
def test_prefilter_documents_custom_field_query(self):
|
||||||
trigger = WorkflowTrigger.objects.create(
|
trigger = WorkflowTrigger.objects.create(
|
||||||
|
Reference in New Issue
Block a user