Change how workflow matching is handled, with the goal of reducing the number of queries, making queries cheaper, and reducing function complexity

This commit is contained in:
Trenton H
2025-10-10 10:42:00 -07:00
parent 914c007103
commit 7ed488faa9
2 changed files with 139 additions and 151 deletions

View File

@@ -345,157 +345,147 @@ def consumable_document_matches_workflow(
def existing_document_matches_workflow( def existing_document_matches_workflow(
document: Document, document: Document,
trigger: WorkflowTrigger, trigger: WorkflowTrigger,
) -> tuple[bool, str]: ) -> tuple[bool, str | None]:
""" """
Returns True if the Document matches all filters from the workflow trigger, Returns True if the Document matches all filters from the workflow trigger,
False otherwise. Includes a reason if doesn't match False otherwise. Includes a reason if doesn't match
""" """
trigger_matched = True # Check content matching algorithm
reason = ""
if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches( if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
trigger, trigger,
document, document,
): ):
reason = ( return (
False,
f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match", f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match",
) )
trigger_matched = False
# Check if any tag filters exist to determine if we need to load document tags
trigger_has_tags_qs = trigger.filter_has_tags.all()
trigger_has_all_tags_qs = trigger.filter_has_all_tags.all()
trigger_has_not_tags_qs = trigger.filter_has_not_tags.all()
has_tags_filter = trigger_has_tags_qs.exists()
has_all_tags_filter = trigger_has_all_tags_qs.exists()
has_not_tags_filter = trigger_has_not_tags_qs.exists()
# Load document tags once if any tag filters exist
document_tag_ids = None
if has_tags_filter or has_all_tags_filter or has_not_tags_filter:
document_tag_ids = set(document.tags.values_list("id", flat=True))
# Document tags vs trigger has_tags (any of) # Document tags vs trigger has_tags (any of)
if trigger.filter_has_tags.all().count() > 0 and ( if has_tags_filter:
document.tags.filter( trigger_has_tag_ids = set(trigger_has_tags_qs.values_list("id", flat=True))
id__in=trigger.filter_has_tags.all().values_list("id"), if not (document_tag_ids & trigger_has_tag_ids):
).count() # For error message, load the actual tag objects
== 0 return (
): False,
reason = ( f"Document tags {list(document.tags.all())} do not include {list(trigger_has_tags_qs)}",
f"Document tags {document.tags.all()} do not include" )
f" {trigger.filter_has_tags.all()}",
)
trigger_matched = False
# Document tags vs trigger has_all_tags (all of) # Document tags vs trigger has_all_tags (all of)
if trigger.filter_has_all_tags.all().count() > 0 and trigger_matched: if has_all_tags_filter:
required_tag_ids = set( required_tag_ids = set(trigger_has_all_tags_qs.values_list("id", flat=True))
trigger.filter_has_all_tags.all().values_list("id", flat=True), if not required_tag_ids.issubset(document_tag_ids):
) return (
document_tag_ids = set( False,
document.tags.all().values_list("id", flat=True), f"Document tags {list(document.tags.all())} do not contain all of {list(trigger_has_all_tags_qs)}",
)
missing_tags = required_tag_ids - document_tag_ids
if missing_tags:
reason = (
f"Document tags {document.tags.all()} do not contain all of"
f" {trigger.filter_has_all_tags.all()}",
) )
trigger_matched = False
# Document tags vs trigger has_not_tags (none of) # Document tags vs trigger has_not_tags (none of)
if ( if has_not_tags_filter:
trigger.filter_has_not_tags.all().count() > 0 excluded_tag_ids = set(trigger_has_not_tags_qs.values_list("id", flat=True))
and trigger_matched if document_tag_ids & excluded_tag_ids:
and document.tags.filter( return (
id__in=trigger.filter_has_not_tags.all().values_list("id"), False,
).exists() f"Document tags {list(document.tags.all())} include excluded tags {list(trigger_has_not_tags_qs)}",
): )
reason = (
f"Document tags {document.tags.all()} include excluded tags"
f" {trigger.filter_has_not_tags.all()}",
)
trigger_matched = False
# Document correspondent vs trigger has_correspondent # Document correspondent vs trigger has_correspondent
if trigger_matched: if (
if ( trigger.filter_has_correspondent_id is not None
trigger.filter_has_correspondent is not None and document.correspondent_id != trigger.filter_has_correspondent_id
and document.correspondent != trigger.filter_has_correspondent ):
): return (
reason = ( False,
f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}", f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}",
) )
trigger_matched = False
if ( if (
trigger.filter_has_not_correspondents.all().count() > 0 document.correspondent_id
and document.correspondent and trigger.filter_has_not_correspondents.filter(
and trigger.filter_has_not_correspondents.filter( id=document.correspondent_id,
id=document.correspondent_id, ).exists()
).exists() ):
): return (
reason = ( False,
f"Document correspondent {document.correspondent} is excluded by" f"Document correspondent {document.correspondent} is excluded by {list(trigger.filter_has_not_correspondents.all())}",
f" {trigger.filter_has_not_correspondents.all()}", )
)
trigger_matched = False
# Document document_type vs trigger has_document_type # Document document_type vs trigger has_document_type
if trigger_matched: if (
if ( trigger.filter_has_document_type_id is not None
trigger.filter_has_document_type is not None and document.document_type_id != trigger.filter_has_document_type_id
and document.document_type != trigger.filter_has_document_type ):
): return (
reason = ( False,
f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}", f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}",
) )
trigger_matched = False
if ( if (
trigger.filter_has_not_document_types.all().count() > 0 document.document_type_id
and document.document_type and trigger.filter_has_not_document_types.filter(
and trigger.filter_has_not_document_types.filter( id=document.document_type_id,
id=document.document_type_id, ).exists()
).exists() ):
): return (
reason = ( False,
f"Document doc type {document.document_type} is excluded by" f"Document doc type {document.document_type} is excluded by {list(trigger.filter_has_not_document_types.all())}",
f" {trigger.filter_has_not_document_types.all()}", )
)
trigger_matched = False
# Document storage_path vs trigger has_storage_path # Document storage_path vs trigger has_storage_path
if trigger_matched: if (
if ( trigger.filter_has_storage_path_id is not None
trigger.filter_has_storage_path is not None and document.storage_path_id != trigger.filter_has_storage_path_id
and document.storage_path != trigger.filter_has_storage_path ):
): return (
reason = ( False,
f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}", f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}",
) )
trigger_matched = False
if ( if (
trigger.filter_has_not_storage_paths.all().count() > 0 document.storage_path_id
and document.storage_path and trigger.filter_has_not_storage_paths.filter(
and trigger.filter_has_not_storage_paths.filter( id=document.storage_path_id,
id=document.storage_path_id, ).exists()
).exists() ):
): return (
reason = ( False,
f"Document storage path {document.storage_path} is excluded by" f"Document storage path {document.storage_path} is excluded by {list(trigger.filter_has_not_storage_paths.all())}",
f" {trigger.filter_has_not_storage_paths.all()}", )
)
trigger_matched = False
if trigger_matched and trigger.filter_custom_field_query: # Custom field query check
if trigger.filter_custom_field_query:
parser = CustomFieldQueryParser("filter_custom_field_query") parser = CustomFieldQueryParser("filter_custom_field_query")
try: try:
custom_field_q, annotations = parser.parse( custom_field_q, annotations = parser.parse(
trigger.filter_custom_field_query, trigger.filter_custom_field_query,
) )
except serializers.ValidationError: except serializers.ValidationError:
reason = "Invalid custom field query configuration" return (False, "Invalid custom field query configuration")
trigger_matched = False
else: qs = (
qs = ( Document.objects.filter(id=document.id)
Document.objects.filter(id=document.id) .annotate(**annotations)
.annotate(**annotations) .filter(custom_field_q)
.filter(custom_field_q) )
if not qs.exists():
return (
False,
"Document custom fields do not match the configured custom field query",
) )
if not qs.exists():
reason = "Document custom fields do not match the configured custom field query"
trigger_matched = False
# Document original_filename vs trigger filename # Document original_filename vs trigger filename
if ( if (
@@ -507,13 +497,12 @@ def existing_document_matches_workflow(
trigger.filter_filename.lower(), trigger.filter_filename.lower(),
) )
): ):
reason = ( return (
f"Document filename {document.original_filename} does not match" False,
f" {trigger.filter_filename.lower()}", f"Document filename {document.original_filename} does not match {trigger.filter_filename.lower()}",
) )
trigger_matched = False
return (trigger_matched, reason) return (True, None)
def prefilter_documents_by_workflowtrigger( def prefilter_documents_by_workflowtrigger(
@@ -526,27 +515,28 @@ def prefilter_documents_by_workflowtrigger(
document_matches_workflow in run_workflows document_matches_workflow in run_workflows
""" """
if trigger.filter_has_tags.all().count() > 0: # Filter for documents that have AT LEAST ONE of the specified tags.
documents = documents.filter( if trigger.filter_has_tags.exists():
tags__in=trigger.filter_has_tags.all(), documents = documents.filter(tags__in=trigger.filter_has_tags.all()).distinct()
).distinct()
if trigger.filter_has_all_tags.all().count() > 0: # Filter for documents that have ALL of the specified tags.
for tag_id in trigger.filter_has_all_tags.all().values_list("id", flat=True): if trigger.filter_has_all_tags.exists():
documents = documents.filter(tags__id=tag_id) for tag in trigger.filter_has_all_tags.all():
documents = documents.filter(tags=tag)
# Multiple JOINs can create duplicate results.
documents = documents.distinct() documents = documents.distinct()
if trigger.filter_has_not_tags.all().count() > 0: # Exclude documents that have ANY of the specified tags.
documents = documents.exclude( if trigger.filter_has_not_tags.exists():
tags__in=trigger.filter_has_not_tags.all(), documents = documents.exclude(tags__in=trigger.filter_has_not_tags.all())
).distinct()
# Correspondent, DocumentType, etc. filtering
if trigger.filter_has_correspondent is not None: if trigger.filter_has_correspondent is not None:
documents = documents.filter( documents = documents.filter(
correspondent=trigger.filter_has_correspondent, correspondent=trigger.filter_has_correspondent,
) )
if trigger.filter_has_not_correspondents.exists():
if trigger.filter_has_not_correspondents.all().count() > 0:
documents = documents.exclude( documents = documents.exclude(
correspondent__in=trigger.filter_has_not_correspondents.all(), correspondent__in=trigger.filter_has_not_correspondents.all(),
) )
@@ -555,8 +545,7 @@ def prefilter_documents_by_workflowtrigger(
documents = documents.filter( documents = documents.filter(
document_type=trigger.filter_has_document_type, document_type=trigger.filter_has_document_type,
) )
if trigger.filter_has_not_document_types.exists():
if trigger.filter_has_not_document_types.all().count() > 0:
documents = documents.exclude( documents = documents.exclude(
document_type__in=trigger.filter_has_not_document_types.all(), document_type__in=trigger.filter_has_not_document_types.all(),
) )
@@ -565,12 +554,13 @@ def prefilter_documents_by_workflowtrigger(
documents = documents.filter( documents = documents.filter(
storage_path=trigger.filter_has_storage_path, storage_path=trigger.filter_has_storage_path,
) )
if trigger.filter_has_not_storage_paths.exists():
if trigger.filter_has_not_storage_paths.all().count() > 0:
documents = documents.exclude( documents = documents.exclude(
storage_path__in=trigger.filter_has_not_storage_paths.all(), storage_path__in=trigger.filter_has_not_storage_paths.all(),
) )
# Custom Field & Filename Filtering
if trigger.filter_custom_field_query: if trigger.filter_custom_field_query:
parser = CustomFieldQueryParser("filter_custom_field_query") parser = CustomFieldQueryParser("filter_custom_field_query")
try: try:
@@ -582,11 +572,9 @@ def prefilter_documents_by_workflowtrigger(
documents = documents.annotate(**annotations).filter(custom_field_q) documents = documents.annotate(**annotations).filter(custom_field_q)
if trigger.filter_filename is not None and len(trigger.filter_filename) > 0: if trigger.filter_filename:
# the true fnmatch will actually run later so we just want a loose filter here
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$") regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
regex = f"(?i){regex}" documents = documents.filter(original_filename__iregex=regex)
documents = documents.filter(original_filename__regex=regex)
return documents return documents

View File

@@ -1083,7 +1083,7 @@ class TestWorkflows(
) )
expected_str = f"Document did not match {w}" expected_str = f"Document did not match {w}"
self.assertIn(expected_str, cm.output[0]) self.assertIn(expected_str, cm.output[0])
expected_str = f"Document tags {doc.tags.all()} do not include {trigger.filter_has_tags.all()}" expected_str = f"Document tags {list(doc.tags.all())} do not include {list(trigger.filter_has_tags.all())}"
self.assertIn(expected_str, cm.output[1]) self.assertIn(expected_str, cm.output[1])
def test_document_added_no_match_all_tags(self): def test_document_added_no_match_all_tags(self):
@@ -1119,8 +1119,8 @@ class TestWorkflows(
expected_str = f"Document did not match {w}" expected_str = f"Document did not match {w}"
self.assertIn(expected_str, cm.output[0]) self.assertIn(expected_str, cm.output[0])
expected_str = ( expected_str = (
f"Document tags {doc.tags.all()} do not contain all of" f"Document tags {list(doc.tags.all())} do not contain all of"
f" {trigger.filter_has_all_tags.all()}" f" {list(trigger.filter_has_all_tags.all())}"
) )
self.assertIn(expected_str, cm.output[1]) self.assertIn(expected_str, cm.output[1])
@@ -1157,8 +1157,8 @@ class TestWorkflows(
expected_str = f"Document did not match {w}" expected_str = f"Document did not match {w}"
self.assertIn(expected_str, cm.output[0]) self.assertIn(expected_str, cm.output[0])
expected_str = ( expected_str = (
f"Document tags {doc.tags.all()} include excluded tags" f"Document tags {list(doc.tags.all())} include excluded tags"
f" {trigger.filter_has_not_tags.all()}" f" {list(trigger.filter_has_not_tags.all())}"
) )
self.assertIn(expected_str, cm.output[1]) self.assertIn(expected_str, cm.output[1])
@@ -1194,7 +1194,7 @@ class TestWorkflows(
self.assertIn(expected_str, cm.output[0]) self.assertIn(expected_str, cm.output[0])
expected_str = ( expected_str = (
f"Document correspondent {doc.correspondent} is excluded by" f"Document correspondent {doc.correspondent} is excluded by"
f" {trigger.filter_has_not_correspondents.all()}" f" {list(trigger.filter_has_not_correspondents.all())}"
) )
self.assertIn(expected_str, cm.output[1]) self.assertIn(expected_str, cm.output[1])
@@ -1230,7 +1230,7 @@ class TestWorkflows(
self.assertIn(expected_str, cm.output[0]) self.assertIn(expected_str, cm.output[0])
expected_str = ( expected_str = (
f"Document doc type {doc.document_type} is excluded by" f"Document doc type {doc.document_type} is excluded by"
f" {trigger.filter_has_not_document_types.all()}" f" {list(trigger.filter_has_not_document_types.all())}"
) )
self.assertIn(expected_str, cm.output[1]) self.assertIn(expected_str, cm.output[1])
@@ -1266,7 +1266,7 @@ class TestWorkflows(
self.assertIn(expected_str, cm.output[0]) self.assertIn(expected_str, cm.output[0])
expected_str = ( expected_str = (
f"Document storage path {doc.storage_path} is excluded by" f"Document storage path {doc.storage_path} is excluded by"
f" {trigger.filter_has_not_storage_paths.all()}" f" {list(trigger.filter_has_not_storage_paths.all())}"
) )
self.assertIn(expected_str, cm.output[1]) self.assertIn(expected_str, cm.output[1])
@@ -1335,7 +1335,7 @@ class TestWorkflows(
matched, reason = existing_document_matches_workflow(doc, trigger) matched, reason = existing_document_matches_workflow(doc, trigger)
self.assertTrue(matched) self.assertTrue(matched)
self.assertEqual(reason, "") self.assertIsNone(reason)
def test_prefilter_documents_custom_field_query(self): def test_prefilter_documents_custom_field_query(self):
trigger = WorkflowTrigger.objects.create( trigger = WorkflowTrigger.objects.create(