Change how the workflow matching is handled, with the goal to reduce queries, make queries cheaper and reduce function complexity

2025-12-24 02:05:48 -06:00 · 2025-10-10 10:42:00 -07:00
parent 914c007103
commit 7ed488faa9
2 changed files with 139 additions and 151 deletions
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -345,157 +345,147 @@ def consumable_document_matches_workflow(
 def existing_document_matches_workflow(
    document: Document,
    trigger: WorkflowTrigger,
-) -> tuple[bool, str]:
+) -> tuple[bool, str | None]:
    """
    Returns True if the Document matches all filters from the workflow trigger,
    False otherwise. Includes a reason if doesn't match
    """
-    trigger_matched = True
+    # Check content matching algorithm
    reason = ""
    if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
        trigger,
        document,
    ):
-        reason = (
+        return (
            False,
            f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match",
        )
-        trigger_matched = False
+
    # Check if any tag filters exist to determine if we need to load document tags
    trigger_has_tags_qs = trigger.filter_has_tags.all()
    trigger_has_all_tags_qs = trigger.filter_has_all_tags.all()
    trigger_has_not_tags_qs = trigger.filter_has_not_tags.all()
    has_tags_filter = trigger_has_tags_qs.exists()
    has_all_tags_filter = trigger_has_all_tags_qs.exists()
    has_not_tags_filter = trigger_has_not_tags_qs.exists()
    # Load document tags once if any tag filters exist
    document_tag_ids = None
    if has_tags_filter or has_all_tags_filter or has_not_tags_filter:
        document_tag_ids = set(document.tags.values_list("id", flat=True))
    # Document tags vs trigger has_tags (any of)
-    if trigger.filter_has_tags.all().count() > 0 and (
+    if has_tags_filter:
-        document.tags.filter(
+        trigger_has_tag_ids = set(trigger_has_tags_qs.values_list("id", flat=True))
-            id__in=trigger.filter_has_tags.all().values_list("id"),
+        if not (document_tag_ids & trigger_has_tag_ids):
-        ).count()
+            # For error message, load the actual tag objects
-        == 0
+            return (
-    ):
+                False,
-        reason = (
+                f"Document tags {list(document.tags.all())} do not include {list(trigger_has_tags_qs)}",
-            f"Document tags {document.tags.all()} do not include"
+            )
            f" {trigger.filter_has_tags.all()}",
        )
        trigger_matched = False
    # Document tags vs trigger has_all_tags (all of)
-    if trigger.filter_has_all_tags.all().count() > 0 and trigger_matched:
+    if has_all_tags_filter:
-        required_tag_ids = set(
+        required_tag_ids = set(trigger_has_all_tags_qs.values_list("id", flat=True))
-            trigger.filter_has_all_tags.all().values_list("id", flat=True),
+        if not required_tag_ids.issubset(document_tag_ids):
-        )
+            return (
-        document_tag_ids = set(
+                False,
-            document.tags.all().values_list("id", flat=True),
+                f"Document tags {list(document.tags.all())} do not contain all of {list(trigger_has_all_tags_qs)}",
        )
        missing_tags = required_tag_ids - document_tag_ids
        if missing_tags:
            reason = (
                f"Document tags {document.tags.all()} do not contain all of"
                f" {trigger.filter_has_all_tags.all()}",
            )
            trigger_matched = False
    # Document tags vs trigger has_not_tags (none of)
-    if (
+    if has_not_tags_filter:
-        trigger.filter_has_not_tags.all().count() > 0
+        excluded_tag_ids = set(trigger_has_not_tags_qs.values_list("id", flat=True))
-        and trigger_matched
+        if document_tag_ids & excluded_tag_ids:
-        and document.tags.filter(
+            return (
-            id__in=trigger.filter_has_not_tags.all().values_list("id"),
+                False,
-        ).exists()
+                f"Document tags {list(document.tags.all())} include excluded tags {list(trigger_has_not_tags_qs)}",
-    ):
+            )
        reason = (
            f"Document tags {document.tags.all()} include excluded tags"
            f" {trigger.filter_has_not_tags.all()}",
        )
        trigger_matched = False
    # Document correspondent vs trigger has_correspondent
-    if trigger_matched:
+    if (
-        if (
+        trigger.filter_has_correspondent_id is not None
-            trigger.filter_has_correspondent is not None
+        and document.correspondent_id != trigger.filter_has_correspondent_id
-            and document.correspondent != trigger.filter_has_correspondent
+    ):
-        ):
+        return (
-            reason = (
+            False,
-                f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}",
+            f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}",
-            )
+        )
            trigger_matched = False
-        if (
+    if (
-            trigger.filter_has_not_correspondents.all().count() > 0
+        document.correspondent_id
-            and document.correspondent
+        and trigger.filter_has_not_correspondents.filter(
-            and trigger.filter_has_not_correspondents.filter(
+            id=document.correspondent_id,
-                id=document.correspondent_id,
+        ).exists()
-            ).exists()
+    ):
-        ):
+        return (
-            reason = (
+            False,
-                f"Document correspondent {document.correspondent} is excluded by"
+            f"Document correspondent {document.correspondent} is excluded by {list(trigger.filter_has_not_correspondents.all())}",
-                f" {trigger.filter_has_not_correspondents.all()}",
+        )
            )
            trigger_matched = False
    # Document document_type vs trigger has_document_type
-    if trigger_matched:
+    if (
-        if (
+        trigger.filter_has_document_type_id is not None
-            trigger.filter_has_document_type is not None
+        and document.document_type_id != trigger.filter_has_document_type_id
-            and document.document_type != trigger.filter_has_document_type
+    ):
-        ):
+        return (
-            reason = (
+            False,
-                f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}",
+            f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}",
-            )
+        )
            trigger_matched = False
-        if (
+    if (
-            trigger.filter_has_not_document_types.all().count() > 0
+        document.document_type_id
-            and document.document_type
+        and trigger.filter_has_not_document_types.filter(
-            and trigger.filter_has_not_document_types.filter(
+            id=document.document_type_id,
-                id=document.document_type_id,
+        ).exists()
-            ).exists()
+    ):
-        ):
+        return (
-            reason = (
+            False,
-                f"Document doc type {document.document_type} is excluded by"
+            f"Document doc type {document.document_type} is excluded by {list(trigger.filter_has_not_document_types.all())}",
-                f" {trigger.filter_has_not_document_types.all()}",
+        )
            )
            trigger_matched = False
    # Document storage_path vs trigger has_storage_path
-    if trigger_matched:
+    if (
-        if (
+        trigger.filter_has_storage_path_id is not None
-            trigger.filter_has_storage_path is not None
+        and document.storage_path_id != trigger.filter_has_storage_path_id
-            and document.storage_path != trigger.filter_has_storage_path
+    ):
-        ):
+        return (
-            reason = (
+            False,
-                f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}",
+            f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}",
-            )
+        )
            trigger_matched = False
-        if (
+    if (
-            trigger.filter_has_not_storage_paths.all().count() > 0
+        document.storage_path_id
-            and document.storage_path
+        and trigger.filter_has_not_storage_paths.filter(
-            and trigger.filter_has_not_storage_paths.filter(
+            id=document.storage_path_id,
-                id=document.storage_path_id,
+        ).exists()
-            ).exists()
+    ):
-        ):
+        return (
-            reason = (
+            False,
-                f"Document storage path {document.storage_path} is excluded by"
+            f"Document storage path {document.storage_path} is excluded by {list(trigger.filter_has_not_storage_paths.all())}",
-                f" {trigger.filter_has_not_storage_paths.all()}",
+        )
            )
            trigger_matched = False
-    if trigger_matched and trigger.filter_custom_field_query:
+    # Custom field query check
    if trigger.filter_custom_field_query:
        parser = CustomFieldQueryParser("filter_custom_field_query")
        try:
            custom_field_q, annotations = parser.parse(
                trigger.filter_custom_field_query,
            )
        except serializers.ValidationError:
-            reason = "Invalid custom field query configuration"
+            return (False, "Invalid custom field query configuration")
-            trigger_matched = False
+
-        else:
+        qs = (
-            qs = (
+            Document.objects.filter(id=document.id)
-                Document.objects.filter(id=document.id)
+            .annotate(**annotations)
-                .annotate(**annotations)
+            .filter(custom_field_q)
-                .filter(custom_field_q)
+        )
        if not qs.exists():
            return (
                False,
                "Document custom fields do not match the configured custom field query",
            )
            if not qs.exists():
                reason = "Document custom fields do not match the configured custom field query"
                trigger_matched = False
    # Document original_filename vs trigger filename
    if (
@@ -507,13 +497,12 @@ def existing_document_matches_workflow(
            trigger.filter_filename.lower(),
        )
    ):
-        reason = (
+        return (
-            f"Document filename {document.original_filename} does not match"
+            False,
-            f" {trigger.filter_filename.lower()}",
+            f"Document filename {document.original_filename} does not match {trigger.filter_filename.lower()}",
        )
        trigger_matched = False
-    return (trigger_matched, reason)
+    return (True, None)
 def prefilter_documents_by_workflowtrigger(
@@ -526,27 +515,28 @@ def prefilter_documents_by_workflowtrigger(
    document_matches_workflow in run_workflows
    """
-    if trigger.filter_has_tags.all().count() > 0:
+    # Filter for documents that have AT LEAST ONE of the specified tags.
-        documents = documents.filter(
+    if trigger.filter_has_tags.exists():
-            tags__in=trigger.filter_has_tags.all(),
+        documents = documents.filter(tags__in=trigger.filter_has_tags.all()).distinct()
        ).distinct()
-    if trigger.filter_has_all_tags.all().count() > 0:
+    # Filter for documents that have ALL of the specified tags.
-        for tag_id in trigger.filter_has_all_tags.all().values_list("id", flat=True):
+    if trigger.filter_has_all_tags.exists():
-            documents = documents.filter(tags__id=tag_id)
+        for tag in trigger.filter_has_all_tags.all():
            documents = documents.filter(tags=tag)
        # Multiple JOINs can create duplicate results.
        documents = documents.distinct()
-    if trigger.filter_has_not_tags.all().count() > 0:
+    # Exclude documents that have ANY of the specified tags.
-        documents = documents.exclude(
+    if trigger.filter_has_not_tags.exists():
-            tags__in=trigger.filter_has_not_tags.all(),
+        documents = documents.exclude(tags__in=trigger.filter_has_not_tags.all())
-        ).distinct()
+
    # Correspondent, DocumentType, etc. filtering
    if trigger.filter_has_correspondent is not None:
        documents = documents.filter(
            correspondent=trigger.filter_has_correspondent,
        )
-
+    if trigger.filter_has_not_correspondents.exists():
    if trigger.filter_has_not_correspondents.all().count() > 0:
        documents = documents.exclude(
            correspondent__in=trigger.filter_has_not_correspondents.all(),
        )
@@ -555,8 +545,7 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.filter(
            document_type=trigger.filter_has_document_type,
        )
-
+    if trigger.filter_has_not_document_types.exists():
    if trigger.filter_has_not_document_types.all().count() > 0:
        documents = documents.exclude(
            document_type__in=trigger.filter_has_not_document_types.all(),
        )
@@ -565,12 +554,13 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.filter(
            storage_path=trigger.filter_has_storage_path,
        )
-
+    if trigger.filter_has_not_storage_paths.exists():
    if trigger.filter_has_not_storage_paths.all().count() > 0:
        documents = documents.exclude(
            storage_path__in=trigger.filter_has_not_storage_paths.all(),
        )
    # Custom Field & Filename Filtering
    if trigger.filter_custom_field_query:
        parser = CustomFieldQueryParser("filter_custom_field_query")
        try:
@@ -582,11 +572,9 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.annotate(**annotations).filter(custom_field_q)
-    if trigger.filter_filename is not None and len(trigger.filter_filename) > 0:
+    if trigger.filter_filename:
        # the true fnmatch will actually run later so we just want a loose filter here
        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
-        regex = f"(?i){regex}"
+        documents = documents.filter(original_filename__iregex=regex)
        documents = documents.filter(original_filename__regex=regex)
    return documents
--- a/src/documents/tests/test_workflows.py
+++ b/src/documents/tests/test_workflows.py
@@ -1083,7 +1083,7 @@ class TestWorkflows(
            )
            expected_str = f"Document did not match {w}"
            self.assertIn(expected_str, cm.output[0])
-            expected_str = f"Document tags {doc.tags.all()} do not include {trigger.filter_has_tags.all()}"
+            expected_str = f"Document tags {list(doc.tags.all())} do not include {list(trigger.filter_has_tags.all())}"
            self.assertIn(expected_str, cm.output[1])
    def test_document_added_no_match_all_tags(self):
@@ -1119,8 +1119,8 @@ class TestWorkflows(
            expected_str = f"Document did not match {w}"
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
-                f"Document tags {doc.tags.all()} do not contain all of"
+                f"Document tags {list(doc.tags.all())} do not contain all of"
-                f" {trigger.filter_has_all_tags.all()}"
+                f" {list(trigger.filter_has_all_tags.all())}"
            )
            self.assertIn(expected_str, cm.output[1])
@@ -1157,8 +1157,8 @@ class TestWorkflows(
            expected_str = f"Document did not match {w}"
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
-                f"Document tags {doc.tags.all()} include excluded tags"
+                f"Document tags {list(doc.tags.all())} include excluded tags"
-                f" {trigger.filter_has_not_tags.all()}"
+                f" {list(trigger.filter_has_not_tags.all())}"
            )
            self.assertIn(expected_str, cm.output[1])
@@ -1194,7 +1194,7 @@ class TestWorkflows(
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
                f"Document correspondent {doc.correspondent} is excluded by"
-                f" {trigger.filter_has_not_correspondents.all()}"
+                f" {list(trigger.filter_has_not_correspondents.all())}"
            )
            self.assertIn(expected_str, cm.output[1])
@@ -1230,7 +1230,7 @@ class TestWorkflows(
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
                f"Document doc type {doc.document_type} is excluded by"
-                f" {trigger.filter_has_not_document_types.all()}"
+                f" {list(trigger.filter_has_not_document_types.all())}"
            )
            self.assertIn(expected_str, cm.output[1])
@@ -1266,7 +1266,7 @@ class TestWorkflows(
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
                f"Document storage path {doc.storage_path} is excluded by"
-                f" {trigger.filter_has_not_storage_paths.all()}"
+                f" {list(trigger.filter_has_not_storage_paths.all())}"
            )
            self.assertIn(expected_str, cm.output[1])
@@ -1335,7 +1335,7 @@ class TestWorkflows(
        matched, reason = existing_document_matches_workflow(doc, trigger)
        self.assertTrue(matched)
-        self.assertEqual(reason, "")
+        self.assertIsNone(reason)
    def test_prefilter_documents_custom_field_query(self):
        trigger = WorkflowTrigger.objects.create(