Change how the workflow matching is handled, with the goal to reduce queries, make queries cheaper and reduce function complexity

2025-11-09 03:46:12 -06:00 · 2025-10-10 10:42:00 -07:00
parent 914c007103
commit 7ed488faa9
2 changed files with 139 additions and 151 deletions
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -345,157 +345,147 @@ def consumable_document_matches_workflow(
 def existing_document_matches_workflow(
    document: Document,
    trigger: WorkflowTrigger,
-) -> tuple[bool, str]:
+) -> tuple[bool, str | None]:
    """
    Returns True if the Document matches all filters from the workflow trigger,
    False otherwise. Includes a reason if doesn't match
    """

-    trigger_matched = True
-    reason = ""
-
+    # Check content matching algorithm
    if trigger.matching_algorithm > MatchingModel.MATCH_NONE and not matches(
        trigger,
        document,
    ):
-        reason = (
+        return (
+            False,
            f"Document content matching settings for algorithm '{trigger.matching_algorithm}' did not match",
        )
-        trigger_matched = False
+
+    # Check if any tag filters exist to determine if we need to load document tags
+    trigger_has_tags_qs = trigger.filter_has_tags.all()
+    trigger_has_all_tags_qs = trigger.filter_has_all_tags.all()
+    trigger_has_not_tags_qs = trigger.filter_has_not_tags.all()
+
+    has_tags_filter = trigger_has_tags_qs.exists()
+    has_all_tags_filter = trigger_has_all_tags_qs.exists()
+    has_not_tags_filter = trigger_has_not_tags_qs.exists()
+
+    # Load document tags once if any tag filters exist
+    document_tag_ids = None
+    if has_tags_filter or has_all_tags_filter or has_not_tags_filter:
+        document_tag_ids = set(document.tags.values_list("id", flat=True))

    # Document tags vs trigger has_tags (any of)
-    if trigger.filter_has_tags.all().count() > 0 and (
-        document.tags.filter(
-            id__in=trigger.filter_has_tags.all().values_list("id"),
-        ).count()
-        == 0
-    ):
-        reason = (
-            f"Document tags {document.tags.all()} do not include"
-            f" {trigger.filter_has_tags.all()}",
-        )
-        trigger_matched = False
+    if has_tags_filter:
+        trigger_has_tag_ids = set(trigger_has_tags_qs.values_list("id", flat=True))
+        if not (document_tag_ids & trigger_has_tag_ids):
+            # For error message, load the actual tag objects
+            return (
+                False,
+                f"Document tags {list(document.tags.all())} do not include {list(trigger_has_tags_qs)}",
+            )

    # Document tags vs trigger has_all_tags (all of)
-    if trigger.filter_has_all_tags.all().count() > 0 and trigger_matched:
-        required_tag_ids = set(
-            trigger.filter_has_all_tags.all().values_list("id", flat=True),
-        )
-        document_tag_ids = set(
-            document.tags.all().values_list("id", flat=True),
-        )
-        missing_tags = required_tag_ids - document_tag_ids
-        if missing_tags:
-            reason = (
-                f"Document tags {document.tags.all()} do not contain all of"
-                f" {trigger.filter_has_all_tags.all()}",
+    if has_all_tags_filter:
+        required_tag_ids = set(trigger_has_all_tags_qs.values_list("id", flat=True))
+        if not required_tag_ids.issubset(document_tag_ids):
+            return (
+                False,
+                f"Document tags {list(document.tags.all())} do not contain all of {list(trigger_has_all_tags_qs)}",
            )
-            trigger_matched = False

    # Document tags vs trigger has_not_tags (none of)
-    if (
-        trigger.filter_has_not_tags.all().count() > 0
-        and trigger_matched
-        and document.tags.filter(
-            id__in=trigger.filter_has_not_tags.all().values_list("id"),
-        ).exists()
-    ):
-        reason = (
-            f"Document tags {document.tags.all()} include excluded tags"
-            f" {trigger.filter_has_not_tags.all()}",
-        )
-        trigger_matched = False
+    if has_not_tags_filter:
+        excluded_tag_ids = set(trigger_has_not_tags_qs.values_list("id", flat=True))
+        if document_tag_ids & excluded_tag_ids:
+            return (
+                False,
+                f"Document tags {list(document.tags.all())} include excluded tags {list(trigger_has_not_tags_qs)}",
+            )

    # Document correspondent vs trigger has_correspondent
-    if trigger_matched:
-        if (
-            trigger.filter_has_correspondent is not None
-            and document.correspondent != trigger.filter_has_correspondent
-        ):
-            reason = (
-                f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}",
-            )
-            trigger_matched = False
+    if (
+        trigger.filter_has_correspondent_id is not None
+        and document.correspondent_id != trigger.filter_has_correspondent_id
+    ):
+        return (
+            False,
+            f"Document correspondent {document.correspondent} does not match {trigger.filter_has_correspondent}",
+        )

-        if (
-            trigger.filter_has_not_correspondents.all().count() > 0
-            and document.correspondent
-            and trigger.filter_has_not_correspondents.filter(
-                id=document.correspondent_id,
-            ).exists()
-        ):
-            reason = (
-                f"Document correspondent {document.correspondent} is excluded by"
-                f" {trigger.filter_has_not_correspondents.all()}",
-            )
-            trigger_matched = False
+    if (
+        document.correspondent_id
+        and trigger.filter_has_not_correspondents.filter(
+            id=document.correspondent_id,
+        ).exists()
+    ):
+        return (
+            False,
+            f"Document correspondent {document.correspondent} is excluded by {list(trigger.filter_has_not_correspondents.all())}",
+        )

    # Document document_type vs trigger has_document_type
-    if trigger_matched:
-        if (
-            trigger.filter_has_document_type is not None
-            and document.document_type != trigger.filter_has_document_type
-        ):
-            reason = (
-                f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}",
-            )
-            trigger_matched = False
+    if (
+        trigger.filter_has_document_type_id is not None
+        and document.document_type_id != trigger.filter_has_document_type_id
+    ):
+        return (
+            False,
+            f"Document doc type {document.document_type} does not match {trigger.filter_has_document_type}",
+        )

-        if (
-            trigger.filter_has_not_document_types.all().count() > 0
-            and document.document_type
-            and trigger.filter_has_not_document_types.filter(
-                id=document.document_type_id,
-            ).exists()
-        ):
-            reason = (
-                f"Document doc type {document.document_type} is excluded by"
-                f" {trigger.filter_has_not_document_types.all()}",
-            )
-            trigger_matched = False
+    if (
+        document.document_type_id
+        and trigger.filter_has_not_document_types.filter(
+            id=document.document_type_id,
+        ).exists()
+    ):
+        return (
+            False,
+            f"Document doc type {document.document_type} is excluded by {list(trigger.filter_has_not_document_types.all())}",
+        )

    # Document storage_path vs trigger has_storage_path
-    if trigger_matched:
-        if (
-            trigger.filter_has_storage_path is not None
-            and document.storage_path != trigger.filter_has_storage_path
-        ):
-            reason = (
-                f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}",
-            )
-            trigger_matched = False
+    if (
+        trigger.filter_has_storage_path_id is not None
+        and document.storage_path_id != trigger.filter_has_storage_path_id
+    ):
+        return (
+            False,
+            f"Document storage path {document.storage_path} does not match {trigger.filter_has_storage_path}",
+        )

-        if (
-            trigger.filter_has_not_storage_paths.all().count() > 0
-            and document.storage_path
-            and trigger.filter_has_not_storage_paths.filter(
-                id=document.storage_path_id,
-            ).exists()
-        ):
-            reason = (
-                f"Document storage path {document.storage_path} is excluded by"
-                f" {trigger.filter_has_not_storage_paths.all()}",
-            )
-            trigger_matched = False
+    if (
+        document.storage_path_id
+        and trigger.filter_has_not_storage_paths.filter(
+            id=document.storage_path_id,
+        ).exists()
+    ):
+        return (
+            False,
+            f"Document storage path {document.storage_path} is excluded by {list(trigger.filter_has_not_storage_paths.all())}",
+        )

-    if trigger_matched and trigger.filter_custom_field_query:
+    # Custom field query check
+    if trigger.filter_custom_field_query:
        parser = CustomFieldQueryParser("filter_custom_field_query")
        try:
            custom_field_q, annotations = parser.parse(
                trigger.filter_custom_field_query,
            )
        except serializers.ValidationError:
-            reason = "Invalid custom field query configuration"
-            trigger_matched = False
-        else:
-            qs = (
-                Document.objects.filter(id=document.id)
-                .annotate(**annotations)
-                .filter(custom_field_q)
+            return (False, "Invalid custom field query configuration")
+
+        qs = (
+            Document.objects.filter(id=document.id)
+            .annotate(**annotations)
+            .filter(custom_field_q)
+        )
+        if not qs.exists():
+            return (
+                False,
+                "Document custom fields do not match the configured custom field query",
            )
-            if not qs.exists():
-                reason = "Document custom fields do not match the configured custom field query"
-                trigger_matched = False

    # Document original_filename vs trigger filename
    if (
@@ -507,13 +497,12 @@ def existing_document_matches_workflow(
            trigger.filter_filename.lower(),
        )
    ):
-        reason = (
-            f"Document filename {document.original_filename} does not match"
-            f" {trigger.filter_filename.lower()}",
+        return (
+            False,
+            f"Document filename {document.original_filename} does not match {trigger.filter_filename.lower()}",
        )
-        trigger_matched = False

-    return (trigger_matched, reason)
+    return (True, None)


 def prefilter_documents_by_workflowtrigger(
@@ -526,27 +515,28 @@ def prefilter_documents_by_workflowtrigger(
    document_matches_workflow in run_workflows
    """

-    if trigger.filter_has_tags.all().count() > 0:
-        documents = documents.filter(
-            tags__in=trigger.filter_has_tags.all(),
-        ).distinct()
+    # Filter for documents that have AT LEAST ONE of the specified tags.
+    if trigger.filter_has_tags.exists():
+        documents = documents.filter(tags__in=trigger.filter_has_tags.all()).distinct()

-    if trigger.filter_has_all_tags.all().count() > 0:
-        for tag_id in trigger.filter_has_all_tags.all().values_list("id", flat=True):
-            documents = documents.filter(tags__id=tag_id)
+    # Filter for documents that have ALL of the specified tags.
+    if trigger.filter_has_all_tags.exists():
+        for tag in trigger.filter_has_all_tags.all():
+            documents = documents.filter(tags=tag)
+        # Multiple JOINs can create duplicate results.
        documents = documents.distinct()

-    if trigger.filter_has_not_tags.all().count() > 0:
-        documents = documents.exclude(
-            tags__in=trigger.filter_has_not_tags.all(),
-        ).distinct()
+    # Exclude documents that have ANY of the specified tags.
+    if trigger.filter_has_not_tags.exists():
+        documents = documents.exclude(tags__in=trigger.filter_has_not_tags.all())
+
+    # Correspondent, DocumentType, etc. filtering

    if trigger.filter_has_correspondent is not None:
        documents = documents.filter(
            correspondent=trigger.filter_has_correspondent,
        )
-
-    if trigger.filter_has_not_correspondents.all().count() > 0:
+    if trigger.filter_has_not_correspondents.exists():
        documents = documents.exclude(
            correspondent__in=trigger.filter_has_not_correspondents.all(),
        )
@@ -555,8 +545,7 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.filter(
            document_type=trigger.filter_has_document_type,
        )
-
-    if trigger.filter_has_not_document_types.all().count() > 0:
+    if trigger.filter_has_not_document_types.exists():
        documents = documents.exclude(
            document_type__in=trigger.filter_has_not_document_types.all(),
        )
@@ -565,12 +554,13 @@ def prefilter_documents_by_workflowtrigger(
        documents = documents.filter(
            storage_path=trigger.filter_has_storage_path,
        )
-
-    if trigger.filter_has_not_storage_paths.all().count() > 0:
+    if trigger.filter_has_not_storage_paths.exists():
        documents = documents.exclude(
            storage_path__in=trigger.filter_has_not_storage_paths.all(),
        )

+    # Custom Field & Filename Filtering
+
    if trigger.filter_custom_field_query:
        parser = CustomFieldQueryParser("filter_custom_field_query")
        try:
@@ -582,11 +572,9 @@ def prefilter_documents_by_workflowtrigger(

        documents = documents.annotate(**annotations).filter(custom_field_q)

-    if trigger.filter_filename is not None and len(trigger.filter_filename) > 0:
-        # the true fnmatch will actually run later so we just want a loose filter here
+    if trigger.filter_filename:
        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
-        regex = f"(?i){regex}"
-        documents = documents.filter(original_filename__regex=regex)
+        documents = documents.filter(original_filename__iregex=regex)

    return documents

--- a/src/documents/tests/test_workflows.py
+++ b/src/documents/tests/test_workflows.py
@@ -1083,7 +1083,7 @@ class TestWorkflows(
            )
            expected_str = f"Document did not match {w}"
            self.assertIn(expected_str, cm.output[0])
-            expected_str = f"Document tags {doc.tags.all()} do not include {trigger.filter_has_tags.all()}"
+            expected_str = f"Document tags {list(doc.tags.all())} do not include {list(trigger.filter_has_tags.all())}"
            self.assertIn(expected_str, cm.output[1])

    def test_document_added_no_match_all_tags(self):
@@ -1119,8 +1119,8 @@ class TestWorkflows(
            expected_str = f"Document did not match {w}"
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
-                f"Document tags {doc.tags.all()} do not contain all of"
-                f" {trigger.filter_has_all_tags.all()}"
+                f"Document tags {list(doc.tags.all())} do not contain all of"
+                f" {list(trigger.filter_has_all_tags.all())}"
            )
            self.assertIn(expected_str, cm.output[1])

@@ -1157,8 +1157,8 @@ class TestWorkflows(
            expected_str = f"Document did not match {w}"
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
-                f"Document tags {doc.tags.all()} include excluded tags"
-                f" {trigger.filter_has_not_tags.all()}"
+                f"Document tags {list(doc.tags.all())} include excluded tags"
+                f" {list(trigger.filter_has_not_tags.all())}"
            )
            self.assertIn(expected_str, cm.output[1])

@@ -1194,7 +1194,7 @@ class TestWorkflows(
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
                f"Document correspondent {doc.correspondent} is excluded by"
-                f" {trigger.filter_has_not_correspondents.all()}"
+                f" {list(trigger.filter_has_not_correspondents.all())}"
            )
            self.assertIn(expected_str, cm.output[1])

@@ -1230,7 +1230,7 @@ class TestWorkflows(
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
                f"Document doc type {doc.document_type} is excluded by"
-                f" {trigger.filter_has_not_document_types.all()}"
+                f" {list(trigger.filter_has_not_document_types.all())}"
            )
            self.assertIn(expected_str, cm.output[1])

@@ -1266,7 +1266,7 @@ class TestWorkflows(
            self.assertIn(expected_str, cm.output[0])
            expected_str = (
                f"Document storage path {doc.storage_path} is excluded by"
-                f" {trigger.filter_has_not_storage_paths.all()}"
+                f" {list(trigger.filter_has_not_storage_paths.all())}"
            )
            self.assertIn(expected_str, cm.output[1])

@@ -1335,7 +1335,7 @@ class TestWorkflows(

        matched, reason = existing_document_matches_workflow(doc, trigger)
        self.assertTrue(matched)
-        self.assertEqual(reason, "")
+        self.assertIsNone(reason)

    def test_prefilter_documents_custom_field_query(self):
        trigger = WorkflowTrigger.objects.create(