def _filename_glob_to_loose_regex(glob: str) -> str:
    """
    Translate an fnmatch-style glob into a loose, backend-neutral regex.

    fnmatch.translate() produces a pattern meant for Python's re module; on
    current CPython versions it is wrapped in an inline-flag group and ends
    with an end-of-string escape, and patterns with several "*" additionally
    use group constructs. Django's regex lookups hand the pattern to the
    database, which uses its own regex dialect, so only syntax shared by the
    common engines is emitted here: "^", "$", "." and ".*" plus
    backslash-escaped literals.

    The translation is intentionally permissive because the exact fnmatch
    check still runs afterwards:
      - "*" (or a run of them) becomes ".*"
      - "?" becomes "."
      - a complete "[...]" class becomes "." (any single character)
      - an unterminated "[" and every other character is matched literally

    NOTE(review): "." is used without a dot-matches-newline flag, so a
    filename containing a line break could be skipped on engines where "."
    excludes newlines.

    :param glob: shell-style pattern as accepted by fnmatch
    :return: anchored regular expression as a string
    """
    pieces: list[str] = []
    pos = 0
    length = len(glob)
    while pos < length:
        char = glob[pos]
        pos += 1
        if char == "*":
            # consecutive stars are equivalent to a single one
            if not pieces or pieces[-1] != ".*":
                pieces.append(".*")
        elif char == "?":
            pieces.append(".")
        elif char == "[":
            # locate the closing bracket the same way fnmatch does: a leading
            # "!" and a "]" directly after the opener belong to the class
            end = pos
            if end < length and glob[end] == "!":
                end += 1
            if end < length and glob[end] == "]":
                end += 1
            while end < length and glob[end] != "]":
                end += 1
            if end >= length:
                # no closing bracket: fnmatch treats the "[" as a literal
                pieces.append(re.escape(char))
            else:
                # a class matches exactly one character, "." covers them all
                pieces.append(".")
                pos = end + 1
        else:
            pieces.append(re.escape(char))
    return "^" + "".join(pieces) + "$"


def prefilter_documents_by_workflowtrigger(
    documents: QuerySet[Document],
    trigger: WorkflowTrigger,
) -> QuerySet[Document]:
    """
    To prevent scheduled workflows checking every document, we prefilter the
    documents by the workflow trigger filters. This is done before e.g.
    document_matches_workflow in run_workflows

    Only the tag, correspondent, document type and filename filters of the
    trigger are applied, and each one only if it is set on the trigger.

    :param documents: queryset of candidate documents
    :param trigger: workflow trigger whose filters are applied
    :return: the queryset narrowed by every filter set on the trigger
    """

    required_tags = trigger.filter_has_tags.all()
    # exists() avoids counting every tag just to learn whether there is one
    if required_tags.exists():
        # distinct() because a document carrying several of the tags would
        # otherwise be returned once per matching tag
        documents = documents.filter(tags__in=required_tags).distinct()

    if trigger.filter_has_correspondent is not None:
        documents = documents.filter(
            correspondent=trigger.filter_has_correspondent,
        )

    if trigger.filter_has_document_type is not None:
        documents = documents.filter(
            document_type=trigger.filter_has_document_type,
        )

    # covers both None and the empty string
    if trigger.filter_filename:
        # the true fnmatch will actually run later so we just want a loose
        # filter here; iregex lets each database backend apply its own
        # case-insensitive matching instead of relying on an inline "(?i)"
        documents = documents.filter(
            original_filename__iregex=_filename_glob_to_loose_regex(
                trigger.filter_filename,
            ),
        )

    return documents
def test_workflow_scheduled_filters_queryset(self):
    """
    GIVEN:
        - Existing workflow with a scheduled trigger that filters by
          filename, document type, correspondent and tag
        - Ten documents sharing the correspondent, tag and filename
          pattern, every second one also having the document type
    WHEN:
        - All documents are passed to prefilter_documents_by_workflowtrigger
    THEN:
        - Only the five documents with the document type remain
    """
    scheduled_trigger = WorkflowTrigger.objects.create(
        type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
        schedule_offset_days=-7,
        schedule_date_field=WorkflowTrigger.ScheduleDateField.CREATED,
        filter_filename="*sample*",
        filter_has_document_type=self.dt,
        filter_has_correspondent=self.c,
    )
    scheduled_trigger.filter_has_tags.set([self.t1])
    scheduled_trigger.save()
    owner_action = WorkflowAction.objects.create(assign_owner=self.user2)
    workflow = Workflow.objects.create(name="Workflow 1", order=0)
    workflow.triggers.add(scheduled_trigger)
    workflow.actions.add(owner_action)
    workflow.save()

    for i in range(10):
        # only the even-numbered documents carry the document type
        doc_type = None if i % 2 else self.dt
        document = Document.objects.create(
            title=f"sample test {i}",
            checksum=f"checksum{i}",
            correspondent=self.c,
            original_filename=f"sample_{i}.pdf",
            document_type=doc_type,
        )
        document.tags.set([self.t1])
        document.save()

    remaining = prefilter_documents_by_workflowtrigger(
        Document.objects.all(),
        scheduled_trigger,
    )
    self.assertEqual(remaining.count(), 5)
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,