Performance: pre-filter document list in scheduled workflow checks (#10031)

This commit is contained in:
shamoon 2025-06-03 14:47:29 -07:00 committed by GitHub
parent 31351c5f5c
commit 422bffe1a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 94 additions and 0 deletions

View File

@ -3,6 +3,7 @@ from __future__ import annotations
import logging import logging
import re import re
from fnmatch import fnmatch from fnmatch import fnmatch
from fnmatch import translate as fnmatch_translate
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
@ -18,6 +19,8 @@ from documents.models import WorkflowTrigger
from documents.permissions import get_objects_for_user_owner_aware from documents.permissions import get_objects_for_user_owner_aware
if TYPE_CHECKING: if TYPE_CHECKING:
from django.db.models import QuerySet
from documents.classifier import DocumentClassifier from documents.classifier import DocumentClassifier
logger = logging.getLogger("paperless.matching") logger = logging.getLogger("paperless.matching")
@ -389,6 +392,40 @@ def existing_document_matches_workflow(
return (trigger_matched, reason) return (trigger_matched, reason)
def prefilter_documents_by_workflowtrigger(
documents: QuerySet[Document],
trigger: WorkflowTrigger,
) -> QuerySet[Document]:
"""
To prevent scheduled workflows checking every document, we prefilter the
documents by the workflow trigger filters. This is done before e.g.
document_matches_workflow in run_workflows
"""
if trigger.filter_has_tags.all().count() > 0:
documents = documents.filter(
tags__in=trigger.filter_has_tags.all(),
).distinct()
if trigger.filter_has_correspondent is not None:
documents = documents.filter(
correspondent=trigger.filter_has_correspondent,
)
if trigger.filter_has_document_type is not None:
documents = documents.filter(
document_type=trigger.filter_has_document_type,
)
if trigger.filter_filename is not None and len(trigger.filter_filename) > 0:
# the true fnmatch will actually run later so we just want a loose filter here
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
regex = f"(?i){regex}"
documents = documents.filter(original_filename__regex=regex)
return documents
def document_matches_workflow( def document_matches_workflow(
document: ConsumableDocument | Document, document: ConsumableDocument | Document,
workflow: Workflow, workflow: Workflow,

View File

@ -33,6 +33,7 @@ from documents.data_models import DocumentMetadataOverrides
from documents.double_sided import CollatePlugin from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename from documents.file_handling import generate_unique_filename
from documents.matching import prefilter_documents_by_workflowtrigger
from documents.models import Correspondent from documents.models import Correspondent
from documents.models import CustomFieldInstance from documents.models import CustomFieldInstance
from documents.models import Document from documents.models import Document
@ -473,6 +474,12 @@ def check_scheduled_workflows():
documents = Document.objects.filter(id__in=matched_ids) documents = Document.objects.filter(id__in=matched_ids)
if documents.count() > 0:
documents = prefilter_documents_by_workflowtrigger(
documents,
trigger,
)
if documents.count() > 0: if documents.count() > 0:
logger.debug( logger.debug(
f"Found {documents.count()} documents for trigger {trigger}", f"Found {documents.count()} documents for trigger {trigger}",

View File

@ -25,6 +25,7 @@ from documents import tasks
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource from documents.data_models import DocumentSource
from documents.matching import document_matches_workflow from documents.matching import document_matches_workflow
from documents.matching import prefilter_documents_by_workflowtrigger
from documents.models import Correspondent from documents.models import Correspondent
from documents.models import CustomField from documents.models import CustomField
from documents.models import CustomFieldInstance from documents.models import CustomFieldInstance
@ -1711,6 +1712,55 @@ class TestWorkflows(
doc2.refresh_from_db() doc2.refresh_from_db()
self.assertIsNone(doc2.owner) # has not triggered yet self.assertIsNone(doc2.owner) # has not triggered yet
def test_workflow_scheduled_filters_queryset(self):
"""
GIVEN:
- Existing workflow with scheduled trigger
WHEN:
- Workflows run and matching documents are found
THEN:
- prefilter_documents_by_workflowtrigger appropriately filters
"""
trigger = WorkflowTrigger.objects.create(
type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
schedule_offset_days=-7,
schedule_date_field=WorkflowTrigger.ScheduleDateField.CREATED,
filter_filename="*sample*",
filter_has_document_type=self.dt,
filter_has_correspondent=self.c,
)
trigger.filter_has_tags.set([self.t1])
trigger.save()
action = WorkflowAction.objects.create(
assign_owner=self.user2,
)
w = Workflow.objects.create(
name="Workflow 1",
order=0,
)
w.triggers.add(trigger)
w.actions.add(action)
w.save()
# create 10 docs with half having the document type
for i in range(10):
doc = Document.objects.create(
title=f"sample test {i}",
checksum=f"checksum{i}",
correspondent=self.c,
original_filename=f"sample_{i}.pdf",
document_type=self.dt if i % 2 == 0 else None,
)
doc.tags.set([self.t1])
doc.save()
documents = Document.objects.all()
filtered_docs = prefilter_documents_by_workflowtrigger(
documents,
trigger,
)
self.assertEqual(filtered_docs.count(), 5)
def test_workflow_enabled_disabled(self): def test_workflow_enabled_disabled(self):
trigger = WorkflowTrigger.objects.create( trigger = WorkflowTrigger.objects.create(
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED, type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,