Enhancement: try filtering large querysets for scheduled workflow

This commit is contained in:
shamoon 2025-05-24 08:24:34 -07:00
parent eb07876657
commit 882b15378a
No known key found for this signature in database
2 changed files with 43 additions and 0 deletions

View File

@ -18,6 +18,8 @@ from documents.models import WorkflowTrigger
from documents.permissions import get_objects_for_user_owner_aware
if TYPE_CHECKING:
from django.db.models import QuerySet
from documents.classifier import DocumentClassifier
logger = logging.getLogger("paperless.matching")
@ -389,6 +391,39 @@ def existing_document_matches_workflow(
return (trigger_matched, reason)
def filter_documents_by_workflowtrigger_criteria(
    documents: QuerySet[Document],
    trigger: WorkflowTrigger,
) -> QuerySet[Document]:
    """
    Coarsely narrows the documents queryset using the filters set on the trigger.

    Only filters that are actually configured on ``trigger`` are applied:

    - ``filter_has_tags``: keep documents carrying at least one of the tags
    - ``filter_has_correspondent`` / ``filter_has_document_type``: exact match
    - ``filter_filename``: keep documents whose ``original_filename`` contains
      (case-insensitively) every literal fragment of the pattern

    Returns the narrowed queryset.

    NOTE(review): this assumes ``filter_filename`` is a shell-style glob
    (e.g. ``*.pdf``) and that callers still run the per-document workflow
    match afterwards, so the filename step only has to be a loose
    over-approximation - confirm against the caller.
    """
    required_tags = trigger.filter_has_tags.all()
    if required_tags.exists():
        # An untagged document cannot satisfy a "has tags" filter, so only
        # documents with at least one of the tags are kept. distinct() collapses
        # the duplicate rows produced when a document carries several of them.
        documents = documents.filter(tags__in=required_tags).distinct()

    if trigger.filter_has_correspondent is not None:
        documents = documents.filter(
            correspondent=trigger.filter_has_correspondent,
        )

    if trigger.filter_has_document_type is not None:
        documents = documents.filter(
            document_type=trigger.filter_has_document_type,
        )

    filename_pattern = trigger.filter_filename
    # Searching for the raw pattern would look for a literal "*" or "?" in the
    # filename and wrongly drop documents the glob matches. Instead, require only
    # the literal pieces between wildcards. Patterns using "[...]" character
    # classes are not prefiltered at all, since their pieces are not literals.
    if filename_pattern and "[" not in filename_pattern:
        for fragment in filename_pattern.replace("?", "*").split("*"):
            if fragment:
                documents = documents.filter(
                    original_filename__icontains=fragment,
                )

    return documents
def document_matches_workflow(
document: ConsumableDocument | Document,
workflow: Workflow,

View File

@ -32,6 +32,7 @@ from documents.data_models import DocumentMetadataOverrides
from documents.double_sided import CollatePlugin
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.matching import filter_documents_by_workflowtrigger_criteria
from documents.models import Correspondent
from documents.models import CustomFieldInstance
from documents.models import Document
@ -459,6 +460,13 @@ def check_scheduled_workflows():
documents = Document.objects.filter(id__in=matched_ids)
# Workflows initially matched against one document at a time, so speed things up
# by filtering documents by the trigger criteria
documents = filter_documents_by_workflowtrigger_criteria(
documents,
trigger,
)
if documents.count() > 0:
logger.debug(
f"Found {documents.count()} documents for trigger {trigger}",