mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-06-06 14:07:26 -05:00
Performance: pre-filter document list in scheduled workflow checks (#10031)
This commit is contained in:
parent
31351c5f5c
commit
422bffe1a6
@ -3,6 +3,7 @@ from __future__ import annotations
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from fnmatch import fnmatch
|
from fnmatch import fnmatch
|
||||||
|
from fnmatch import translate as fnmatch_translate
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
from documents.data_models import ConsumableDocument
|
from documents.data_models import ConsumableDocument
|
||||||
@ -18,6 +19,8 @@ from documents.models import WorkflowTrigger
|
|||||||
from documents.permissions import get_objects_for_user_owner_aware
|
from documents.permissions import get_objects_for_user_owner_aware
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
from documents.classifier import DocumentClassifier
|
from documents.classifier import DocumentClassifier
|
||||||
|
|
||||||
logger = logging.getLogger("paperless.matching")
|
logger = logging.getLogger("paperless.matching")
|
||||||
@ -389,6 +392,40 @@ def existing_document_matches_workflow(
|
|||||||
return (trigger_matched, reason)
|
return (trigger_matched, reason)
|
||||||
|
|
||||||
|
|
||||||
|
def prefilter_documents_by_workflowtrigger(
    documents: QuerySet[Document],
    trigger: WorkflowTrigger,
) -> QuerySet[Document]:
    """
    Narrow a document queryset to the candidates a workflow trigger could match.

    To prevent scheduled workflows checking every document, the documents are
    prefiltered in the database by the trigger's filters. This is only a loose
    first pass: it is done before e.g. document_matches_workflow in
    run_workflows, which still performs the exact per-document check.

    Filters are applied only when they are set on the trigger:
      - filter_has_tags: keep documents carrying at least one of the tags
      - filter_has_correspondent: keep documents with that correspondent
      - filter_has_document_type: keep documents with that document type
      - filter_filename: keep documents whose original_filename matches the
        glob pattern, ignoring case

    Args:
        documents: the queryset of documents to narrow down
        trigger: the workflow trigger whose filters are applied

    Returns:
        The (possibly further filtered) queryset; nothing is evaluated here.
    """

    required_tags = trigger.filter_has_tags.all()
    # exists() asks the database a yes/no question instead of counting rows
    if required_tags.exists():
        # joining through the tags relation can yield one row per matching
        # tag, hence the distinct()
        documents = documents.filter(tags__in=required_tags).distinct()

    if trigger.filter_has_correspondent is not None:
        documents = documents.filter(
            correspondent=trigger.filter_has_correspondent,
        )

    if trigger.filter_has_document_type is not None:
        documents = documents.filter(
            document_type=trigger.filter_has_document_type,
        )

    # a None or empty filename filter means "no filename restriction"
    if trigger.filter_filename:
        # the true fnmatch will actually run later so we just want a loose filter here
        # (drop a leading "^" / trailing "$" anchor if the translation has one)
        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
        # iregex lets each database backend apply its own case-insensitive
        # regex operator rather than relying on an inline "(?i)" flag
        documents = documents.filter(original_filename__iregex=regex)

    return documents
|
||||||
|
|
||||||
|
|
||||||
def document_matches_workflow(
|
def document_matches_workflow(
|
||||||
document: ConsumableDocument | Document,
|
document: ConsumableDocument | Document,
|
||||||
workflow: Workflow,
|
workflow: Workflow,
|
||||||
|
@ -33,6 +33,7 @@ from documents.data_models import DocumentMetadataOverrides
|
|||||||
from documents.double_sided import CollatePlugin
|
from documents.double_sided import CollatePlugin
|
||||||
from documents.file_handling import create_source_path_directory
|
from documents.file_handling import create_source_path_directory
|
||||||
from documents.file_handling import generate_unique_filename
|
from documents.file_handling import generate_unique_filename
|
||||||
|
from documents.matching import prefilter_documents_by_workflowtrigger
|
||||||
from documents.models import Correspondent
|
from documents.models import Correspondent
|
||||||
from documents.models import CustomFieldInstance
|
from documents.models import CustomFieldInstance
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
@ -473,6 +474,12 @@ def check_scheduled_workflows():
|
|||||||
|
|
||||||
documents = Document.objects.filter(id__in=matched_ids)
|
documents = Document.objects.filter(id__in=matched_ids)
|
||||||
|
|
||||||
|
if documents.count() > 0:
|
||||||
|
documents = prefilter_documents_by_workflowtrigger(
|
||||||
|
documents,
|
||||||
|
trigger,
|
||||||
|
)
|
||||||
|
|
||||||
if documents.count() > 0:
|
if documents.count() > 0:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Found {documents.count()} documents for trigger {trigger}",
|
f"Found {documents.count()} documents for trigger {trigger}",
|
||||||
|
@ -25,6 +25,7 @@ from documents import tasks
|
|||||||
from documents.data_models import ConsumableDocument
|
from documents.data_models import ConsumableDocument
|
||||||
from documents.data_models import DocumentSource
|
from documents.data_models import DocumentSource
|
||||||
from documents.matching import document_matches_workflow
|
from documents.matching import document_matches_workflow
|
||||||
|
from documents.matching import prefilter_documents_by_workflowtrigger
|
||||||
from documents.models import Correspondent
|
from documents.models import Correspondent
|
||||||
from documents.models import CustomField
|
from documents.models import CustomField
|
||||||
from documents.models import CustomFieldInstance
|
from documents.models import CustomFieldInstance
|
||||||
@ -1711,6 +1712,55 @@ class TestWorkflows(
|
|||||||
doc2.refresh_from_db()
|
doc2.refresh_from_db()
|
||||||
self.assertIsNone(doc2.owner) # has not triggered yet
|
self.assertIsNone(doc2.owner) # has not triggered yet
|
||||||
|
|
||||||
|
def test_workflow_scheduled_filters_queryset(self):
    """
    GIVEN:
        - Existing workflow with scheduled trigger that filters on
          filename, document type, correspondent and tag
        - Ten documents, of which every second one has the document type
    WHEN:
        - The full document list is prefiltered by the trigger
    THEN:
        - prefilter_documents_by_workflowtrigger keeps only the five
          documents that satisfy all of the trigger's filters
    """
    scheduled_trigger = WorkflowTrigger.objects.create(
        type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
        schedule_offset_days=-7,
        schedule_date_field=WorkflowTrigger.ScheduleDateField.CREATED,
        filter_filename="*sample*",
        filter_has_document_type=self.dt,
        filter_has_correspondent=self.c,
    )
    scheduled_trigger.filter_has_tags.set([self.t1])
    scheduled_trigger.save()

    owner_action = WorkflowAction.objects.create(assign_owner=self.user2)

    workflow = Workflow.objects.create(name="Workflow 1", order=0)
    workflow.triggers.add(scheduled_trigger)
    workflow.actions.add(owner_action)
    workflow.save()

    # ten documents; only the even-numbered ones get the document type
    for index in range(10):
        has_doc_type = index % 2 == 0
        created_doc = Document.objects.create(
            title=f"sample test {index}",
            checksum=f"checksum{index}",
            correspondent=self.c,
            original_filename=f"sample_{index}.pdf",
            document_type=self.dt if has_doc_type else None,
        )
        created_doc.tags.set([self.t1])
        created_doc.save()

    remaining = prefilter_documents_by_workflowtrigger(
        Document.objects.all(),
        scheduled_trigger,
    )
    self.assertEqual(remaining.count(), 5)
|
||||||
|
|
||||||
def test_workflow_enabled_disabled(self):
|
def test_workflow_enabled_disabled(self):
|
||||||
trigger = WorkflowTrigger.objects.create(
|
trigger = WorkflowTrigger.objects.create(
|
||||||
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user