Optimize tag/custom-field counts with subqueries

This commit is contained in:
shamoon
2026-01-30 00:12:44 -08:00
parent 9962f3d0a3
commit 14440b9bc8
2 changed files with 130 additions and 41 deletions

View File

@@ -2,10 +2,17 @@ from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission from django.contrib.auth.models import Permission
from django.contrib.auth.models import User from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType from django.contrib.contenttypes.models import ContentType
from django.db.models import Count
from django.db.models import IntegerField
from django.db.models import OuterRef
from django.db.models import Q from django.db.models import Q
from django.db.models import QuerySet from django.db.models import QuerySet
from django.db.models import Subquery
from django.db.models.functions import Cast
from django.db.models.functions import Coalesce
from guardian.core import ObjectPermissionChecker from guardian.core import ObjectPermissionChecker
from guardian.models import GroupObjectPermission from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from guardian.shortcuts import assign_perm from guardian.shortcuts import assign_perm
from guardian.shortcuts import get_objects_for_user from guardian.shortcuts import get_objects_for_user
from guardian.shortcuts import get_users_with_perms from guardian.shortcuts import get_users_with_perms
@@ -129,24 +136,90 @@ def set_permissions_for_object(permissions: dict, object, *, merge: bool = False
) )
def get_document_count_filter_for_user(user): def _permitted_document_ids(user):
""" """
Return the Q object used to filter document counts for the given user. Return a queryset of document IDs the user may view, limited to non-deleted
documents. This intentionally avoids ``get_objects_for_user`` to keep the
subquery small and index-friendly.
""" """
base_docs = Document.objects.filter(deleted_at__isnull=True)
if user is None or not getattr(user, "is_authenticated", False): if user is None or not getattr(user, "is_authenticated", False):
return Q(documents__deleted_at__isnull=True, documents__owner__isnull=True) return base_docs.filter(owner__isnull=True).values_list("id", flat=True)
if getattr(user, "is_superuser", False): if getattr(user, "is_superuser", False):
return Q(documents__deleted_at__isnull=True) return base_docs.values_list("id", flat=True)
return Q(
documents__deleted_at__isnull=True, document_ct = ContentType.objects.get_for_model(Document)
documents__id__in=get_objects_for_user_owner_aware( perm_filter = {
user, "permission__codename": "view_document",
"documents.view_document", "permission__content_type": document_ct,
Document, }
).values_list("id", flat=True),
user_perm_docs = (
UserObjectPermission.objects.filter(user=user, **perm_filter)
.annotate(object_pk_int=Cast("object_pk", IntegerField()))
.values_list("object_pk_int", flat=True)
) )
group_perm_docs = (
GroupObjectPermission.objects.filter(group__user=user, **perm_filter)
.annotate(object_pk_int=Cast("object_pk", IntegerField()))
.values_list("object_pk_int", flat=True)
)
permitted_documents = user_perm_docs.union(group_perm_docs)
return base_docs.filter(
Q(owner=user) | Q(owner__isnull=True) | Q(id__in=permitted_documents),
).values_list("id", flat=True)
def get_document_count_filter_for_user(user, *, relation_prefix: str = "documents"):
    """
    Build the Q object that limits document counts to what ``user`` may see.

    The restriction is a single ``<relation_prefix>__id__in`` lookup whose
    value is the queryset produced by ``_permitted_document_ids(user)``, so
    the permission logic lives in one place instead of being spelled out as
    separate conditions here.

    Args:
        user: the user the counts are computed for; handed unchanged to
            ``_permitted_document_ids``
        relation_prefix: lookup path from the counted model to its documents
            (keyword-only, defaults to ``"documents"``)

    Returns:
        A ``Q`` object containing exactly one ``id__in`` condition.
    """
    lookup = {f"{relation_prefix}__id__in": _permitted_document_ids(user)}
    return Q(**lookup)
def annotate_document_count_for_related_queryset(
    queryset,
    through_model,
    source_field: str,
    target_field: str,
    user=None,
):
    """
    Add a permissions-aware ``document_count`` annotation to ``queryset``.

    The count is computed by a correlated subquery over ``through_model``:
    relation rows are matched to the outer row through ``source_field`` and
    kept only when ``target_field`` is among the IDs returned by
    ``_permitted_document_ids(user)``.

    Args:
        queryset: base queryset to annotate (must contain pk)
        through_model: model representing the relation (e.g.,
            Document.tags.through or CustomFieldInstance)
        source_field: field on the relation pointing back to queryset pk
        target_field: field on the relation pointing to Document id
        user: the user for whom to filter permitted document ids

    Returns:
        ``queryset`` annotated with ``document_count``; the value is 0 when
        the subquery yields no row for an object.
    """
    visible_ids = _permitted_document_ids(user)
    relation_filter = {
        source_field: OuterRef("pk"),
        f"{target_field}__in": visible_ids,
    }
    # values() before annotate() groups the relation rows by the outer
    # object, so the subquery carries a single count column.
    per_object_counts = (
        through_model.objects.filter(**relation_filter)
        .values(source_field)
        .annotate(c=Count(target_field))
        .values("c")
    )
    # Objects with no matching relation rows produce an empty subquery
    # (NULL); Coalesce maps that to 0.
    document_count = Coalesce(Subquery(per_object_counts[:1]), 0)
    return queryset.annotate(document_count=document_count)
def get_objects_for_user_owner_aware(user, perms, Model) -> QuerySet: def get_objects_for_user_owner_aware(user, perms, Model) -> QuerySet:
objects_owned = Model.objects.filter(owner=user) objects_owned = Model.objects.filter(owner=user)

View File

@@ -32,7 +32,6 @@ from django.db.models import Count
from django.db.models import IntegerField from django.db.models import IntegerField
from django.db.models import Max from django.db.models import Max
from django.db.models import Model from django.db.models import Model
from django.db.models import Q
from django.db.models import Sum from django.db.models import Sum
from django.db.models import When from django.db.models import When
from django.db.models.functions import Length from django.db.models.functions import Length
@@ -128,6 +127,7 @@ from documents.matching import match_storage_paths
from documents.matching import match_tags from documents.matching import match_tags
from documents.models import Correspondent from documents.models import Correspondent
from documents.models import CustomField from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document from documents.models import Document
from documents.models import DocumentType from documents.models import DocumentType
from documents.models import Note from documents.models import Note
@@ -147,6 +147,7 @@ from documents.permissions import PaperlessAdminPermissions
from documents.permissions import PaperlessNotePermissions from documents.permissions import PaperlessNotePermissions
from documents.permissions import PaperlessObjectPermissions from documents.permissions import PaperlessObjectPermissions
from documents.permissions import ViewDocumentsPermissions from documents.permissions import ViewDocumentsPermissions
from documents.permissions import annotate_document_count_for_related_queryset
from documents.permissions import get_document_count_filter_for_user from documents.permissions import get_document_count_filter_for_user
from documents.permissions import get_objects_for_user_owner_aware from documents.permissions import get_objects_for_user_owner_aware
from documents.permissions import has_perms_owner_aware from documents.permissions import has_perms_owner_aware
@@ -429,6 +430,26 @@ class TagViewSet(ModelViewSet, PermissionsAwareDocumentCountMixin):
Lower("name"), Lower("name"),
) )
def _with_document_counts(self, queryset):
    """
    Return ``queryset`` annotated with a permissions-aware document_count.

    Delegates to ``annotate_document_count_for_related_queryset`` using the
    tag/document through table (``Document.tags.through``), correlating on
    ``tag_id`` and counting ``document_id``. The user is read from
    ``self.request`` and falls back to ``None`` when the request carries no
    ``user`` attribute.
    """
    requesting_user = getattr(self.request, "user", None)
    relation = {
        "through_model": Document.tags.through,
        "source_field": "tag_id",
        "target_field": "document_id",
    }
    return annotate_document_count_for_related_queryset(
        queryset,
        user=requesting_user,
        **relation,
    )
def get_queryset(self):
    """Return a fresh copy of the view's queryset with document counts added."""
    fresh_queryset = self.queryset.all()
    return self._with_document_counts(fresh_queryset)
def get_serializer_class(self, *args, **kwargs): def get_serializer_class(self, *args, **kwargs):
if int(self.request.version) == 1: if int(self.request.version) == 1:
return TagSerializerVersion1 return TagSerializerVersion1
@@ -466,12 +487,12 @@ class TagViewSet(ModelViewSet, PermissionsAwareDocumentCountMixin):
descendant_pks = {pk for tag in all_tags for pk in tag.get_descendants_pks()} descendant_pks = {pk for tag in all_tags for pk in tag.get_descendants_pks()}
if descendant_pks: if descendant_pks:
filter_q = self.get_document_count_filter()
children_source = list( children_source = list(
Tag.objects.filter(pk__in=descendant_pks | {t.pk for t in all_tags}) self._with_document_counts(
.select_related("owner") Tag.objects.filter(pk__in=descendant_pks | {t.pk for t in all_tags})
.annotate(document_count=Count("documents", filter=filter_q)) .select_related("owner")
.order_by(*ordering), .order_by(*ordering),
),
) )
else: else:
children_source = all_tags children_source = all_tags
@@ -2874,31 +2895,26 @@ class CustomFieldViewSet(ModelViewSet):
queryset = CustomField.objects.all().order_by("-created") queryset = CustomField.objects.all().order_by("-created")
def _with_document_counts(self, queryset):
    """
    Return ``queryset`` annotated with a permissions-aware document_count.

    Delegates to ``annotate_document_count_for_related_queryset`` with
    ``CustomFieldInstance`` as the relation model, correlating on
    ``field_id`` and counting ``document_id``. The user is read from
    ``self.request`` and falls back to ``None`` when the request carries no
    ``user`` attribute.
    """
    requesting_user = getattr(self.request, "user", None)
    relation = {
        "through_model": CustomFieldInstance,
        "source_field": "field_id",
        "target_field": "document_id",
    }
    return annotate_document_count_for_related_queryset(
        queryset,
        user=requesting_user,
        **relation,
    )
def get_queryset(self): def get_queryset(self):
filter = ( base_qs = super().get_queryset()
Q(fields__document__deleted_at__isnull=True) return self._with_document_counts(base_qs)
if self.request.user is None or self.request.user.is_superuser
else (
Q(
fields__document__deleted_at__isnull=True,
fields__document__id__in=get_objects_for_user_owner_aware(
self.request.user,
"documents.view_document",
Document,
).values_list("id", flat=True),
)
)
)
return (
super()
.get_queryset()
.annotate(
document_count=Count(
"fields",
filter=filter,
),
)
)
@extend_schema_view( @extend_schema_view(