Compare commits


3 Commits

Author    SHA1        Message                                    Date
shamoon   6f4cdb3875  A ChatGPT script for testing               2025-12-15 14:31:59 -08:00
shamoon   02a63144cf  Optimize tag children retrieval            2025-12-15 14:31:58 -08:00
shamoon   917e3f3c60  Run Tag tree updates once per transaction  2025-12-15 14:31:58 -08:00
9 changed files with 282 additions and 82 deletions

View File

@@ -275,12 +275,8 @@ jobs:
   tests-frontend-e2e:
     name: "Frontend E2E Tests (Node ${{ matrix.node-version }} - ${{ matrix.shard-index }}/${{ matrix.shard-count }})"
     runs-on: ubuntu-24.04
-    container: mcr.microsoft.com/playwright:v1.57.0-noble
     needs:
       - install-frontend-dependencies
-    env:
-      PLAYWRIGHT_BROWSERS_PATH: /ms-playwright
-      PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: 1
     strategy:
       fail-fast: false
       matrix:
@@ -309,8 +305,19 @@ jobs:
           key: ${{ runner.os }}-frontenddeps-${{ hashFiles('src-ui/pnpm-lock.yaml') }}
       - name: Re-link Angular cli
         run: cd src-ui && pnpm link @angular/cli
+      - name: Cache Playwright browsers
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: ${{ runner.os }}-playwright-${{ hashFiles('src-ui/pnpm-lock.yaml') }}
+          restore-keys: |
+            ${{ runner.os }}-playwright-
+      - name: Install Playwright system dependencies
+        run: npx playwright install-deps
       - name: Install dependencies
         run: cd src-ui && pnpm install --no-frozen-lockfile
+      - name: Install Playwright
+        run: cd src-ui && pnpm exec playwright install
       - name: Run Playwright e2e tests
         run: cd src-ui && pnpm exec playwright test --shard ${{ matrix.shard-index }}/${{ matrix.shard-count }}
   frontend-bundle-analysis:

scripts/tag_perf_probe.py (new file, 139 lines)
View File

@@ -0,0 +1,139 @@
# noqa: INP001
"""
Ad-hoc script to gauge Tag + treenode performance locally.

It bootstraps a fresh SQLite DB in a temp folder (or PAPERLESS_DATA_DIR),
uses locmem cache/redis to avoid external services, creates synthetic tags,
and measures:

- creation time
- query count and wall time for the Tag list view

Usage:

    PAPERLESS_DEBUG=1 PAPERLESS_REDIS=locmem:// PYTHONPATH=src \
    PAPERLESS_DATA_DIR=/tmp/paperless-tags-probe \
    .venv/bin/python scripts/tag_perf_probe.py
"""

import os
import sys
import time
from collections.abc import Iterable
from contextlib import contextmanager

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
os.environ.setdefault("PAPERLESS_DEBUG", "1")
os.environ.setdefault("PAPERLESS_REDIS", "locmem://")
os.environ.setdefault("PYTHONPATH", "src")

import django

django.setup()

from django.contrib.auth import get_user_model  # noqa: E402
from django.core.management import call_command  # noqa: E402
from django.db import connection  # noqa: E402
from django.test.client import RequestFactory  # noqa: E402
from rest_framework.test import force_authenticate  # noqa: E402
from treenode.signals import no_signals  # noqa: E402

from documents.models import Tag  # noqa: E402
from documents.views import TagViewSet  # noqa: E402

User = get_user_model()


@contextmanager
def count_queries():
    total = 0

    def wrapper(execute, sql, params, many, context):
        nonlocal total
        total += 1
        return execute(sql, params, many, context)

    with connection.execute_wrapper(wrapper):
        yield lambda: total


def measure_list(tag_count: int, user) -> tuple[int, float]:
    """Render Tag list with page_size=tag_count and return (queries, seconds)."""
    rf = RequestFactory()
    view = TagViewSet.as_view({"get": "list"})
    request = rf.get("/api/tags/", {"page_size": tag_count})
    force_authenticate(request, user=user)
    with count_queries() as get_count:
        start = time.perf_counter()
        response = view(request)
        response.render()
        elapsed = time.perf_counter() - start
        total_queries = get_count()
    return total_queries, elapsed


def bulk_create_tags(count: int, parents: Iterable[Tag] | None = None) -> None:
    """Create tags; when parents provided, create one child per parent."""
    if parents is None:
        Tag.objects.bulk_create([Tag(name=f"Flat {i}") for i in range(count)])
        return
    children = []
    for p in parents:
        children.append(Tag(name=f"Child {p.id}", tn_parent=p))
    Tag.objects.bulk_create(children)


def run():
    # Ensure tables exist when pointing at a fresh DATA_DIR.
    call_command("migrate", interactive=False, verbosity=0)
    user, _ = User.objects.get_or_create(
        username="admin",
        defaults={"is_superuser": True, "is_staff": True},
    )

    # Flat scenario
    Tag.objects.all().delete()
    start = time.perf_counter()
    bulk_create_tags(200)
    flat_create = time.perf_counter() - start
    q, t = measure_list(tag_count=200, user=user)
    print(f"Flat create 200 -> {flat_create:.2f}s; list -> {q} queries, {t:.2f}s")  # noqa: T201

    # Nested scenario (parents + 2 children each => 600 total)
    Tag.objects.all().delete()
    start = time.perf_counter()
    with no_signals():  # avoid per-save tree rebuild; rebuild once
        parents = Tag.objects.bulk_create([Tag(name=f"Parent {i}") for i in range(200)])
        children = []
        for p in parents:
            children.extend(
                Tag(name=f"Child {p.id}-{j}", tn_parent=p) for j in range(2)
            )
        Tag.objects.bulk_create(children)
    Tag.update_tree()
    nested_create = time.perf_counter() - start
    q, t = measure_list(tag_count=600, user=user)
    print(f"Nested create 600 -> {nested_create:.2f}s; list -> {q} queries, {t:.2f}s")  # noqa: T201

    # Larger nested scenario (1 child per parent, 3000 total)
    Tag.objects.all().delete()
    start = time.perf_counter()
    with no_signals():
        parents = Tag.objects.bulk_create(
            [Tag(name=f"Parent {i}") for i in range(1500)],
        )
        bulk_create_tags(0, parents=parents)
    Tag.update_tree()
    big_create = time.perf_counter() - start
    q, t = measure_list(tag_count=3000, user=user)
    print(f"Nested create 3000 -> {big_create:.2f}s; list -> {q} queries, {t:.2f}s")  # noqa: T201


if __name__ == "__main__":
    if "runserver" in sys.argv:
        print("Run directly: .venv/bin/python scripts/tag_perf_probe.py")  # noqa: T201
        sys.exit(1)
    run()

View File

@@ -1,5 +1,9 @@
 from django.apps import AppConfig
+from django.db.models.signals import post_delete
+from django.db.models.signals import post_save
 from django.utils.translation import gettext_lazy as _
+from treenode.signals import post_delete_treenode
+from treenode.signals import post_save_treenode


 class DocumentsConfig(AppConfig):
@@ -8,12 +12,14 @@ class DocumentsConfig(AppConfig):
     verbose_name = _("Documents")

     def ready(self):
+        from documents.models import Tag
         from documents.signals import document_consumption_finished
         from documents.signals import document_updated
         from documents.signals.handlers import add_inbox_tags
         from documents.signals.handlers import add_to_index
         from documents.signals.handlers import run_workflows_added
         from documents.signals.handlers import run_workflows_updated
+        from documents.signals.handlers import schedule_tag_tree_update
         from documents.signals.handlers import set_correspondent
         from documents.signals.handlers import set_document_type
         from documents.signals.handlers import set_storage_path
@@ -28,6 +34,29 @@ class DocumentsConfig(AppConfig):
         document_consumption_finished.connect(run_workflows_added)
         document_updated.connect(run_workflows_updated)

+        # treenode updates the entire tree on every save/delete via hooks
+        # so disconnect for Tags and run once-per-transaction.
+        post_save.disconnect(
+            post_save_treenode,
+            sender=Tag,
+            dispatch_uid="post_save_treenode",
+        )
+        post_delete.disconnect(
+            post_delete_treenode,
+            sender=Tag,
+            dispatch_uid="post_delete_treenode",
+        )
+        post_save.connect(
+            schedule_tag_tree_update,
+            sender=Tag,
+            dispatch_uid="paperless_tag_mark_dirty_save",
+        )
+        post_delete.connect(
+            schedule_tag_tree_update,
+            sender=Tag,
+            dispatch_uid="paperless_tag_mark_dirty_delete",
+        )
+
         import documents.schema  # noqa: F401

         AppConfig.ready(self)

View File

@@ -10,7 +10,6 @@ from datetime import time
 from datetime import timedelta
 from datetime import timezone
 from shutil import rmtree
-from time import sleep
 from typing import TYPE_CHECKING
 from typing import Literal
@@ -33,7 +32,6 @@ from whoosh.highlight import HtmlFormatter
 from whoosh.idsets import BitSet
 from whoosh.idsets import DocIdSet
 from whoosh.index import FileIndex
-from whoosh.index import LockError
 from whoosh.index import create_in
 from whoosh.index import exists_in
 from whoosh.index import open_dir
@@ -99,33 +97,11 @@ def get_schema() -> Schema:
 def open_index(*, recreate=False) -> FileIndex:
-    transient_exceptions = (FileNotFoundError, LockError)
-    max_retries = 3
-    retry_delay = 0.1
-
-    for attempt in range(max_retries + 1):
-        try:
-            if exists_in(settings.INDEX_DIR) and not recreate:
-                return open_dir(settings.INDEX_DIR, schema=get_schema())
-            break
-        except transient_exceptions as exc:
-            is_last_attempt = attempt == max_retries or recreate
-            if is_last_attempt:
-                logger.exception(
-                    "Error while opening the index after retries, recreating.",
-                )
-                break
-            logger.warning(
-                "Transient error while opening the index (attempt %s/%s): %s. Retrying.",
-                attempt + 1,
-                max_retries + 1,
-                exc,
-            )
-            sleep(retry_delay)
-        except Exception:
-            logger.exception("Error while opening the index, recreating.")
-            break
+    try:
+        if exists_in(settings.INDEX_DIR) and not recreate:
+            return open_dir(settings.INDEX_DIR, schema=get_schema())
+    except Exception:
+        logger.exception("Error while opening the index, recreating.")

     # create_in doesn't handle corrupted indexes very well, remove the directory entirely first
     if settings.INDEX_DIR.is_dir():

View File

@@ -578,6 +578,10 @@ class TagSerializer(MatchingModelSerializer, OwnedObjectSerializer):
         ),
     )
     def get_children(self, obj):
+        children_map = self.context.get("children_map")
+        if children_map is not None:
+            children = children_map.get(obj.pk, [])
+        else:
             filter_q = self.context.get("document_count_filter")
             request = self.context.get("request")
             if filter_q is None:
@@ -585,7 +589,7 @@ class TagSerializer(MatchingModelSerializer, OwnedObjectSerializer):
                 filter_q = get_document_count_filter_for_user(user)
                 self.context["document_count_filter"] = filter_q
-            children_queryset = (
+            children = (
                 obj.get_children_queryset()
                 .select_related("owner")
                 .annotate(document_count=Count("documents", filter=filter_q))
             )
@@ -593,15 +597,15 @@ class TagSerializer(MatchingModelSerializer, OwnedObjectSerializer):
             view = self.context.get("view")
             ordering = (
-                OrderingFilter().get_ordering(request, children_queryset, view)
+                OrderingFilter().get_ordering(request, children, view)
                 if request and view
                 else None
             )
             ordering = ordering or (Lower("name"),)
-            children_queryset = children_queryset.order_by(*ordering)
+            children = children.order_by(*ordering)

         serializer = TagSerializer(
-            children_queryset,
+            children,
             many=True,
             user=self.user,
             full_perms=self.full_perms,
View File

@@ -19,6 +19,7 @@ from django.db import DatabaseError
 from django.db import close_old_connections
 from django.db import connections
 from django.db import models
+from django.db import transaction
 from django.db.models import Q
 from django.dispatch import receiver
 from django.utils import timezone
@@ -60,6 +61,8 @@ if TYPE_CHECKING:

 logger = logging.getLogger("paperless.handlers")

+_tag_tree_update_scheduled = False
+

 def add_inbox_tags(sender, document: Document, logging_group=None, **kwargs):
     if document.owner is not None:
@@ -944,3 +947,26 @@ def close_connection_pool_on_worker_init(**kwargs):
     for conn in connections.all(initialized_only=True):
         if conn.alias == "default" and hasattr(conn, "pool") and conn.pool:
             conn.close_pool()
+
+
+def schedule_tag_tree_update(**_kwargs):
+    """
+    Schedule a single Tag.update_tree() at transaction commit.
+
+    Treenode's default post_save hooks rebuild the entire tree on every save,
+    which is very slow for large tag sets so collapse to one update per
+    transaction.
+    """
+    global _tag_tree_update_scheduled
+    if _tag_tree_update_scheduled:
+        return
+    _tag_tree_update_scheduled = True
+
+    def _run():
+        global _tag_tree_update_scheduled
+        try:
+            Tag.update_tree()
+        finally:
+            _tag_tree_update_scheduled = False
+
+    transaction.on_commit(_run)
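
For context, a minimal sketch of what the deferred handler buys (illustrative only, assuming the receivers are rewired as in the apps.py hunk above): every Tag save inside a transaction fires schedule_tag_tree_update, but only the first call schedules the rebuild, and Tag.update_tree() then runs a single time at commit.

# Illustrative only: bulk tag creation with the once-per-transaction rebuild.
from django.db import transaction

from documents.models import Tag

with transaction.atomic():
    for i in range(1000):
        # post_save fires schedule_tag_tree_update for every create, but the
        # _tag_tree_update_scheduled flag makes all but the first call return early.
        Tag.objects.create(name=f"Bulk tag {i}")
# transaction.on_commit runs _run exactly once here, calling Tag.update_tree()
# a single time instead of once per created tag.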

View File

@@ -1,7 +1,6 @@
 from datetime import datetime
 from unittest import mock

-from django.conf import settings
 from django.contrib.auth.models import User
 from django.test import SimpleTestCase
 from django.test import TestCase
@@ -252,31 +251,3 @@ class TestRewriteNaturalDateKeywords(SimpleTestCase):
         result = self._rewrite_with_now("added:today", fixed_now)
         # Should convert to UTC properly
         self.assertIn("added:[20250719", result)
-
-
-class TestIndexResilience(DirectoriesMixin, SimpleTestCase):
-    def test_transient_missing_segment_does_not_force_recreate(self):
-        file_marker = settings.INDEX_DIR / "file_marker.txt"
-        file_marker.write_text("keep")
-
-        expected_index = object()
-        with (
-            mock.patch("documents.index.exists_in", return_value=True),
-            mock.patch(
-                "documents.index.open_dir",
-                side_effect=[FileNotFoundError("missing"), expected_index],
-            ) as mock_open_dir,
-            mock.patch(
-                "documents.index.create_in",
-            ) as mock_create_in,
-            mock.patch(
-                "documents.index.rmtree",
-            ) as mock_rmtree,
-        ):
-            ix = index.open_index()
-
-        self.assertIs(ix, expected_index)
-        self.assertGreaterEqual(mock_open_dir.call_count, 2)
-        mock_rmtree.assert_not_called()
-        mock_create_in.assert_not_called()
-        self.assertEqual(file_marker.read_text(), "keep")

View File

@@ -250,3 +250,16 @@ class TestTagHierarchy(APITestCase):
             row for row in response.data["results"] if row["id"] == self.parent.pk
         )
         assert any(child["id"] == self.child.pk for child in parent_entry["children"])
+
+    def test_tag_tree_deferred_update_runs_on_commit(self):
+        from django.db import transaction
+
+        # Create tags inside an explicit transaction and commit.
+        with transaction.atomic():
+            parent = Tag.objects.create(name="Parent 2")
+            child = Tag.objects.create(name="Child 2", tn_parent=parent)
+        # After commit, tn_* fields should be populated.
+        parent.refresh_from_db()
+        child.refresh_from_db()
+        assert parent.tn_children_count == 1
+        assert child.tn_ancestors_count == 1

View File

@@ -448,8 +448,43 @@ class TagViewSet(ModelViewSet, PermissionsAwareDocumentCountMixin):
     def get_serializer_context(self):
         context = super().get_serializer_context()
         context["document_count_filter"] = self.get_document_count_filter()
+        if hasattr(self, "_children_map"):
+            context["children_map"] = self._children_map
         return context

+    def list(self, request, *args, **kwargs):
+        """
+        Build a children map once to avoid per-parent queries in the serializer.
+        """
+        queryset = self.filter_queryset(self.get_queryset())
+        ordering = OrderingFilter().get_ordering(request, queryset, self) or (
+            Lower("name"),
+        )
+        queryset = queryset.order_by(*ordering)
+        all_tags = list(queryset)
+        descendant_pks = {pk for tag in all_tags for pk in tag.get_descendants_pks()}
+
+        if descendant_pks:
+            filter_q = self.get_document_count_filter()
+            children_source = (
+                Tag.objects.filter(pk__in=descendant_pks | {t.pk for t in all_tags})
+                .select_related("owner")
+                .annotate(document_count=Count("documents", filter=filter_q))
+                .order_by(*ordering)
+            )
+        else:
+            children_source = all_tags
+
+        children_map = {}
+        for tag in children_source:
+            children_map.setdefault(tag.tn_parent_id, []).append(tag)
+        self._children_map = children_map
+
+        page = self.paginate_queryset(queryset)
+        serializer = self.get_serializer(page, many=True)
+        return self.get_paginated_response(serializer.data)
+
     def perform_update(self, serializer):
         old_parent = self.get_object().get_parent()
         tag = serializer.save()
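
As a rough, standalone illustration (not part of the diff), the children_map built in list() groups tags by tn_parent_id, so TagSerializer.get_children() can answer with a dictionary lookup instead of one query per parent:

# Self-contained sketch with stand-in objects; FakeTag mimics only the Tag
# fields the grouping relies on (pk and tn_parent_id, None for root tags).
from dataclasses import dataclass


@dataclass
class FakeTag:
    pk: int
    name: str
    tn_parent_id: int | None


tags = [
    FakeTag(1, "Work", None),
    FakeTag(2, "Invoices", 1),
    FakeTag(3, "Receipts", 1),
    FakeTag(4, "2024", 2),
]

children_map = {}
for tag in tags:
    children_map.setdefault(tag.tn_parent_id, []).append(tag)

# children_map[None] holds the roots, children_map[1] the direct children of
# "Work", and a serializer can resolve children via children_map.get(obj.pk, []).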