some search index optimizations

This commit is contained in:
jonaswinkler 2021-02-15 13:26:36 +01:00
parent 56bd966c02
commit 8bf4241b16
6 changed files with 69 additions and 36 deletions

View File

@ -1,7 +1,5 @@
from django.contrib import admin from django.contrib import admin
from whoosh.writing import AsyncWriter
from . import index
from .models import Correspondent, Document, DocumentType, Tag, \ from .models import Correspondent, Document, DocumentType, Tag, \
SavedView, SavedViewFilterRule SavedView, SavedViewFilterRule
@ -84,17 +82,21 @@ class DocumentAdmin(admin.ModelAdmin):
created_.short_description = "Created" created_.short_description = "Created"
def delete_queryset(self, request, queryset): def delete_queryset(self, request, queryset):
ix = index.open_index() from documents import index
with AsyncWriter(ix) as writer:
with index.open_index_writer() as writer:
for o in queryset: for o in queryset:
index.remove_document(writer, o) index.remove_document(writer, o)
super(DocumentAdmin, self).delete_queryset(request, queryset) super(DocumentAdmin, self).delete_queryset(request, queryset)
def delete_model(self, request, obj): def delete_model(self, request, obj):
from documents import index
index.remove_document_from_index(obj) index.remove_document_from_index(obj)
super(DocumentAdmin, self).delete_model(request, obj) super(DocumentAdmin, self).delete_model(request, obj)
def save_model(self, request, obj, form, change): def save_model(self, request, obj, form, change):
from documents import index
index.add_or_update_document(obj) index.add_or_update_document(obj)
super(DocumentAdmin, self).save_model(request, obj, form, change) super(DocumentAdmin, self).save_model(request, obj, form, change)

View File

@ -2,9 +2,7 @@ import itertools
from django.db.models import Q from django.db.models import Q
from django_q.tasks import async_task from django_q.tasks import async_task
from whoosh.writing import AsyncWriter
from documents import index
from documents.models import Document, Correspondent, DocumentType from documents.models import Document, Correspondent, DocumentType
@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags):
def delete(doc_ids): def delete(doc_ids):
Document.objects.filter(id__in=doc_ids).delete() Document.objects.filter(id__in=doc_ids).delete()
ix = index.open_index() from documents import index
with AsyncWriter(ix) as writer:
with index.open_index_writer() as writer:
for id in doc_ids: for id in doc_ids:
index.remove_document_by_id(writer, id) index.remove_document_by_id(writer, id)

View File

@ -86,6 +86,22 @@ def open_index(recreate=False):
return create_in(settings.INDEX_DIR, get_schema()) return create_in(settings.INDEX_DIR, get_schema())
@contextmanager
def open_index_writer(ix=None, optimize=False):
    """Yield an ``AsyncWriter`` and finalize it when the ``with`` block ends.

    :param ix: index to write to; when ``None``, the index returned by
        ``open_index()`` is used.
    :param optimize: forwarded to ``writer.commit(optimize=...)`` when the
        block finishes without raising.

    If the body of the ``with`` block raises an ``Exception``, it is logged,
    the pending changes are cancelled and the exception is not propagated to
    the caller. If the block completes normally, the changes are committed.
    """
    # Compare against None explicitly so a caller-supplied index is never
    # swapped for a freshly opened one because of its truth value.
    writer = AsyncWriter(open_index() if ix is None else ix)

    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    except BaseException:
        # Interrupts and the like: discard the pending changes instead of
        # committing a half-finished batch, then let the signal propagate.
        writer.cancel()
        raise
    else:
        # Commit on success only. With the commit in a ``finally`` clause it
        # would also run right after ``cancel()`` on the error path.
        writer.commit(optimize=optimize)
def update_document(writer, doc): def update_document(writer, doc):
tags = ",".join([t.name for t in doc.tags.all()]) tags = ",".join([t.name for t in doc.tags.all()])
writer.update_document( writer.update_document(
@ -110,14 +126,12 @@ def remove_document_by_id(writer, doc_id):
def add_or_update_document(document): def add_or_update_document(document):
ix = open_index() with open_index_writer() as writer:
with AsyncWriter(ix) as writer:
update_document(writer, document) update_document(writer, document)
def remove_document_from_index(document): def remove_document_from_index(document):
ix = open_index() with open_index_writer() as writer:
with AsyncWriter(ix) as writer:
remove_document(writer, document) remove_document(writer, document)

View File

@ -11,7 +11,7 @@ from django.dispatch import receiver
from django.utils import timezone from django.utils import timezone
from filelock import FileLock from filelock import FileLock
from .. import index, matching from .. import matching
from ..file_handling import delete_empty_directories, \ from ..file_handling import delete_empty_directories, \
create_source_path_directory, \ create_source_path_directory, \
generate_unique_filename generate_unique_filename
@ -305,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def add_to_index(sender, document, **kwargs): def add_to_index(sender, document, **kwargs):
from documents import index
index.add_or_update_document(document) index.add_or_update_document(document)

View File

@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite
from django.test import TestCase from django.test import TestCase
from django.utils import timezone from django.utils import timezone
from documents import index
from documents.admin import DocumentAdmin from documents.admin import DocumentAdmin
from documents.models import Document from documents.models import Document
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
@ -11,37 +12,52 @@ from documents.tests.utils import DirectoriesMixin
class TestDocumentAdmin(DirectoriesMixin, TestCase): class TestDocumentAdmin(DirectoriesMixin, TestCase):
def get_document_from_index(self, doc):
    """Look up ``doc`` in the search index by its id.

    Opens the index, queries a searcher for the entry whose ``id`` equals
    ``doc.id`` and returns the result of that lookup.
    """
    searcher_cm = index.open_index().searcher()
    with searcher_cm as searcher:
        stored = searcher.document(id=doc.id)
    return stored
def setUp(self) -> None: def setUp(self) -> None:
super(TestDocumentAdmin, self).setUp() super(TestDocumentAdmin, self).setUp()
self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite()) self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite())
@mock.patch("documents.admin.index.add_or_update_document") def test_save_model(self):
def test_save_model(self, m):
doc = Document.objects.create(title="test") doc = Document.objects.create(title="test")
doc.title = "new title" doc.title = "new title"
self.doc_admin.save_model(None, doc, None, None) self.doc_admin.save_model(None, doc, None, None)
self.assertEqual(Document.objects.get(id=doc.id).title, "new title") self.assertEqual(Document.objects.get(id=doc.id).title, "new title")
m.assert_called_once() self.assertEqual(self.get_document_from_index(doc)['title'], "new title")
@mock.patch("documents.admin.index.remove_document") def test_delete_model(self):
def test_delete_model(self, m):
doc = Document.objects.create(title="test") doc = Document.objects.create(title="test")
self.doc_admin.delete_model(None, doc) index.add_or_update_document(doc)
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id) self.assertIsNotNone(self.get_document_from_index(doc))
m.assert_called_once()
@mock.patch("documents.admin.index.remove_document") self.doc_admin.delete_model(None, doc)
def test_delete_queryset(self, m):
self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id)
self.assertIsNone(self.get_document_from_index(doc))
def test_delete_queryset(self):
docs = []
for i in range(42): for i in range(42):
Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}") doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}")
docs.append(doc)
index.add_or_update_document(doc)
self.assertEqual(Document.objects.count(), 42) self.assertEqual(Document.objects.count(), 42)
for doc in docs:
self.assertIsNotNone(self.get_document_from_index(doc))
self.doc_admin.delete_queryset(None, Document.objects.all()) self.doc_admin.delete_queryset(None, Document.objects.all())
self.assertEqual(m.call_count, 42)
self.assertEqual(Document.objects.count(), 0) self.assertEqual(Document.objects.count(), 0)
for doc in docs:
self.assertIsNone(self.get_document_from_index(doc))
def test_created(self): def test_created(self):
doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12)) doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12))
self.assertEqual(self.doc_admin.created_(doc), "2020-04-12") self.assertEqual(self.doc_admin.created_(doc), "2020-04-12")

View File

@ -32,7 +32,6 @@ from rest_framework.viewsets import (
ViewSet ViewSet
) )
import documents.index as index
from paperless.db import GnuPG from paperless.db import GnuPG
from paperless.views import StandardPagination from paperless.views import StandardPagination
from .classifier import load_classifier from .classifier import load_classifier
@ -176,10 +175,12 @@ class DocumentViewSet(RetrieveModelMixin,
def update(self, request, *args, **kwargs): def update(self, request, *args, **kwargs):
response = super(DocumentViewSet, self).update( response = super(DocumentViewSet, self).update(
request, *args, **kwargs) request, *args, **kwargs)
from documents import index
index.add_or_update_document(self.get_object()) index.add_or_update_document(self.get_object())
return response return response
def destroy(self, request, *args, **kwargs): def destroy(self, request, *args, **kwargs):
from documents import index
index.remove_document_from_index(self.get_object()) index.remove_document_from_index(self.get_object())
return super(DocumentViewSet, self).destroy(request, *args, **kwargs) return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
@ -501,10 +502,6 @@ class SearchView(APIView):
permission_classes = (IsAuthenticated,) permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def add_infos_to_hit(self, r): def add_infos_to_hit(self, r):
try: try:
doc = Document.objects.get(id=r['id']) doc = Document.objects.get(id=r['id'])
@ -525,6 +522,7 @@ class SearchView(APIView):
} }
def get(self, request, format=None): def get(self, request, format=None):
from documents import index
if 'query' in request.query_params: if 'query' in request.query_params:
query = request.query_params['query'] query = request.query_params['query']
@ -554,8 +552,10 @@ class SearchView(APIView):
if page < 1: if page < 1:
page = 1 page = 1
ix = index.open_index()
try: try:
with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501 with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query): # NOQA: E501
return Response( return Response(
{'count': len(result_page), {'count': len(result_page),
'page': result_page.pagenum, 'page': result_page.pagenum,
@ -570,10 +570,6 @@ class SearchAutoCompleteView(APIView):
permission_classes = (IsAuthenticated,) permission_classes = (IsAuthenticated,)
def __init__(self, *args, **kwargs):
super(SearchAutoCompleteView, self).__init__(*args, **kwargs)
self.ix = index.open_index()
def get(self, request, format=None): def get(self, request, format=None):
if 'term' in request.query_params: if 'term' in request.query_params:
term = request.query_params['term'] term = request.query_params['term']
@ -587,7 +583,11 @@ class SearchAutoCompleteView(APIView):
else: else:
limit = 10 limit = 10
return Response(index.autocomplete(self.ix, term, limit)) from documents import index
ix = index.open_index()
return Response(index.autocomplete(ix, term, limit))
class StatisticsView(APIView): class StatisticsView(APIView):