mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	some search index optimizations
This commit is contained in:
		| @@ -1,7 +1,5 @@ | |||||||
| from django.contrib import admin | from django.contrib import admin | ||||||
| from whoosh.writing import AsyncWriter |  | ||||||
|  |  | ||||||
| from . import index |  | ||||||
| from .models import Correspondent, Document, DocumentType, Tag, \ | from .models import Correspondent, Document, DocumentType, Tag, \ | ||||||
|     SavedView, SavedViewFilterRule |     SavedView, SavedViewFilterRule | ||||||
|  |  | ||||||
| @@ -84,17 +82,21 @@ class DocumentAdmin(admin.ModelAdmin): | |||||||
|     created_.short_description = "Created" |     created_.short_description = "Created" | ||||||
|  |  | ||||||
|     def delete_queryset(self, request, queryset): |     def delete_queryset(self, request, queryset): | ||||||
|         ix = index.open_index() |         from documents import index | ||||||
|         with AsyncWriter(ix) as writer: |  | ||||||
|  |         with index.open_index_writer() as writer: | ||||||
|             for o in queryset: |             for o in queryset: | ||||||
|                 index.remove_document(writer, o) |                 index.remove_document(writer, o) | ||||||
|  |  | ||||||
|         super(DocumentAdmin, self).delete_queryset(request, queryset) |         super(DocumentAdmin, self).delete_queryset(request, queryset) | ||||||
|  |  | ||||||
|     def delete_model(self, request, obj): |     def delete_model(self, request, obj): | ||||||
|  |         from documents import index | ||||||
|         index.remove_document_from_index(obj) |         index.remove_document_from_index(obj) | ||||||
|         super(DocumentAdmin, self).delete_model(request, obj) |         super(DocumentAdmin, self).delete_model(request, obj) | ||||||
|  |  | ||||||
|     def save_model(self, request, obj, form, change): |     def save_model(self, request, obj, form, change): | ||||||
|  |         from documents import index | ||||||
|         index.add_or_update_document(obj) |         index.add_or_update_document(obj) | ||||||
|         super(DocumentAdmin, self).save_model(request, obj, form, change) |         super(DocumentAdmin, self).save_model(request, obj, form, change) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,9 +2,7 @@ import itertools | |||||||
|  |  | ||||||
| from django.db.models import Q | from django.db.models import Q | ||||||
| from django_q.tasks import async_task | from django_q.tasks import async_task | ||||||
| from whoosh.writing import AsyncWriter |  | ||||||
|  |  | ||||||
| from documents import index |  | ||||||
| from documents.models import Document, Correspondent, DocumentType | from documents.models import Document, Correspondent, DocumentType | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -99,8 +97,9 @@ def modify_tags(doc_ids, add_tags, remove_tags): | |||||||
| def delete(doc_ids): | def delete(doc_ids): | ||||||
|     Document.objects.filter(id__in=doc_ids).delete() |     Document.objects.filter(id__in=doc_ids).delete() | ||||||
|  |  | ||||||
|     ix = index.open_index() |     from documents import index | ||||||
|     with AsyncWriter(ix) as writer: |  | ||||||
|  |     with index.open_index_writer() as writer: | ||||||
|         for id in doc_ids: |         for id in doc_ids: | ||||||
|             index.remove_document_by_id(writer, id) |             index.remove_document_by_id(writer, id) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -86,6 +86,22 @@ def open_index(recreate=False): | |||||||
|     return create_in(settings.INDEX_DIR, get_schema()) |     return create_in(settings.INDEX_DIR, get_schema()) | ||||||
|  |  | ||||||
|  |  | ||||||
@contextmanager
def open_index_writer(ix=None, optimize=False):
    """
    Yield an ``AsyncWriter`` for the search index and finalize it on exit.

    :param ix: index to write to; when omitted, ``open_index()`` is used.
    :param optimize: forwarded to ``writer.commit(optimize=...)`` when the
        ``with`` body finishes without raising.

    If the ``with`` body raises an ``Exception``, it is logged, the writer is
    cancelled and the exception is not re-raised. Pending changes are
    committed only when the body completes cleanly.
    """
    writer = AsyncWriter(ix if ix is not None else open_index())

    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        # Cancel only: a commit issued after cancel() would act on a writer
        # that was just told to discard its work.
        writer.cancel()
    except BaseException:
        # KeyboardInterrupt / SystemExit / GeneratorExit: drop the pending
        # changes so the writer is released, then let the interrupt propagate.
        writer.cancel()
        raise
    else:
        writer.commit(optimize=optimize)
|  |  | ||||||
|  |  | ||||||
| def update_document(writer, doc): | def update_document(writer, doc): | ||||||
|     tags = ",".join([t.name for t in doc.tags.all()]) |     tags = ",".join([t.name for t in doc.tags.all()]) | ||||||
|     writer.update_document( |     writer.update_document( | ||||||
| @@ -110,14 +126,12 @@ def remove_document_by_id(writer, doc_id): | |||||||
|  |  | ||||||
|  |  | ||||||
| def add_or_update_document(document): | def add_or_update_document(document): | ||||||
|     ix = open_index() |     with open_index_writer() as writer: | ||||||
|     with AsyncWriter(ix) as writer: |  | ||||||
|         update_document(writer, document) |         update_document(writer, document) | ||||||
|  |  | ||||||
|  |  | ||||||
| def remove_document_from_index(document): | def remove_document_from_index(document): | ||||||
|     ix = open_index() |     with open_index_writer() as writer: | ||||||
|     with AsyncWriter(ix) as writer: |  | ||||||
|         remove_document(writer, document) |         remove_document(writer, document) | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -11,7 +11,7 @@ from django.dispatch import receiver | |||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from filelock import FileLock | from filelock import FileLock | ||||||
|  |  | ||||||
| from .. import index, matching | from .. import matching | ||||||
| from ..file_handling import delete_empty_directories, \ | from ..file_handling import delete_empty_directories, \ | ||||||
|     create_source_path_directory, \ |     create_source_path_directory, \ | ||||||
|     generate_unique_filename |     generate_unique_filename | ||||||
| @@ -305,4 +305,6 @@ def set_log_entry(sender, document=None, logging_group=None, **kwargs): | |||||||
|  |  | ||||||
|  |  | ||||||
| def add_to_index(sender, document, **kwargs): | def add_to_index(sender, document, **kwargs): | ||||||
|  |     from documents import index | ||||||
|  |  | ||||||
|     index.add_or_update_document(document) |     index.add_or_update_document(document) | ||||||
|   | |||||||
| @@ -4,6 +4,7 @@ from django.contrib.admin.sites import AdminSite | |||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
|  |  | ||||||
|  | from documents import index | ||||||
| from documents.admin import DocumentAdmin | from documents.admin import DocumentAdmin | ||||||
| from documents.models import Document | from documents.models import Document | ||||||
| from documents.tests.utils import DirectoriesMixin | from documents.tests.utils import DirectoriesMixin | ||||||
| @@ -11,37 +12,52 @@ from documents.tests.utils import DirectoriesMixin | |||||||
|  |  | ||||||
| class TestDocumentAdmin(DirectoriesMixin, TestCase): | class TestDocumentAdmin(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|  |     def get_document_from_index(self, doc): | ||||||
|  |         ix = index.open_index() | ||||||
|  |         with ix.searcher() as searcher: | ||||||
|  |             return searcher.document(id=doc.id) | ||||||
|  |  | ||||||
|     def setUp(self) -> None: |     def setUp(self) -> None: | ||||||
|         super(TestDocumentAdmin, self).setUp() |         super(TestDocumentAdmin, self).setUp() | ||||||
|         self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite()) |         self.doc_admin = DocumentAdmin(model=Document, admin_site=AdminSite()) | ||||||
|  |  | ||||||
|     @mock.patch("documents.admin.index.add_or_update_document") |     def test_save_model(self): | ||||||
|     def test_save_model(self, m): |  | ||||||
|         doc = Document.objects.create(title="test") |         doc = Document.objects.create(title="test") | ||||||
|  |  | ||||||
|         doc.title = "new title" |         doc.title = "new title" | ||||||
|         self.doc_admin.save_model(None, doc, None, None) |         self.doc_admin.save_model(None, doc, None, None) | ||||||
|         self.assertEqual(Document.objects.get(id=doc.id).title, "new title") |         self.assertEqual(Document.objects.get(id=doc.id).title, "new title") | ||||||
|         m.assert_called_once() |         self.assertEqual(self.get_document_from_index(doc)['title'], "new title") | ||||||
|  |  | ||||||
|     @mock.patch("documents.admin.index.remove_document") |     def test_delete_model(self): | ||||||
|     def test_delete_model(self, m): |  | ||||||
|         doc = Document.objects.create(title="test") |         doc = Document.objects.create(title="test") | ||||||
|         self.doc_admin.delete_model(None, doc) |         index.add_or_update_document(doc) | ||||||
|         self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id) |         self.assertIsNotNone(self.get_document_from_index(doc)) | ||||||
|         m.assert_called_once() |  | ||||||
|  |  | ||||||
|     @mock.patch("documents.admin.index.remove_document") |         self.doc_admin.delete_model(None, doc) | ||||||
|     def test_delete_queryset(self, m): |  | ||||||
|  |         self.assertRaises(Document.DoesNotExist, Document.objects.get, id=doc.id) | ||||||
|  |         self.assertIsNone(self.get_document_from_index(doc)) | ||||||
|  |  | ||||||
|  |     def test_delete_queryset(self): | ||||||
|  |         docs = [] | ||||||
|         for i in range(42): |         for i in range(42): | ||||||
|             Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}") |             doc = Document.objects.create(title="Many documents with the same title", checksum=f"{i:02}") | ||||||
|  |             docs.append(doc) | ||||||
|  |             index.add_or_update_document(doc) | ||||||
|  |  | ||||||
|         self.assertEqual(Document.objects.count(), 42) |         self.assertEqual(Document.objects.count(), 42) | ||||||
|  |  | ||||||
|  |         for doc in docs: | ||||||
|  |             self.assertIsNotNone(self.get_document_from_index(doc)) | ||||||
|  |  | ||||||
|         self.doc_admin.delete_queryset(None, Document.objects.all()) |         self.doc_admin.delete_queryset(None, Document.objects.all()) | ||||||
|  |  | ||||||
|         self.assertEqual(m.call_count, 42) |  | ||||||
|         self.assertEqual(Document.objects.count(), 0) |         self.assertEqual(Document.objects.count(), 0) | ||||||
|  |  | ||||||
|  |         for doc in docs: | ||||||
|  |             self.assertIsNone(self.get_document_from_index(doc)) | ||||||
|  |  | ||||||
|     def test_created(self): |     def test_created(self): | ||||||
|         doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12)) |         doc = Document.objects.create(title="test", created=timezone.datetime(2020, 4, 12)) | ||||||
|         self.assertEqual(self.doc_admin.created_(doc), "2020-04-12") |         self.assertEqual(self.doc_admin.created_(doc), "2020-04-12") | ||||||
|   | |||||||
| @@ -32,7 +32,6 @@ from rest_framework.viewsets import ( | |||||||
|     ViewSet |     ViewSet | ||||||
| ) | ) | ||||||
|  |  | ||||||
| import documents.index as index |  | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
| from paperless.views import StandardPagination | from paperless.views import StandardPagination | ||||||
| from .classifier import load_classifier | from .classifier import load_classifier | ||||||
| @@ -176,10 +175,12 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|     def update(self, request, *args, **kwargs): |     def update(self, request, *args, **kwargs): | ||||||
|         response = super(DocumentViewSet, self).update( |         response = super(DocumentViewSet, self).update( | ||||||
|             request, *args, **kwargs) |             request, *args, **kwargs) | ||||||
|  |         from documents import index | ||||||
|         index.add_or_update_document(self.get_object()) |         index.add_or_update_document(self.get_object()) | ||||||
|         return response |         return response | ||||||
|  |  | ||||||
|     def destroy(self, request, *args, **kwargs): |     def destroy(self, request, *args, **kwargs): | ||||||
|  |         from documents import index | ||||||
|         index.remove_document_from_index(self.get_object()) |         index.remove_document_from_index(self.get_object()) | ||||||
|         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) |         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) | ||||||
|  |  | ||||||
| @@ -501,10 +502,6 @@ class SearchView(APIView): | |||||||
|  |  | ||||||
|     permission_classes = (IsAuthenticated,) |     permission_classes = (IsAuthenticated,) | ||||||
|  |  | ||||||
|     def __init__(self, *args, **kwargs): |  | ||||||
|         super(SearchView, self).__init__(*args, **kwargs) |  | ||||||
|         self.ix = index.open_index() |  | ||||||
|  |  | ||||||
|     def add_infos_to_hit(self, r): |     def add_infos_to_hit(self, r): | ||||||
|         try: |         try: | ||||||
|             doc = Document.objects.get(id=r['id']) |             doc = Document.objects.get(id=r['id']) | ||||||
| @@ -525,6 +522,7 @@ class SearchView(APIView): | |||||||
|                 } |                 } | ||||||
|  |  | ||||||
|     def get(self, request, format=None): |     def get(self, request, format=None): | ||||||
|  |         from documents import index | ||||||
|  |  | ||||||
|         if 'query' in request.query_params: |         if 'query' in request.query_params: | ||||||
|             query = request.query_params['query'] |             query = request.query_params['query'] | ||||||
| @@ -554,8 +552,10 @@ class SearchView(APIView): | |||||||
|         if page < 1: |         if page < 1: | ||||||
|             page = 1 |             page = 1 | ||||||
|  |  | ||||||
|  |         ix = index.open_index() | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             with index.query_page(self.ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501 |             with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501 | ||||||
|                 return Response( |                 return Response( | ||||||
|                     {'count': len(result_page), |                     {'count': len(result_page), | ||||||
|                      'page': result_page.pagenum, |                      'page': result_page.pagenum, | ||||||
| @@ -570,10 +570,6 @@ class SearchAutoCompleteView(APIView): | |||||||
|  |  | ||||||
|     permission_classes = (IsAuthenticated,) |     permission_classes = (IsAuthenticated,) | ||||||
|  |  | ||||||
|     def __init__(self, *args, **kwargs): |  | ||||||
|         super(SearchAutoCompleteView, self).__init__(*args, **kwargs) |  | ||||||
|         self.ix = index.open_index() |  | ||||||
|  |  | ||||||
|     def get(self, request, format=None): |     def get(self, request, format=None): | ||||||
|         if 'term' in request.query_params: |         if 'term' in request.query_params: | ||||||
|             term = request.query_params['term'] |             term = request.query_params['term'] | ||||||
| @@ -587,7 +583,11 @@ class SearchAutoCompleteView(APIView): | |||||||
|         else: |         else: | ||||||
|             limit = 10 |             limit = 10 | ||||||
|  |  | ||||||
|         return Response(index.autocomplete(self.ix, term, limit)) |         from documents import index | ||||||
|  |  | ||||||
|  |         ix = index.open_index() | ||||||
|  |  | ||||||
|  |         return Response(index.autocomplete(ix, term, limit)) | ||||||
|  |  | ||||||
|  |  | ||||||
| class StatisticsView(APIView): | class StatisticsView(APIView): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler