From e16645b146da24f07004eb772a455450354a37a7 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 16 Jan 2024 09:01:07 -0800 Subject: [PATCH] Feature: Add additional caching support to suggestions and metadata (#5414) * Adds ETag and Last-Modified headers to suggestions, metadata and previews * Slight update to the suggestions etag * Small user message for why classifier didn't train again --- src/documents/classifier.py | 1 + src/documents/conditionals.py | 87 +++++++++++++++++++++++ src/documents/tests/test_api_documents.py | 80 +++++++++++++++++++++ src/documents/views.py | 17 +++++ 4 files changed, 185 insertions(+) create mode 100644 src/documents/conditionals.py diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 52af2733a..5833e373e 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -207,6 +207,7 @@ class DocumentClassifier: self.last_doc_change_time is not None and self.last_doc_change_time >= latest_doc_change ) and self.last_auto_type_hash == hasher.digest(): + logger.info("No updates since last training") return False # subtract 1 since -1 (null) is also part of the classes. diff --git a/src/documents/conditionals.py b/src/documents/conditionals.py new file mode 100644 index 000000000..07e6850fb --- /dev/null +++ b/src/documents/conditionals.py @@ -0,0 +1,87 @@ +import pickle +from datetime import datetime +from typing import Optional + +from django.conf import settings + +from documents.classifier import DocumentClassifier +from documents.models import Document + + +def suggestions_etag(request, pk: int) -> Optional[str]: + """ + Returns an optional string for the ETag, allowing browser caching of + suggestions if the classifier has not been changed and the suggested dates + setting is also unchanged + + TODO: It would be nice to not duplicate the partial loading and the loading + between here and the actual classifier + """ + if not settings.MODEL_FILE.exists(): + return None + with open(settings.MODEL_FILE, "rb") as f: + schema_version = pickle.load(f) + if schema_version != DocumentClassifier.FORMAT_VERSION: + return None + _ = pickle.load(f) + last_auto_type_hash: bytes = pickle.load(f) + return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}" + + +def suggestions_last_modified(request, pk: int) -> Optional[datetime]: + """ + Returns the datetime of classifier last modification. This is slightly off, + as there is not way to track the suggested date setting modification, but it seems + unlikely that changes too often + """ + if not settings.MODEL_FILE.exists(): + return None + with open(settings.MODEL_FILE, "rb") as f: + schema_version = pickle.load(f) + if schema_version != DocumentClassifier.FORMAT_VERSION: + return None + last_doc_change_time = pickle.load(f) + return last_doc_change_time + + +def metadata_etag(request, pk: int) -> Optional[str]: + """ + Metadata is extracted from the original file, so use its checksum as the + ETag + """ + try: + doc = Document.objects.get(pk=pk) + return doc.checksum + except Document.DoesNotExist: + return None + return None + + +def metadata_last_modified(request, pk: int) -> Optional[datetime]: + """ + Metadata is extracted from the original file, so use its modified. Strictly speaking, this is + not the modification of the original file, but of the database object, but might as well + error on the side of more cautious + """ + try: + doc = Document.objects.get(pk=pk) + return doc.modified + except Document.DoesNotExist: + return None + return None + + +def preview_etag(request, pk: int) -> Optional[str]: + """ + ETag for the document preview, using the original or archive checksum, depending on the request + """ + try: + doc = Document.objects.get(pk=pk) + use_original = ( + "original" in request.query_params + and request.query_params["original"] == "true" + ) + return doc.checksum if use_original else doc.archive_checksum + except Document.DoesNotExist: + return None + return None diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py index 66da23ef7..7b81c8df0 100644 --- a/src/documents/tests/test_api_documents.py +++ b/src/documents/tests/test_api_documents.py @@ -1266,6 +1266,86 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): }, ) + @mock.patch("documents.conditionals.pickle.load") + @mock.patch("documents.views.match_storage_paths") + @mock.patch("documents.views.match_document_types") + @mock.patch("documents.views.match_tags") + @mock.patch("documents.views.match_correspondents") + @override_settings(NUMBER_OF_SUGGESTED_DATES=10) + def test_get_suggestions_cached( + self, + match_correspondents, + match_tags, + match_document_types, + match_storage_paths, + mocked_pickle_load, + ): + """ + GIVEN: + - Request for suggestions for a document + WHEN: + - Classifier has not been modified + THEN: + - Subsequent requests are returned alright + - ETag and last modified are called + """ + settings.MODEL_FILE.touch() + + from documents.classifier import DocumentClassifier + + last_modified = timezone.now() + + # ETag first, then modified + mock_effect = [ + DocumentClassifier.FORMAT_VERSION, + "dont care", + b"thisisachecksum", + DocumentClassifier.FORMAT_VERSION, + last_modified, + ] + mocked_pickle_load.side_effect = mock_effect + + doc = Document.objects.create( + title="test", + mime_type="application/pdf", + content="this is an invoice from 12.04.2022!", + ) + + match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)] + match_tags.return_value = [Tag(id=56), Tag(id=123)] + match_document_types.return_value = [DocumentType(id=23)] + match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)] + + response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") + self.assertEqual( + response.data, + { + "correspondents": [88, 2], + "tags": [56, 123], + "document_types": [23], + "storage_paths": [99, 77], + "dates": ["2022-04-12"], + }, + ) + mocked_pickle_load.assert_called() + self.assertIn("Last-Modified", response.headers) + self.assertEqual( + response.headers["Last-Modified"], + last_modified.strftime("%a, %d %b %Y %H:%M:%S %Z").replace("UTC", "GMT"), + ) + self.assertIn("ETag", response.headers) + self.assertEqual( + response.headers["ETag"], + f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"", + ) + + mocked_pickle_load.rest_mock() + mocked_pickle_load.side_effect = mock_effect + + response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") + self.assertEqual(response.status_code, status.HTTP_200_OK) + mocked_pickle_load.assert_called() + @mock.patch("documents.parsers.parse_date_generator") @override_settings(NUMBER_OF_SUGGESTED_DATES=0) def test_get_suggestions_dates_disabled( diff --git a/src/documents/views.py b/src/documents/views.py index b545a1466..6dd8f3baa 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -34,6 +34,7 @@ from django.utils.decorators import method_decorator from django.utils.translation import get_language from django.views import View from django.views.decorators.cache import cache_control +from django.views.decorators.http import condition from django.views.generic import TemplateView from django_filters.rest_framework import DjangoFilterBackend from langdetect import detect @@ -62,6 +63,11 @@ from documents.bulk_download import ArchiveOnlyStrategy from documents.bulk_download import OriginalAndArchiveStrategy from documents.bulk_download import OriginalsOnlyStrategy from documents.classifier import load_classifier +from documents.conditionals import metadata_etag +from documents.conditionals import metadata_last_modified +from documents.conditionals import preview_etag +from documents.conditionals import suggestions_etag +from documents.conditionals import suggestions_last_modified from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource @@ -386,6 +392,9 @@ class DocumentViewSet( return None @action(methods=["get"], detail=True) + @method_decorator( + condition(etag_func=metadata_etag, last_modified_func=metadata_last_modified), + ) def metadata(self, request, pk=None): try: doc = Document.objects.get(pk=pk) @@ -430,6 +439,12 @@ class DocumentViewSet( return Response(meta) @action(methods=["get"], detail=True) + @method_decorator( + condition( + etag_func=suggestions_etag, + last_modified_func=suggestions_last_modified, + ), + ) def suggestions(self, request, pk=None): doc = get_object_or_404(Document, pk=pk) if request.user is not None and not has_perms_owner_aware( @@ -467,6 +482,8 @@ class DocumentViewSet( ) @action(methods=["get"], detail=True) + @method_decorator(cache_control(public=False, max_age=5 * 60)) + @method_decorator(condition(etag_func=preview_etag)) def preview(self, request, pk=None): try: response = self.file_response(pk, request, "inline")