Feature: Add additional caching support to suggestions and metadata (#5414)

* Adds ETag and Last-Modified headers to suggestions, metadata and previews

* Slight update to the suggestions etag

* Small user message for why classifier didn't train again
This commit is contained in:
Trenton H 2024-01-16 09:01:07 -08:00 committed by GitHub
parent 0068f091bb
commit e16645b146
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 185 additions and 0 deletions

View File

@ -207,6 +207,7 @@ class DocumentClassifier:
self.last_doc_change_time is not None
and self.last_doc_change_time >= latest_doc_change
) and self.last_auto_type_hash == hasher.digest():
logger.info("No updates since last training")
return False
# subtract 1 since -1 (null) is also part of the classes.

View File

@ -0,0 +1,87 @@
import pickle
from datetime import datetime
from typing import Optional
from django.conf import settings
from documents.classifier import DocumentClassifier
from documents.models import Document
def suggestions_etag(request, pk: int) -> Optional[str]:
"""
Returns an optional string for the ETag, allowing browser caching of
suggestions if the classifier has not been changed and the suggested dates
setting is also unchanged
TODO: It would be nice to not duplicate the partial loading and the loading
between here and the actual classifier
"""
if not settings.MODEL_FILE.exists():
return None
with open(settings.MODEL_FILE, "rb") as f:
schema_version = pickle.load(f)
if schema_version != DocumentClassifier.FORMAT_VERSION:
return None
_ = pickle.load(f)
last_auto_type_hash: bytes = pickle.load(f)
return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}"
def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
"""
Returns the datetime of classifier last modification. This is slightly off,
as there is not way to track the suggested date setting modification, but it seems
unlikely that changes too often
"""
if not settings.MODEL_FILE.exists():
return None
with open(settings.MODEL_FILE, "rb") as f:
schema_version = pickle.load(f)
if schema_version != DocumentClassifier.FORMAT_VERSION:
return None
last_doc_change_time = pickle.load(f)
return last_doc_change_time
def metadata_etag(request, pk: int) -> Optional[str]:
"""
Metadata is extracted from the original file, so use its checksum as the
ETag
"""
try:
doc = Document.objects.get(pk=pk)
return doc.checksum
except Document.DoesNotExist:
return None
return None
def metadata_last_modified(request, pk: int) -> Optional[datetime]:
"""
Metadata is extracted from the original file, so use its modified. Strictly speaking, this is
not the modification of the original file, but of the database object, but might as well
error on the side of more cautious
"""
try:
doc = Document.objects.get(pk=pk)
return doc.modified
except Document.DoesNotExist:
return None
return None
def preview_etag(request, pk: int) -> Optional[str]:
"""
ETag for the document preview, using the original or archive checksum, depending on the request
"""
try:
doc = Document.objects.get(pk=pk)
use_original = (
"original" in request.query_params
and request.query_params["original"] == "true"
)
return doc.checksum if use_original else doc.archive_checksum
except Document.DoesNotExist:
return None
return None

View File

@ -1266,6 +1266,86 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
},
)
@mock.patch("documents.conditionals.pickle.load")
@mock.patch("documents.views.match_storage_paths")
@mock.patch("documents.views.match_document_types")
@mock.patch("documents.views.match_tags")
@mock.patch("documents.views.match_correspondents")
@override_settings(NUMBER_OF_SUGGESTED_DATES=10)
def test_get_suggestions_cached(
self,
match_correspondents,
match_tags,
match_document_types,
match_storage_paths,
mocked_pickle_load,
):
"""
GIVEN:
- Request for suggestions for a document
WHEN:
- Classifier has not been modified
THEN:
- Subsequent requests are returned alright
- ETag and last modified are called
"""
settings.MODEL_FILE.touch()
from documents.classifier import DocumentClassifier
last_modified = timezone.now()
# ETag first, then modified
mock_effect = [
DocumentClassifier.FORMAT_VERSION,
"dont care",
b"thisisachecksum",
DocumentClassifier.FORMAT_VERSION,
last_modified,
]
mocked_pickle_load.side_effect = mock_effect
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is an invoice from 12.04.2022!",
)
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
match_tags.return_value = [Tag(id=56), Tag(id=123)]
match_document_types.return_value = [DocumentType(id=23)]
match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(
response.data,
{
"correspondents": [88, 2],
"tags": [56, 123],
"document_types": [23],
"storage_paths": [99, 77],
"dates": ["2022-04-12"],
},
)
mocked_pickle_load.assert_called()
self.assertIn("Last-Modified", response.headers)
self.assertEqual(
response.headers["Last-Modified"],
last_modified.strftime("%a, %d %b %Y %H:%M:%S %Z").replace("UTC", "GMT"),
)
self.assertIn("ETag", response.headers)
self.assertEqual(
response.headers["ETag"],
f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"",
)
mocked_pickle_load.rest_mock()
mocked_pickle_load.side_effect = mock_effect
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, status.HTTP_200_OK)
mocked_pickle_load.assert_called()
@mock.patch("documents.parsers.parse_date_generator")
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
def test_get_suggestions_dates_disabled(

View File

@ -34,6 +34,7 @@ from django.utils.decorators import method_decorator
from django.utils.translation import get_language
from django.views import View
from django.views.decorators.cache import cache_control
from django.views.decorators.http import condition
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from langdetect import detect
@ -62,6 +63,11 @@ from documents.bulk_download import ArchiveOnlyStrategy
from documents.bulk_download import OriginalAndArchiveStrategy
from documents.bulk_download import OriginalsOnlyStrategy
from documents.classifier import load_classifier
from documents.conditionals import metadata_etag
from documents.conditionals import metadata_last_modified
from documents.conditionals import preview_etag
from documents.conditionals import suggestions_etag
from documents.conditionals import suggestions_last_modified
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
@ -386,6 +392,9 @@ class DocumentViewSet(
return None
@action(methods=["get"], detail=True)
@method_decorator(
condition(etag_func=metadata_etag, last_modified_func=metadata_last_modified),
)
def metadata(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
@ -430,6 +439,12 @@ class DocumentViewSet(
return Response(meta)
@action(methods=["get"], detail=True)
@method_decorator(
condition(
etag_func=suggestions_etag,
last_modified_func=suggestions_last_modified,
),
)
def suggestions(self, request, pk=None):
doc = get_object_or_404(Document, pk=pk)
if request.user is not None and not has_perms_owner_aware(
@ -467,6 +482,8 @@ class DocumentViewSet(
)
@action(methods=["get"], detail=True)
@method_decorator(cache_control(public=False, max_age=5 * 60))
@method_decorator(condition(etag_func=preview_etag))
def preview(self, request, pk=None):
try:
response = self.file_response(pk, request, "inline")