mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Feature: Cache metadata and suggestions in Redis (#5638)
This commit is contained in:
parent
45e2b7f814
commit
25542c56b9
197
src/documents/caching.py
Normal file
197
src/documents/caching.py
Normal file
@ -0,0 +1,197 @@
|
||||
import logging
|
||||
from binascii import hexlify
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Final
|
||||
from typing import Optional
|
||||
|
||||
from django.core.cache import cache
|
||||
|
||||
from documents.models import Document
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from documents.classifier import DocumentClassifier
|
||||
|
||||
logger = logging.getLogger("paperless.caching")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MetadataCacheData:
    """
    Container for a document's cached metadata.

    The checksums present at caching time are stored alongside the metadata
    so a later cache hit can be validated against the current Document row
    before the metadata is reused.
    """

    # Checksum of the original file when the metadata was extracted
    original_checksum: str
    # Extracted metadata of the original file
    original_metadata: list
    # Checksum of the archive version, or None if the document had none
    archive_checksum: Optional[str]
    # Extracted metadata of the archive version, or None if the document had none
    archive_metadata: Optional[list]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SuggestionCacheData:
    """
    Container for a document's cached classifier suggestions.

    The classifier format version and hash at caching time are stored so a
    later cache hit can be rejected if the classifier has since changed.
    """

    # DocumentClassifier.FORMAT_VERSION of the classifier that produced these
    classifier_version: int
    # Hex digest of the classifier state that produced these suggestions
    classifier_hash: str
    # The suggestion payload as returned to the API
    suggestions: dict
|
||||
|
||||
|
||||
# Cache keys tracking the state of the classifier that last ran, used to
# invalidate cached suggestions when the classifier changes
CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"

# Common cache timeouts, in seconds
CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
|
||||
|
||||
|
||||
def get_suggestion_cache_key(document_id: int) -> str:
    """
    Build the cache key under which a document's suggestions are stored.
    """
    return "doc_{}_suggest".format(document_id)
|
||||
|
||||
|
||||
def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
    """
    If possible, return the cached suggestions for the given document ID.

    The classifier needs to be matching in format and hash and the suggestions
    need to have been cached once.  On any mismatch the stale entry is evicted
    and None is returned.
    """
    # Local import, presumably to avoid a circular import at module load
    from documents.classifier import DocumentClassifier

    doc_key = get_suggestion_cache_key(document_id)
    hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])

    # No cached suggestions at all
    if doc_key not in hits:
        return None

    doc_suggestions: SuggestionCacheData = hits[doc_key]

    # Usable only when the cached classifier format version matches both the
    # current code and the version stored with the suggestions ...
    version_matches = (
        CLASSIFIER_VERSION_KEY in hits
        and hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
        and hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
    )
    # ... and the cached classifier hash matches the hash stored with them
    hash_matches = (
        CLASSIFIER_HASH_KEY in hits
        and hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
    )

    if version_matches and hash_matches:
        return doc_suggestions

    # Something didn't match, remove the stale key  # pragma: no cover
    cache.delete(doc_key)
    return None
|
||||
|
||||
|
||||
def set_suggestions_cache(
    document_id: int,
    suggestions: dict,
    classifier: Optional["DocumentClassifier"],
    *,
    timeout=CACHE_50_MINUTES,
) -> None:
    """
    Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
    this function is a no-op (there won't be suggestions then anyway).

    The classifier's format version and state hash are stored alongside the
    suggestions so get_suggestion_cache() can reject them if the classifier
    changes.
    """
    # Fix: removed a leftover debug print of classifier.last_auto_type_hash
    if classifier is not None:
        doc_key = get_suggestion_cache_key(document_id)
        cache.set(
            doc_key,
            SuggestionCacheData(
                classifier.FORMAT_VERSION,
                # Store the raw digest bytes as a hex string for comparison
                hexlify(classifier.last_auto_type_hash).decode(),
                suggestions,
            ),
            timeout,
        )
|
||||
|
||||
|
||||
def refresh_suggestions_cache(
    document_id: int,
    *,
    timeout: int = CACHE_50_MINUTES,
) -> None:
    """
    Extend the lifetime of the cached suggestions for the given document ID
    to the given timeout.
    """
    cache.touch(get_suggestion_cache_key(document_id), timeout)
|
||||
|
||||
|
||||
def get_metadata_cache_key(document_id: int) -> str:
    """
    Build the cache key under which a document's metadata is stored.
    """
    return "doc_{}_metadata".format(document_id)
|
||||
|
||||
|
||||
def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
    """
    Returns the cached document metadata for the given document ID, as long as the metadata
    was cached once and the checksums have not changed.

    On a miss, or when the stored checksums no longer match the current
    Document, any stale entry is evicted and None is returned.
    """
    doc_key = get_metadata_cache_key(document_id)
    doc_metadata: Optional[MetadataCacheData] = cache.get(doc_key)
    # The metadata exists in the cache
    if doc_metadata is not None:
        try:
            doc = Document.objects.get(pk=document_id)
            # The original checksums match
            # If it has one, the archive checksums match
            # Then, we can use the metadata
            # Fix: a document without an archive version was previously never
            # a cache hit, because the condition required has_archive_version
            # to be True before comparing checksums.
            archive_ok = not doc.has_archive_version or (
                doc_metadata.archive_checksum is not None
                and doc_metadata.archive_checksum == doc.archive_checksum
            )
            if doc_metadata.original_checksum == doc.checksum and archive_ok:
                # Refresh the expiry on a confirmed hit
                cache.touch(doc_key, CACHE_50_MINUTES)
                return doc_metadata
            else:  # pragma: no cover
                # Something didn't match, delete the key
                cache.delete(doc_key)
        except Document.DoesNotExist:  # pragma: no cover
            # Basically impossible, but the key existed, but the Document didn't
            cache.delete(doc_key)
    return None
|
||||
|
||||
|
||||
def set_metadata_cache(
    document: Document,
    original_metadata: list,
    archive_metadata: Optional[list],
    *,
    timeout=CACHE_50_MINUTES,
) -> None:
    """
    Store the extracted metadata for the given Document in the cache,
    together with the document's current checksums for later validation.
    """
    payload = MetadataCacheData(
        document.checksum,
        original_metadata,
        document.archive_checksum,
        archive_metadata,
    )
    cache.set(get_metadata_cache_key(document.pk), payload, timeout)
|
||||
|
||||
|
||||
def refresh_metadata_cache(
    document_id: int,
    *,
    timeout: int = CACHE_50_MINUTES,
) -> None:
    """
    Extend the lifetime of the cached metadata for the given document ID
    to the given timeout.
    """
    cache.touch(get_metadata_cache_key(document_id), timeout)
|
||||
|
||||
|
||||
def get_thumbnail_modified_key(document_id: int) -> str:
    """
    Build the cache key under which a thumbnail's modification timestamp is stored.
    """
    return "doc_{}_thumbnail_modified".format(document_id)
|
@ -10,8 +10,13 @@ from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.cache import cache
|
||||
from sklearn.exceptions import InconsistentVersionWarning
|
||||
|
||||
from documents.caching import CACHE_50_MINUTES
|
||||
from documents.caching import CLASSIFIER_HASH_KEY
|
||||
from documents.caching import CLASSIFIER_MODIFIED_KEY
|
||||
from documents.caching import CLASSIFIER_VERSION_KEY
|
||||
from documents.models import Document
|
||||
from documents.models import MatchingModel
|
||||
|
||||
@ -208,6 +213,15 @@ class DocumentClassifier:
|
||||
and self.last_doc_change_time >= latest_doc_change
|
||||
) and self.last_auto_type_hash == hasher.digest():
|
||||
logger.info("No updates since last training")
|
||||
# Set the classifier information into the cache
|
||||
# Caching for 50 minutes, so slightly less than the normal retrain time
|
||||
cache.set(
|
||||
CLASSIFIER_MODIFIED_KEY,
|
||||
self.last_doc_change_time,
|
||||
CACHE_50_MINUTES,
|
||||
)
|
||||
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
|
||||
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
|
||||
return False
|
||||
|
||||
# subtract 1 since -1 (null) is also part of the classes.
|
||||
@ -322,6 +336,12 @@ class DocumentClassifier:
|
||||
self.last_doc_change_time = latest_doc_change
|
||||
self.last_auto_type_hash = hasher.digest()
|
||||
|
||||
# Set the classifier information into the cache
|
||||
# Caching for 50 minutes, so slightly less than the normal retrain time
|
||||
cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
|
||||
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
|
||||
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
|
||||
|
||||
return True
|
||||
|
||||
def preprocess_content(self, content: str) -> str: # pragma: no cover
|
||||
|
@ -1,9 +1,16 @@
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.cache import cache
|
||||
|
||||
from documents.caching import CACHE_5_MINUTES
|
||||
from documents.caching import CACHE_50_MINUTES
|
||||
from documents.caching import CLASSIFIER_HASH_KEY
|
||||
from documents.caching import CLASSIFIER_MODIFIED_KEY
|
||||
from documents.caching import CLASSIFIER_VERSION_KEY
|
||||
from documents.caching import get_thumbnail_modified_key
|
||||
from documents.classifier import DocumentClassifier
|
||||
from documents.models import Document
|
||||
|
||||
@ -14,18 +21,25 @@ def suggestions_etag(request, pk: int) -> Optional[str]:
|
||||
suggestions if the classifier has not been changed and the suggested dates
|
||||
setting is also unchanged
|
||||
|
||||
TODO: It would be nice to not duplicate the partial loading and the loading
|
||||
between here and the actual classifier
|
||||
"""
|
||||
# If no model file, no etag at all
|
||||
if not settings.MODEL_FILE.exists():
|
||||
return None
|
||||
with open(settings.MODEL_FILE, "rb") as f:
|
||||
schema_version = pickle.load(f)
|
||||
if schema_version != DocumentClassifier.FORMAT_VERSION:
|
||||
return None
|
||||
_ = pickle.load(f)
|
||||
last_auto_type_hash: bytes = pickle.load(f)
|
||||
return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}"
|
||||
# Check cache information
|
||||
cache_hits = cache.get_many(
|
||||
[CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
|
||||
)
|
||||
# If the version differs somehow, no etag
|
||||
if (
|
||||
CLASSIFIER_VERSION_KEY in cache_hits
|
||||
and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
|
||||
):
|
||||
return None
|
||||
elif CLASSIFIER_HASH_KEY in cache_hits:
|
||||
# Refresh the cache and return the hash digest and the dates setting
|
||||
cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
|
||||
return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
|
||||
return None
|
||||
|
||||
|
||||
def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
|
||||
@ -34,14 +48,23 @@ def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
|
||||
as there is not way to track the suggested date setting modification, but it seems
|
||||
unlikely that changes too often
|
||||
"""
|
||||
# No file, no last modified
|
||||
if not settings.MODEL_FILE.exists():
|
||||
return None
|
||||
with open(settings.MODEL_FILE, "rb") as f:
|
||||
schema_version = pickle.load(f)
|
||||
if schema_version != DocumentClassifier.FORMAT_VERSION:
|
||||
return None
|
||||
last_doc_change_time = pickle.load(f)
|
||||
return last_doc_change_time
|
||||
cache_hits = cache.get_many(
|
||||
[CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
|
||||
)
|
||||
# If the version differs somehow, no last modified
|
||||
if (
|
||||
CLASSIFIER_VERSION_KEY in cache_hits
|
||||
and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
|
||||
):
|
||||
return None
|
||||
elif CLASSIFIER_MODIFIED_KEY in cache_hits:
|
||||
# Refresh the cache and return the last modified
|
||||
cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
|
||||
return cache_hits[CLASSIFIER_MODIFIED_KEY]
|
||||
return None
|
||||
|
||||
|
||||
def metadata_etag(request, pk: int) -> Optional[str]:
|
||||
@ -52,7 +75,7 @@ def metadata_etag(request, pk: int) -> Optional[str]:
|
||||
try:
|
||||
doc = Document.objects.get(pk=pk)
|
||||
return doc.checksum
|
||||
except Document.DoesNotExist:
|
||||
except Document.DoesNotExist: # pragma: no cover
|
||||
return None
|
||||
return None
|
||||
|
||||
@ -66,7 +89,7 @@ def metadata_last_modified(request, pk: int) -> Optional[datetime]:
|
||||
try:
|
||||
doc = Document.objects.get(pk=pk)
|
||||
return doc.modified
|
||||
except Document.DoesNotExist:
|
||||
except Document.DoesNotExist: # pragma: no cover
|
||||
return None
|
||||
return None
|
||||
|
||||
@ -82,6 +105,46 @@ def preview_etag(request, pk: int) -> Optional[str]:
|
||||
and request.query_params["original"] == "true"
|
||||
)
|
||||
return doc.checksum if use_original else doc.archive_checksum
|
||||
except Document.DoesNotExist:
|
||||
except Document.DoesNotExist: # pragma: no cover
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def preview_last_modified(request, pk: int) -> Optional[datetime]:
    """
    Uses the documents modified time to set the Last-Modified header. Not strictly
    speaking correct, but close enough and quick.

    Returns None when the document does not exist, so no header is set.
    """
    # Fix: removed an unreachable trailing `return None` after the
    # try/except — both paths already return.
    try:
        doc = Document.objects.get(pk=pk)
        return doc.modified
    except Document.DoesNotExist:  # pragma: no cover
        return None
|
||||
|
||||
|
||||
def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
    """
    Returns the filesystem last modified either from cache or from filesystem.
    Cache should be (slightly?) faster than filesystem.
    """
    try:
        doc = Document.objects.get(pk=pk)
    except Document.DoesNotExist:  # pragma: no cover
        return None

    if not doc.thumbnail_path.exists():
        return None

    doc_key = get_thumbnail_modified_key(pk)
    cached_value = cache.get(doc_key)
    if cached_value is not None:
        # Confirmed hit: extend the entry's lifetime and use it
        cache.touch(doc_key, CACHE_50_MINUTES)
        return cached_value

    # Cache miss: stat the file and remember the timezone-aware datetime
    modified = datetime.fromtimestamp(
        doc.thumbnail_path.stat().st_mtime,
        tz=timezone.utc,
    )
    cache.set(doc_key, modified, CACHE_50_MINUTES)
    return modified
||||
|
@ -4,6 +4,7 @@ import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
import zoneinfo
|
||||
from binascii import hexlify
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
@ -13,12 +14,17 @@ from dateutil import parser
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import Permission
|
||||
from django.contrib.auth.models import User
|
||||
from django.core.cache import cache
|
||||
from django.test import override_settings
|
||||
from django.utils import timezone
|
||||
from guardian.shortcuts import assign_perm
|
||||
from rest_framework import status
|
||||
from rest_framework.test import APITestCase
|
||||
|
||||
from documents.caching import CACHE_50_MINUTES
|
||||
from documents.caching import CLASSIFIER_HASH_KEY
|
||||
from documents.caching import CLASSIFIER_MODIFIED_KEY
|
||||
from documents.caching import CLASSIFIER_VERSION_KEY
|
||||
from documents.models import Correspondent
|
||||
from documents.models import CustomField
|
||||
from documents.models import CustomFieldInstance
|
||||
@ -40,6 +46,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
|
||||
self.user = User.objects.create_superuser(username="temp_admin")
|
||||
self.client.force_authenticate(user=self.user)
|
||||
cache.clear()
|
||||
|
||||
def testDocuments(self):
|
||||
response = self.client.get("/api/documents/").data
|
||||
@ -1162,6 +1169,9 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
self.assertEqual(meta["original_size"], os.stat(source_file).st_size)
|
||||
self.assertEqual(meta["archive_size"], os.stat(archive_file).st_size)
|
||||
|
||||
response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
|
||||
def test_get_metadata_invalid_doc(self):
|
||||
response = self.client.get("/api/documents/34576/metadata/")
|
||||
self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
|
||||
@ -1266,7 +1276,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
},
|
||||
)
|
||||
|
||||
@mock.patch("documents.conditionals.pickle.load")
|
||||
@mock.patch("documents.views.load_classifier")
|
||||
@mock.patch("documents.views.match_storage_paths")
|
||||
@mock.patch("documents.views.match_document_types")
|
||||
@mock.patch("documents.views.match_tags")
|
||||
@ -1278,7 +1288,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
match_tags,
|
||||
match_document_types,
|
||||
match_storage_paths,
|
||||
mocked_pickle_load,
|
||||
mocked_load,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
@ -1287,23 +1297,43 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
- Classifier has not been modified
|
||||
THEN:
|
||||
- Subsequent requests are returned alright
|
||||
- ETag and last modified are called
|
||||
- ETag and last modified headers are set
|
||||
"""
|
||||
settings.MODEL_FILE.touch()
|
||||
|
||||
# setup the cache how the classifier does it
|
||||
from documents.classifier import DocumentClassifier
|
||||
|
||||
last_modified = timezone.now()
|
||||
settings.MODEL_FILE.touch()
|
||||
|
||||
# ETag first, then modified
|
||||
mock_effect = [
|
||||
DocumentClassifier.FORMAT_VERSION,
|
||||
"dont care",
|
||||
b"thisisachecksum",
|
||||
DocumentClassifier.FORMAT_VERSION,
|
||||
last_modified,
|
||||
classifier_checksum_bytes = b"thisisachecksum"
|
||||
classifier_checksum_hex = hexlify(classifier_checksum_bytes).decode()
|
||||
|
||||
# Two loads, so two side effects
|
||||
mocked_load.side_effect = [
|
||||
mock.Mock(
|
||||
last_auto_type_hash=classifier_checksum_bytes,
|
||||
FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
|
||||
),
|
||||
mock.Mock(
|
||||
last_auto_type_hash=classifier_checksum_bytes,
|
||||
FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
|
||||
),
|
||||
]
|
||||
mocked_pickle_load.side_effect = mock_effect
|
||||
|
||||
last_modified = timezone.now()
|
||||
cache.set(CLASSIFIER_MODIFIED_KEY, last_modified, CACHE_50_MINUTES)
|
||||
cache.set(CLASSIFIER_HASH_KEY, classifier_checksum_hex, CACHE_50_MINUTES)
|
||||
cache.set(
|
||||
CLASSIFIER_VERSION_KEY,
|
||||
DocumentClassifier.FORMAT_VERSION,
|
||||
CACHE_50_MINUTES,
|
||||
)
|
||||
|
||||
# Mock the matching
|
||||
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
|
||||
match_tags.return_value = [Tag(id=56), Tag(id=123)]
|
||||
match_document_types.return_value = [DocumentType(id=23)]
|
||||
match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
|
||||
|
||||
doc = Document.objects.create(
|
||||
title="test",
|
||||
@ -1311,12 +1341,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
content="this is an invoice from 12.04.2022!",
|
||||
)
|
||||
|
||||
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
|
||||
match_tags.return_value = [Tag(id=56), Tag(id=123)]
|
||||
match_document_types.return_value = [DocumentType(id=23)]
|
||||
match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
|
||||
|
||||
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
self.assertEqual(
|
||||
response.data,
|
||||
{
|
||||
@ -1327,7 +1353,6 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
"dates": ["2022-04-12"],
|
||||
},
|
||||
)
|
||||
mocked_pickle_load.assert_called()
|
||||
self.assertIn("Last-Modified", response.headers)
|
||||
self.assertEqual(
|
||||
response.headers["Last-Modified"],
|
||||
@ -1336,15 +1361,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
self.assertIn("ETag", response.headers)
|
||||
self.assertEqual(
|
||||
response.headers["ETag"],
|
||||
f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"",
|
||||
f'"{classifier_checksum_hex}:{settings.NUMBER_OF_SUGGESTED_DATES}"',
|
||||
)
|
||||
|
||||
mocked_pickle_load.rest_mock()
|
||||
mocked_pickle_load.side_effect = mock_effect
|
||||
|
||||
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
mocked_pickle_load.assert_called()
|
||||
|
||||
@mock.patch("documents.parsers.parse_date_generator")
|
||||
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
|
||||
|
@ -35,6 +35,7 @@ from django.utils.translation import get_language
|
||||
from django.views import View
|
||||
from django.views.decorators.cache import cache_control
|
||||
from django.views.decorators.http import condition
|
||||
from django.views.decorators.http import last_modified
|
||||
from django.views.generic import TemplateView
|
||||
from django_filters.rest_framework import DjangoFilterBackend
|
||||
from langdetect import detect
|
||||
@ -62,12 +63,21 @@ from documents import bulk_edit
|
||||
from documents.bulk_download import ArchiveOnlyStrategy
|
||||
from documents.bulk_download import OriginalAndArchiveStrategy
|
||||
from documents.bulk_download import OriginalsOnlyStrategy
|
||||
from documents.caching import CACHE_50_MINUTES
|
||||
from documents.caching import get_metadata_cache
|
||||
from documents.caching import get_suggestion_cache
|
||||
from documents.caching import refresh_metadata_cache
|
||||
from documents.caching import refresh_suggestions_cache
|
||||
from documents.caching import set_metadata_cache
|
||||
from documents.caching import set_suggestions_cache
|
||||
from documents.classifier import load_classifier
|
||||
from documents.conditionals import metadata_etag
|
||||
from documents.conditionals import metadata_last_modified
|
||||
from documents.conditionals import preview_etag
|
||||
from documents.conditionals import preview_last_modified
|
||||
from documents.conditionals import suggestions_etag
|
||||
from documents.conditionals import suggestions_last_modified
|
||||
from documents.conditionals import thumbnail_last_modified
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.data_models import DocumentSource
|
||||
@ -379,10 +389,12 @@ class DocumentViewSet(
|
||||
|
||||
try:
|
||||
return parser.extract_metadata(file, mime_type)
|
||||
except Exception:
|
||||
except Exception: # pragma: no cover
|
||||
logger.exception(f"Issue getting metadata for {file}")
|
||||
# TODO: cover GPG errors, remove later.
|
||||
return []
|
||||
else:
|
||||
else: # pragma: no cover
|
||||
logger.warning(f"No parser for {mime_type}")
|
||||
return []
|
||||
|
||||
def get_filesize(self, filename):
|
||||
@ -407,16 +419,37 @@ class DocumentViewSet(
|
||||
except Document.DoesNotExist:
|
||||
raise Http404
|
||||
|
||||
document_cached_metadata = get_metadata_cache(doc.pk)
|
||||
|
||||
archive_metadata = None
|
||||
archive_filesize = None
|
||||
if document_cached_metadata is not None:
|
||||
original_metadata = document_cached_metadata.original_metadata
|
||||
archive_metadata = document_cached_metadata.archive_metadata
|
||||
refresh_metadata_cache(doc.pk)
|
||||
else:
|
||||
original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
|
||||
|
||||
if doc.has_archive_version:
|
||||
archive_filesize = self.get_filesize(doc.archive_path)
|
||||
archive_metadata = self.get_metadata(
|
||||
doc.archive_path,
|
||||
"application/pdf",
|
||||
)
|
||||
set_metadata_cache(doc, original_metadata, archive_metadata)
|
||||
|
||||
meta = {
|
||||
"original_checksum": doc.checksum,
|
||||
"original_size": self.get_filesize(doc.source_path),
|
||||
"original_mime_type": doc.mime_type,
|
||||
"media_filename": doc.filename,
|
||||
"has_archive_version": doc.has_archive_version,
|
||||
"original_metadata": self.get_metadata(doc.source_path, doc.mime_type),
|
||||
"original_metadata": original_metadata,
|
||||
"archive_checksum": doc.archive_checksum,
|
||||
"archive_media_filename": doc.archive_filename,
|
||||
"original_filename": doc.original_filename,
|
||||
"archive_size": archive_filesize,
|
||||
"archive_metadata": archive_metadata,
|
||||
}
|
||||
|
||||
lang = "en"
|
||||
@ -426,16 +459,6 @@ class DocumentViewSet(
|
||||
pass
|
||||
meta["lang"] = lang
|
||||
|
||||
if doc.has_archive_version:
|
||||
meta["archive_size"] = self.get_filesize(doc.archive_path)
|
||||
meta["archive_metadata"] = self.get_metadata(
|
||||
doc.archive_path,
|
||||
"application/pdf",
|
||||
)
|
||||
else:
|
||||
meta["archive_size"] = None
|
||||
meta["archive_metadata"] = None
|
||||
|
||||
return Response(meta)
|
||||
|
||||
@action(methods=["get"], detail=True)
|
||||
@ -454,6 +477,12 @@ class DocumentViewSet(
|
||||
):
|
||||
return HttpResponseForbidden("Insufficient permissions")
|
||||
|
||||
document_suggestions = get_suggestion_cache(doc.pk)
|
||||
|
||||
if document_suggestions is not None:
|
||||
refresh_suggestions_cache(doc.pk)
|
||||
return Response(document_suggestions.suggestions)
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
dates = []
|
||||
@ -463,27 +492,30 @@ class DocumentViewSet(
|
||||
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
|
||||
)
|
||||
|
||||
return Response(
|
||||
{
|
||||
"correspondents": [
|
||||
c.id for c in match_correspondents(doc, classifier, request.user)
|
||||
],
|
||||
"tags": [t.id for t in match_tags(doc, classifier, request.user)],
|
||||
"document_types": [
|
||||
dt.id for dt in match_document_types(doc, classifier, request.user)
|
||||
],
|
||||
"storage_paths": [
|
||||
dt.id for dt in match_storage_paths(doc, classifier, request.user)
|
||||
],
|
||||
"dates": [
|
||||
date.strftime("%Y-%m-%d") for date in dates if date is not None
|
||||
],
|
||||
},
|
||||
)
|
||||
resp_data = {
|
||||
"correspondents": [
|
||||
c.id for c in match_correspondents(doc, classifier, request.user)
|
||||
],
|
||||
"tags": [t.id for t in match_tags(doc, classifier, request.user)],
|
||||
"document_types": [
|
||||
dt.id for dt in match_document_types(doc, classifier, request.user)
|
||||
],
|
||||
"storage_paths": [
|
||||
dt.id for dt in match_storage_paths(doc, classifier, request.user)
|
||||
],
|
||||
"dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
|
||||
}
|
||||
|
||||
# Cache the suggestions and the classifier hash for later
|
||||
set_suggestions_cache(doc.pk, resp_data, classifier)
|
||||
|
||||
return Response(resp_data)
|
||||
|
||||
@action(methods=["get"], detail=True)
|
||||
@method_decorator(cache_control(public=False, max_age=5 * 60))
|
||||
@method_decorator(condition(etag_func=preview_etag))
|
||||
@method_decorator(
|
||||
condition(etag_func=preview_etag, last_modified_func=preview_last_modified),
|
||||
)
|
||||
def preview(self, request, pk=None):
|
||||
try:
|
||||
response = self.file_response(pk, request, "inline")
|
||||
@ -492,7 +524,8 @@ class DocumentViewSet(
|
||||
raise Http404
|
||||
|
||||
@action(methods=["get"], detail=True)
|
||||
@method_decorator(cache_control(public=False, max_age=315360000))
|
||||
@method_decorator(cache_control(public=False, max_age=CACHE_50_MINUTES))
|
||||
@method_decorator(last_modified(thumbnail_last_modified))
|
||||
def thumb(self, request, pk=None):
|
||||
try:
|
||||
doc = Document.objects.get(id=pk)
|
||||
@ -506,8 +539,6 @@ class DocumentViewSet(
|
||||
handle = GnuPG.decrypted(doc.thumbnail_file)
|
||||
else:
|
||||
handle = doc.thumbnail_file
|
||||
# TODO: Send ETag information and use that to send new thumbnails
|
||||
# if available
|
||||
|
||||
return HttpResponse(handle, content_type="image/webp")
|
||||
except (FileNotFoundError, Document.DoesNotExist):
|
||||
|
@ -762,8 +762,12 @@ CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db")
|
||||
# django setting.
|
||||
CACHES = {
|
||||
"default": {
|
||||
"BACKEND": "django.core.cache.backends.redis.RedisCache",
|
||||
"BACKEND": os.environ.get(
|
||||
"PAPERLESS_CACHE_BACKEND",
|
||||
"django.core.cache.backends.redis.RedisCache",
|
||||
),
|
||||
"LOCATION": _CHANNELS_REDIS_URL,
|
||||
"KEY_PREFIX": os.getenv("PAPERLESS_REDIS_PREFIX", ""),
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,7 @@ DJANGO_SETTINGS_MODULE = paperless.settings
|
||||
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
|
||||
env =
|
||||
PAPERLESS_DISABLE_DBHANDLER=true
|
||||
PAPERLESS_CACHE_BACKEND=django.core.cache.backends.locmem.LocMemCache
|
||||
|
||||
[coverage:run]
|
||||
source =
|
||||
|
Loading…
x
Reference in New Issue
Block a user