From 25542c56b9dd5f4f02edfb67df5c6802cbcd1345 Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Sun, 4 Feb 2024 10:42:21 -0800
Subject: [PATCH] Feature: Cache metadata and suggestions in Redis (#5638)

---
 src/documents/caching.py                  | 197 ++++++++++++++++++++++
 src/documents/classifier.py               |  20 +++
 src/documents/conditionals.py             | 101 ++++++++---
 src/documents/tests/test_api_documents.py |  69 +++++---
 src/documents/views.py                    |  99 +++++++----
 src/paperless/settings.py                 |   6 +-
 src/setup.cfg                             |   1 +
 7 files changed, 415 insertions(+), 78 deletions(-)
 create mode 100644 src/documents/caching.py

diff --git a/src/documents/caching.py b/src/documents/caching.py
new file mode 100644
index 000000000..9b8607dd8
--- /dev/null
+++ b/src/documents/caching.py
@@ -0,0 +1,197 @@
+import logging
+from binascii import hexlify
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+    from documents.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
+
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+    original_checksum: str
+    original_metadata: list
+    archive_checksum: Optional[str]
+    archive_metadata: Optional[list]
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+    classifier_version: int
+    classifier_hash: str
+    suggestions: dict
+
+
+CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
+CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
+CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
+
+CACHE_1_MINUTE: Final[int] = 60
+CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
+CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
+
+
+def get_suggestion_cache_key(document_id: int) -> str:
+    """
+    Returns the basic key for a document's suggestions
+    """
+    return f"doc_{document_id}_suggest"
+
+
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
+    """
+    If possible, return the cached suggestions for the given document ID.
+    The classifier must match in both format version and hash, and the
+    suggestions must have been cached at least once.
+    """
+    from documents.classifier import DocumentClassifier
+
+    doc_key = get_suggestion_cache_key(document_id)
+    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
+    # The document suggestions are in the cache
+    if doc_key in cache_hits:
+        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+        # The classifier format is the same
+        # The classifier hash is the same
+        # Then the suggestions can be used
+        if (
+            CLASSIFIER_VERSION_KEY in cache_hits
+            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+        ) and (
+            CLASSIFIER_HASH_KEY in cache_hits
+            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+        ):
+            return doc_suggestions
+        else:  # pragma: no cover
+            # Remove the key because something didn't match
+            cache.delete(doc_key)
+    return None
+
+
+def set_suggestions_cache(
+    document_id: int,
+    suggestions: dict,
+    classifier: Optional["DocumentClassifier"],
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Caches the given suggestions, which were generated by the given classifier.
+    If there is no classifier, this function is a no-op (there won't be suggestions then anyway)
+    """
+    if classifier is not None:
+        doc_key = get_suggestion_cache_key(document_id)
+        cache.set(
+            doc_key,
+            SuggestionCacheData(
+                classifier.FORMAT_VERSION,
+                hexlify(classifier.last_auto_type_hash).decode(),
+                suggestions,
+            ),
+            timeout,
+        )
+
+
+def refresh_suggestions_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the suggestions for the given document ID
+    to the given timeout
+    """
+    doc_key = get_suggestion_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+    """
+    Returns the basic key for a document's metadata
+    """
+    return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+    """
+    Returns the cached document metadata for the given document ID, as long as
+    the metadata was cached once and the checksums have not changed
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    doc_metadata: Optional[MetadataCacheData] = cache.get(doc_key)
+    # The metadata exists in the cache
+    if doc_metadata is not None:
+        try:
+            doc = Document.objects.get(pk=document_id)
+            # The original checksums match
+            # If the document has an archive version, its checksums match too
+            # Then, we can use the metadata
+            if doc_metadata.original_checksum == doc.checksum and (
+                not doc.has_archive_version
+                or (
+                    doc_metadata.archive_checksum is not None
+                    and doc_metadata.archive_checksum == doc.archive_checksum
+                )
+            ):
+                # Refresh cache
+                cache.touch(doc_key, CACHE_50_MINUTES)
+                return doc_metadata
+            else:  # pragma: no cover
+                # Something didn't match, delete the key
+                cache.delete(doc_key)
+        except Document.DoesNotExist:  # pragma: no cover
+            # The key existed but the Document didn't; basically impossible, clean up anyway
+            cache.delete(doc_key)
+    return None
+
+
+def set_metadata_cache(
+    document: Document,
+    original_metadata: list,
+    archive_metadata: Optional[list],
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Sets the metadata into cache for the given Document
+    """
+    doc_key = get_metadata_cache_key(document.pk)
+    cache.set(
+        doc_key,
+        MetadataCacheData(
+            document.checksum,
+            original_metadata,
+            document.archive_checksum,
+            archive_metadata,
+        ),
+        timeout,
+    )
+
+
+def refresh_metadata_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the metadata for the given document ID
+    to the given timeout
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
+def get_thumbnail_modified_key(document_id: int) -> str:
+    """
+    Builds the key to store a thumbnail's timestamp
+    """
+    return f"doc_{document_id}_thumbnail_modified"
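The module above is a read-through cache: callers try the cache first, refresh the TTL on a hit, and recompute then store on a miss. A minimal sketch of the intended call pattern using the helpers above; `compute` is a hypothetical stand-in for the real matching code and is not part of this patch:

    from documents.caching import get_suggestion_cache
    from documents.caching import refresh_suggestions_cache
    from documents.caching import set_suggestions_cache


    def suggestions_for(document_id: int, classifier, compute) -> dict:
        cached = get_suggestion_cache(document_id)
        if cached is not None:
            # Hit: extend the entry's lifetime and reuse the stored value
            refresh_suggestions_cache(document_id)
            return cached.suggestions
        # Miss or stale classifier: recompute, then store the result tagged
        # with the current classifier's version and hash for later validation
        suggestions = compute(document_id, classifier)
        set_suggestions_cache(document_id, suggestions, classifier)
        return suggestions

The views.py changes later in this patch follow this same shape.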
diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 5833e373e..6180a8671 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -10,8 +10,13 @@ from pathlib import Path
 from typing import Optional
 
 from django.conf import settings
+from django.core.cache import cache
 from sklearn.exceptions import InconsistentVersionWarning
 
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
 from documents.models import Document
 from documents.models import MatchingModel
 
@@ -208,6 +213,15 @@ class DocumentClassifier:
             and self.last_doc_change_time >= latest_doc_change
         ) and self.last_auto_type_hash == hasher.digest():
             logger.info("No updates since last training")
+            # Set the classifier information into the cache
+            # Caching for 50 minutes, so slightly less than the normal retrain time
+            cache.set(
+                CLASSIFIER_MODIFIED_KEY,
+                self.last_doc_change_time,
+                CACHE_50_MINUTES,
+            )
+            cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
+            cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
             return False
 
         # subtract 1 since -1 (null) is also part of the classes.
@@ -322,6 +336,12 @@ class DocumentClassifier:
         self.last_doc_change_time = latest_doc_change
         self.last_auto_type_hash = hasher.digest()
 
+        # Set the classifier information into the cache
+        # Caching for 50 minutes, so slightly less than the normal retrain time
+        cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
+        cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
+        cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
+
         return True
 
     def preprocess_content(self, content: str) -> str:  # pragma: no cover
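Both the retrained and the no-op training paths publish the same three keys, so any process sharing the cache can ask whether its loaded classifier is still current without re-reading the model file. A sketch of such a check, assuming a loaded classifier exposing the `FORMAT_VERSION` and `last_auto_type_hash` attributes used above (this helper is illustrative, not part of the patch):

    from binascii import hexlify

    from django.core.cache import cache

    from documents.caching import CLASSIFIER_HASH_KEY
    from documents.caching import CLASSIFIER_VERSION_KEY


    def classifier_is_current(classifier) -> bool:
        # Keys that have expired are simply absent from get_many's result,
        # which makes the comparison below fail as intended
        hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY])
        return (
            hits.get(CLASSIFIER_VERSION_KEY) == classifier.FORMAT_VERSION
            and hits.get(CLASSIFIER_HASH_KEY)
            == hexlify(classifier.last_auto_type_hash).decode()
        )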
diff --git a/src/documents/conditionals.py b/src/documents/conditionals.py
index 07e6850fb..1b53dfe2b 100644
--- a/src/documents/conditionals.py
+++ b/src/documents/conditionals.py
@@ -1,9 +1,16 @@
-import pickle
 from datetime import datetime
+from datetime import timezone
 from typing import Optional
 
 from django.conf import settings
+from django.core.cache import cache
 
+from documents.caching import CACHE_5_MINUTES
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
+from documents.caching import get_thumbnail_modified_key
 from documents.classifier import DocumentClassifier
 from documents.models import Document
 
@@ -14,18 +21,25 @@ def suggestions_etag(request, pk: int) -> Optional[str]:
     suggestions if the classifier has not been changed and the suggested dates
     setting is also unchanged
 
-    TODO: It would be nice to not duplicate the partial loading and the loading
-    between here and the actual classifier
     """
+    # If no model file, no etag at all
     if not settings.MODEL_FILE.exists():
         return None
-    with open(settings.MODEL_FILE, "rb") as f:
-        schema_version = pickle.load(f)
-        if schema_version != DocumentClassifier.FORMAT_VERSION:
-            return None
-        _ = pickle.load(f)
-        last_auto_type_hash: bytes = pickle.load(f)
-        return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+    # Check cache information
+    cache_hits = cache.get_many(
+        [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
+    )
+    # If the version differs somehow, no etag
+    if (
+        CLASSIFIER_VERSION_KEY in cache_hits
+        and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+    ):
+        return None
+    elif CLASSIFIER_HASH_KEY in cache_hits:
+        # Refresh the cache and return the hash digest and the dates setting
+        cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
+        return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+    return None
 
 
 def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
@@ -34,14 +48,23 @@ def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
     as there is no way to track the suggested date setting modification, but
     it seems unlikely that it changes too often
     """
+    # No file, no last modified
    if not settings.MODEL_FILE.exists():
         return None
-    with open(settings.MODEL_FILE, "rb") as f:
-        schema_version = pickle.load(f)
-        if schema_version != DocumentClassifier.FORMAT_VERSION:
-            return None
-        last_doc_change_time = pickle.load(f)
-        return last_doc_change_time
+    cache_hits = cache.get_many(
+        [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
+    )
+    # If the version differs somehow, no last modified
+    if (
+        CLASSIFIER_VERSION_KEY in cache_hits
+        and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+    ):
+        return None
+    elif CLASSIFIER_MODIFIED_KEY in cache_hits:
+        # Refresh the cache and return the last modified
+        cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
+        return cache_hits[CLASSIFIER_MODIFIED_KEY]
+    return None
 
 
 def metadata_etag(request, pk: int) -> Optional[str]:
@@ -52,7 +75,7 @@ def metadata_etag(request, pk: int) -> Optional[str]:
     try:
         doc = Document.objects.get(pk=pk)
         return doc.checksum
-    except Document.DoesNotExist:
+    except Document.DoesNotExist:  # pragma: no cover
         return None
     return None
 
@@ -66,7 +89,7 @@ def metadata_last_modified(request, pk: int) -> Optional[datetime]:
     try:
         doc = Document.objects.get(pk=pk)
         return doc.modified
-    except Document.DoesNotExist:
+    except Document.DoesNotExist:  # pragma: no cover
         return None
     return None
 
@@ -82,6 +105,46 @@ def preview_etag(request, pk: int) -> Optional[str]:
             and request.query_params["original"] == "true"
         )
         return doc.checksum if use_original else doc.archive_checksum
-    except Document.DoesNotExist:
+    except Document.DoesNotExist:  # pragma: no cover
         return None
     return None
+
+
+def preview_last_modified(request, pk: int) -> Optional[datetime]:
+    """
+    Uses the document's modified time to set the Last-Modified header. Not strictly
+    speaking correct, but close enough and quick
+    """
+    try:
+        doc = Document.objects.get(pk=pk)
+        return doc.modified
+    except Document.DoesNotExist:  # pragma: no cover
+        return None
+    return None
+
+
+def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
+    """
+    Returns the filesystem last modified time, either from cache or from the
+    filesystem. Cache should be (slightly?) faster than the filesystem
+    """
+    try:
+        doc = Document.objects.get(pk=pk)
+        if not doc.thumbnail_path.exists():
+            return None
+        doc_key = get_thumbnail_modified_key(pk)
+
+        cache_hit = cache.get(doc_key)
+        if cache_hit is not None:
+            cache.touch(doc_key, CACHE_50_MINUTES)
+            return cache_hit
+
+        # No cache, get the timestamp and cache the datetime
+        last_modified = datetime.fromtimestamp(
+            doc.thumbnail_path.stat().st_mtime,
+            tz=timezone.utc,
+        )
+        cache.set(doc_key, last_modified, CACHE_50_MINUTES)
+        return last_modified
+    except Document.DoesNotExist:  # pragma: no cover
+        return None
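All of these helpers use the `(request, *args) -> Optional[...]` shape that Django's `condition` decorator expects, which is what lets the views attach them declaratively. A minimal function-view sketch of the wiring (the patch itself applies the same decorator to viewset actions through `method_decorator`, as shown further down):

    from django.views.decorators.http import condition

    from documents.conditionals import preview_etag
    from documents.conditionals import preview_last_modified


    # Django evaluates both callables before the view body; when the client's
    # If-None-Match / If-Modified-Since headers match, it answers with a
    # 304 Not Modified and the expensive body never runs.
    @condition(etag_func=preview_etag, last_modified_func=preview_last_modified)
    def preview(request, pk: int):
        ...  # build and return the full file response here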
diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py
index 20dd64d82..d7ae1eeb7 100644
--- a/src/documents/tests/test_api_documents.py
+++ b/src/documents/tests/test_api_documents.py
@@ -4,6 +4,7 @@ import shutil
 import tempfile
 import uuid
 import zoneinfo
+from binascii import hexlify
 from datetime import timedelta
 from pathlib import Path
 from unittest import mock
@@ -13,12 +14,17 @@ from dateutil import parser
 from django.conf import settings
 from django.contrib.auth.models import Permission
 from django.contrib.auth.models import User
+from django.core.cache import cache
 from django.test import override_settings
 from django.utils import timezone
 from guardian.shortcuts import assign_perm
 from rest_framework import status
 from rest_framework.test import APITestCase
 
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
 from documents.models import Correspondent
 from documents.models import CustomField
 from documents.models import CustomFieldInstance
@@ -40,6 +46,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
 
         self.user = User.objects.create_superuser(username="temp_admin")
         self.client.force_authenticate(user=self.user)
+        cache.clear()
 
     def testDocuments(self):
         response = self.client.get("/api/documents/").data
@@ -1162,6 +1169,9 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         self.assertEqual(meta["original_size"], os.stat(source_file).st_size)
         self.assertEqual(meta["archive_size"], os.stat(archive_file).st_size)
 
+        response = self.client.get(f"/api/documents/{doc.pk}/metadata/")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+
     def test_get_metadata_invalid_doc(self):
         response = self.client.get("/api/documents/34576/metadata/")
         self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND)
@@ -1266,7 +1276,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
             },
         )
 
-    @mock.patch("documents.conditionals.pickle.load")
+    @mock.patch("documents.views.load_classifier")
     @mock.patch("documents.views.match_storage_paths")
     @mock.patch("documents.views.match_document_types")
     @mock.patch("documents.views.match_tags")
@@ -1278,7 +1288,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         match_tags,
         match_document_types,
         match_storage_paths,
-        mocked_pickle_load,
+        mocked_load,
     ):
         """
         GIVEN:
             - Request for suggestions for a document
@@ -1287,23 +1297,43 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         WHEN:
             - Classifier has not been modified
         THEN:
             - Subsequent requests are served successfully
-            - ETag and last modified are called
+            - ETag and last modified headers are set
         """
-        settings.MODEL_FILE.touch()
+        # Set up the cache the way the classifier does it
         from documents.classifier import DocumentClassifier
 
-        last_modified = timezone.now()
+        settings.MODEL_FILE.touch()
 
-        # ETag first, then modified
-        mock_effect = [
-            DocumentClassifier.FORMAT_VERSION,
-            "dont care",
-            b"thisisachecksum",
-            DocumentClassifier.FORMAT_VERSION,
-            last_modified,
+        classifier_checksum_bytes = b"thisisachecksum"
+        classifier_checksum_hex = hexlify(classifier_checksum_bytes).decode()
+
+        # Two loads, so two side effects
+        mocked_load.side_effect = [
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum_bytes,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum_bytes,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
         ]
-        mocked_pickle_load.side_effect = mock_effect
+
+        last_modified = timezone.now()
+        cache.set(CLASSIFIER_MODIFIED_KEY, last_modified, CACHE_50_MINUTES)
+        cache.set(CLASSIFIER_HASH_KEY, classifier_checksum_hex, CACHE_50_MINUTES)
+        cache.set(
+            CLASSIFIER_VERSION_KEY,
+            DocumentClassifier.FORMAT_VERSION,
+            CACHE_50_MINUTES,
+        )
+
+        # Mock the matching
+        match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
+        match_tags.return_value = [Tag(id=56), Tag(id=123)]
+        match_document_types.return_value = [DocumentType(id=23)]
+        match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
 
         doc = Document.objects.create(
             title="test",
             checksum="123",
             content="this is an invoice from 12.04.2022!",
         )
 
-        match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
-        match_tags.return_value = [Tag(id=56), Tag(id=123)]
-        match_document_types.return_value = [DocumentType(id=23)]
-        match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
-
         response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
         self.assertEqual(
             response.data,
             {
                 "correspondents": [88, 2],
                 "tags": [56, 123],
                 "document_types": [23],
                 "storage_paths": [99, 77],
                 "dates": ["2022-04-12"],
             },
         )
-        mocked_pickle_load.assert_called()
         self.assertIn("Last-Modified", response.headers)
         self.assertEqual(
             response.headers["Last-Modified"],
             last_modified.strftime("%a, %d %b %Y %H:%M:%S GMT"),
         )
         self.assertIn("ETag", response.headers)
         self.assertEqual(
             response.headers["ETag"],
-            f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"",
+            f'"{classifier_checksum_hex}:{settings.NUMBER_OF_SUGGESTED_DATES}"',
         )
 
-        mocked_pickle_load.rest_mock()
-        mocked_pickle_load.side_effect = mock_effect
-
         response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
         self.assertEqual(response.status_code, status.HTTP_200_OK)
-        mocked_pickle_load.assert_called()
 
     @mock.patch("documents.parsers.parse_date_generator")
     @override_settings(NUMBER_OF_SUGGESTED_DATES=0)
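The suite now clears the default cache in `setUp()`, and the `setup.cfg` change at the end of this patch points the tests at the local-memory backend so parallel test processes never share Redis state. The same isolation can also be pinned onto a single test class; a hedged sketch using standard Django testing APIs (the class name is hypothetical):

    from django.core.cache import cache
    from django.test import TestCase
    from django.test import override_settings


    @override_settings(
        CACHES={
            "default": {
                "BACKEND": "django.core.cache.backends.locmem.LocMemCache",
            },
        },
    )
    class CachedEndpointTest(TestCase):
        def setUp(self):
            super().setUp()
            # Mirror the patch's setUp() hygiene so every test starts cold
            cache.clear()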
diff --git a/src/documents/views.py b/src/documents/views.py
index 11fb5b1f2..0578cdb24 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -35,6 +35,7 @@ from django.utils.translation import get_language
 from django.views import View
 from django.views.decorators.cache import cache_control
 from django.views.decorators.http import condition
+from django.views.decorators.http import last_modified
 from django.views.generic import TemplateView
 from django_filters.rest_framework import DjangoFilterBackend
 from langdetect import detect
@@ -62,12 +63,21 @@ from documents import bulk_edit
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
+from documents.caching import CACHE_50_MINUTES
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
 from documents.classifier import load_classifier
 from documents.conditionals import metadata_etag
 from documents.conditionals import metadata_last_modified
 from documents.conditionals import preview_etag
+from documents.conditionals import preview_last_modified
 from documents.conditionals import suggestions_etag
 from documents.conditionals import suggestions_last_modified
+from documents.conditionals import thumbnail_last_modified
 from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
 from documents.data_models import DocumentSource
@@ -379,10 +389,12 @@ class DocumentViewSet(
 
         try:
             return parser.extract_metadata(file, mime_type)
-        except Exception:
+        except Exception:  # pragma: no cover
+            logger.exception(f"Issue getting metadata for {file}")
             # TODO: cover GPG errors, remove later.
             return []
-        else:
+        else:  # pragma: no cover
+            logger.warning(f"No parser for {mime_type}")
             return []
 
     def get_filesize(self, filename):
@@ -407,16 +419,37 @@ class DocumentViewSet(
         except Document.DoesNotExist:
             raise Http404
 
+        document_cached_metadata = get_metadata_cache(doc.pk)
+
+        archive_metadata = None
+        # The size is a cheap stat(), so compute it on both the cached and
+        # uncached paths; otherwise a cache hit would report an archive_size
+        # of None even when an archive version exists
+        archive_filesize = (
+            self.get_filesize(doc.archive_path) if doc.has_archive_version else None
+        )
+        if document_cached_metadata is not None:
+            original_metadata = document_cached_metadata.original_metadata
+            archive_metadata = document_cached_metadata.archive_metadata
+            refresh_metadata_cache(doc.pk)
+        else:
+            original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
+
+            if doc.has_archive_version:
+                archive_metadata = self.get_metadata(
+                    doc.archive_path,
+                    "application/pdf",
+                )
+            set_metadata_cache(doc, original_metadata, archive_metadata)
+
         meta = {
             "original_checksum": doc.checksum,
             "original_size": self.get_filesize(doc.source_path),
             "original_mime_type": doc.mime_type,
             "media_filename": doc.filename,
             "has_archive_version": doc.has_archive_version,
-            "original_metadata": self.get_metadata(doc.source_path, doc.mime_type),
+            "original_metadata": original_metadata,
             "archive_checksum": doc.archive_checksum,
             "archive_media_filename": doc.archive_filename,
             "original_filename": doc.original_filename,
+            "archive_size": archive_filesize,
+            "archive_metadata": archive_metadata,
         }
 
         lang = "en"
@@ -426,16 +459,6 @@ class DocumentViewSet(
             pass
         meta["lang"] = lang
 
-        if doc.has_archive_version:
-            meta["archive_size"] = self.get_filesize(doc.archive_path)
-            meta["archive_metadata"] = self.get_metadata(
-                doc.archive_path,
-                "application/pdf",
-            )
-        else:
-            meta["archive_size"] = None
-            meta["archive_metadata"] = None
-
         return Response(meta)
 
     @action(methods=["get"], detail=True)
@@ -454,6 +477,12 @@ class DocumentViewSet(
         ):
             return HttpResponseForbidden("Insufficient permissions")
 
+        document_suggestions = get_suggestion_cache(doc.pk)
+
+        if document_suggestions is not None:
+            refresh_suggestions_cache(doc.pk)
+            return Response(document_suggestions.suggestions)
+
         classifier = load_classifier()
 
         dates = []
@@ -463,27 +492,30 @@ class DocumentViewSet(
                 {i for i in 
itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)}, ) - return Response( - { - "correspondents": [ - c.id for c in match_correspondents(doc, classifier, request.user) - ], - "tags": [t.id for t in match_tags(doc, classifier, request.user)], - "document_types": [ - dt.id for dt in match_document_types(doc, classifier, request.user) - ], - "storage_paths": [ - dt.id for dt in match_storage_paths(doc, classifier, request.user) - ], - "dates": [ - date.strftime("%Y-%m-%d") for date in dates if date is not None - ], - }, - ) + resp_data = { + "correspondents": [ + c.id for c in match_correspondents(doc, classifier, request.user) + ], + "tags": [t.id for t in match_tags(doc, classifier, request.user)], + "document_types": [ + dt.id for dt in match_document_types(doc, classifier, request.user) + ], + "storage_paths": [ + dt.id for dt in match_storage_paths(doc, classifier, request.user) + ], + "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None], + } + + # Cache the suggestions and the classifier hash for later + set_suggestions_cache(doc.pk, resp_data, classifier) + + return Response(resp_data) @action(methods=["get"], detail=True) @method_decorator(cache_control(public=False, max_age=5 * 60)) - @method_decorator(condition(etag_func=preview_etag)) + @method_decorator( + condition(etag_func=preview_etag, last_modified_func=preview_last_modified), + ) def preview(self, request, pk=None): try: response = self.file_response(pk, request, "inline") @@ -492,7 +524,8 @@ class DocumentViewSet( raise Http404 @action(methods=["get"], detail=True) - @method_decorator(cache_control(public=False, max_age=315360000)) + @method_decorator(cache_control(public=False, max_age=CACHE_50_MINUTES)) + @method_decorator(last_modified(thumbnail_last_modified)) def thumb(self, request, pk=None): try: doc = Document.objects.get(id=pk) @@ -506,8 +539,6 @@ class DocumentViewSet( handle = GnuPG.decrypted(doc.thumbnail_file) else: handle = doc.thumbnail_file - # TODO: Send ETag information and use that to send new thumbnails - # if available return HttpResponse(handle, content_type="image/webp") except (FileNotFoundError, Document.DoesNotExist): diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 17ec2765d..7179f0358 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -762,8 +762,12 @@ CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db") # django setting. CACHES = { "default": { - "BACKEND": "django.core.cache.backends.redis.RedisCache", + "BACKEND": os.environ.get( + "PAPERLESS_CACHE_BACKEND", + "django.core.cache.backends.redis.RedisCache", + ), "LOCATION": _CHANNELS_REDIS_URL, + "KEY_PREFIX": os.getenv("PAPERLESS_REDIS_PREFIX", ""), }, } diff --git a/src/setup.cfg b/src/setup.cfg index dc5e9e33a..1877cb16e 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -3,6 +3,7 @@ DJANGO_SETTINGS_MODULE = paperless.settings addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50 env = PAPERLESS_DISABLE_DBHANDLER=true + PAPERLESS_CACHE_BACKEND=django.core.cache.backends.locmem.LocMemCache [coverage:run] source =
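With this change the cache becomes environment-driven: `PAPERLESS_CACHE_BACKEND` swaps the backend class (the test suite uses it to select the local-memory backend, as in the `setup.cfg` line above), and `PAPERLESS_REDIS_PREFIX` namespaces the keys when several instances share one Redis. An illustrative environment snippet; the prefix value here is an arbitrary example:

    # Two paperless instances can share one Redis if their prefixes differ
    PAPERLESS_REDIS_PREFIX=paperless_a

    # Optional: any Django cache backend dotted path; defaults to RedisCache
    PAPERLESS_CACHE_BACKEND=django.core.cache.backends.redis.RedisCache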