Merge caching

2025-09-16 21:55:37 -05:00 · 2025-04-08 16:36:31 -07:00
parent b92651aad2
commit 730636f38e
7 changed files with 22 additions and 22 deletions
--- a/src/paperless/caching.py
+++ b/src/paperless/caching.py
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+import logging
+from binascii import hexlify
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+from typing import Final
+
+from django.core.cache import cache
+
+from paperless.models import Document
+
+if TYPE_CHECKING:
+    from paperless.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
+
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+    """
+    Immutable value stored in the cache for a single document's metadata.
+
+    Built by set_metadata_cache and validated against the database row by
+    get_metadata_cache.
+    """
+
+    # Value of Document.checksum at the time the entry was cached
+    original_checksum: str
+    # Metadata of the original file, as handed to set_metadata_cache
+    original_metadata: list
+    # Value of Document.archive_checksum at the time the entry was cached
+    archive_checksum: str | None
+    # Metadata of the archive file, as handed to set_metadata_cache (may be None)
+    archive_metadata: list | None
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+    """
+    Immutable value stored in the cache for a single document's suggestions.
+
+    Built by set_suggestions_cache; get_suggestion_cache only returns it while
+    the recorded classifier version and hash still match the cached ones.
+    """
+
+    # FORMAT_VERSION of the classifier that produced the suggestions
+    classifier_version: int
+    # Hex string of the classifier's last_auto_type_hash
+    classifier_hash: str
+    # The suggestions themselves, as handed to set_suggestions_cache
+    suggestions: dict
+
+
+# Cache keys describing the current classifier.  get_suggestion_cache reads the
+# version and hash keys to decide whether cached suggestions are still usable.
+# NOTE(review): the values are written outside this module (classifier.py and
+# conditionals.py import these names) - confirm the writers there.
+CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
+CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
+CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
+
+# Cache timeouts, expressed in seconds, passed to cache.set / cache.touch
+CACHE_1_MINUTE: Final[int] = 60
+CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
+CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
+
+
+def get_suggestion_cache_key(document_id: int) -> str:
+    """
+    Builds the cache key under which the suggestions of the given
+    document ID are stored
+    """
+    suggestion_key = f"doc_{document_id}_suggest"
+    return suggestion_key
+
+
+def get_suggestion_cache(document_id: int) -> SuggestionCacheData | None:
+    """
+    Looks up the cached suggestions for the given document ID.
+
+    The entry is returned only when the classifier version in the cache equals
+    both DocumentClassifier.FORMAT_VERSION and the version stored with the
+    suggestions, and the classifier hash in the cache equals the hash stored
+    with the suggestions.  An entry which fails these checks is deleted.
+    Returns None when there is no usable entry.
+    """
+    # Imported here rather than at module level; at module level the name is
+    # only imported under TYPE_CHECKING
+    from paperless.classifier import DocumentClassifier
+
+    key = get_suggestion_cache_key(document_id)
+    hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, key])
+
+    # No suggestions were cached for this document
+    if key not in hits:
+        return None
+
+    cached: SuggestionCacheData = hits[key]
+
+    # The classifier format must match the running code and the cached entry
+    version_matches = (
+        CLASSIFIER_VERSION_KEY in hits
+        and hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+        and hits[CLASSIFIER_VERSION_KEY] == cached.classifier_version
+    )
+    # The hash is only compared once the version check has passed
+    usable = version_matches and (
+        CLASSIFIER_HASH_KEY in hits
+        and hits[CLASSIFIER_HASH_KEY] == cached.classifier_hash
+    )
+
+    if not usable:  # pragma: no cover
+        # Something didn't match, so the entry is dropped
+        cache.delete(key)
+        return None
+    return cached
+
+
+def set_suggestions_cache(
+    document_id: int,
+    suggestions: dict,
+    classifier: DocumentClassifier | None,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Caches the given suggestions, which were generated by the given classifier.
+
+    The entry records the classifier's FORMAT_VERSION and the hex form of its
+    last_auto_type_hash, so get_suggestion_cache can tell later on whether the
+    suggestions are still valid.  If there is no classifier, this function is
+    a no-op (there won't be suggestions then anyway).
+
+    :param document_id: ID of the document the suggestions belong to
+    :param suggestions: the suggestions to store
+    :param classifier: the classifier which produced the suggestions, or None
+    :param timeout: expiration of the cache entry, in seconds
+    """
+    if classifier is None:
+        return
+
+    # NOTE(review): assumes last_auto_type_hash is bytes here; hexlify would
+    # raise on None - confirm callers only pass a trained classifier
+    entry = SuggestionCacheData(
+        classifier.FORMAT_VERSION,
+        hexlify(classifier.last_auto_type_hash).decode(),
+        suggestions,
+    )
+    cache.set(get_suggestion_cache_key(document_id), entry, timeout)
+
+
+def refresh_suggestions_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Resets the expiration of the cached suggestions of the given document ID
+    to the given timeout
+    """
+    cache.touch(get_suggestion_cache_key(document_id), timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+    """
+    Builds the cache key under which the metadata of the given
+    document ID is stored
+    """
+    metadata_key = f"doc_{document_id}_metadata"
+    return metadata_key
+
+
+def get_metadata_cache(document_id: int) -> MetadataCacheData | None:
+    """
+    Returns the cached document metadata for the given document ID, as long as
+    the metadata was cached once and the checksums have not changed.
+
+    The original checksum must always match.  The archive checksum is only
+    compared when the document has an archive version; a document without one
+    gets a hit when its entry was cached without an archive checksum.  On a
+    hit the expiration of the entry is refreshed.  A stale entry, or an entry
+    for a document which no longer exists, is deleted and None is returned.
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
+    # Nothing was cached for this document
+    if doc_metadata is None:
+        return None
+
+    try:
+        # Only the fields needed for validation are loaded
+        doc = Document.objects.only(
+            "pk",
+            "checksum",
+            "archive_checksum",
+            "archive_filename",
+        ).get(pk=document_id)
+    except Document.DoesNotExist:  # pragma: no cover
+        # Basically impossible, but the key existed, but the Document didn't
+        cache.delete(doc_key)
+        return None
+
+    original_matches = doc_metadata.original_checksum == doc.checksum
+    if doc.has_archive_version:
+        # If it has one, the archive checksums must match
+        archive_matches = (
+            doc_metadata.archive_checksum is not None
+            and doc_metadata.archive_checksum == doc.archive_checksum
+        )
+    else:
+        # Without an archive version, the entry is only valid if it was
+        # cached without an archive checksum as well
+        archive_matches = doc_metadata.archive_checksum is None
+
+    if original_matches and archive_matches:
+        # Refresh cache
+        cache.touch(doc_key, CACHE_50_MINUTES)
+        return doc_metadata
+
+    # Something didn't match, delete the key
+    cache.delete(doc_key)
+    return None
+
+
+def set_metadata_cache(
+    document: Document,
+    original_metadata: list,
+    archive_metadata: list | None,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Sets the metadata into cache for the given Document.
+
+    The document's current checksum and archive checksum are stored alongside
+    the metadata, so get_metadata_cache can detect a changed file later on.
+
+    :param document: the Document the metadata belongs to
+    :param original_metadata: metadata of the original file
+    :param archive_metadata: metadata of the archive file, or None
+    :param timeout: expiration of the cache entry, in seconds
+    """
+    entry = MetadataCacheData(
+        document.checksum,
+        original_metadata,
+        document.archive_checksum,
+        archive_metadata,
+    )
+    cache.set(get_metadata_cache_key(document.pk), entry, timeout)
+
+
+def refresh_metadata_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Resets the expiration of the cached metadata of the given document ID
+    to the given timeout
+    """
+    cache.touch(get_metadata_cache_key(document_id), timeout)
+
+
+def get_thumbnail_modified_key(document_id: int) -> str:
+    """
+    Builds the cache key under which the timestamp of the thumbnail of the
+    given document ID is stored
+    """
+    thumbnail_key = f"doc_{document_id}_thumbnail_modified"
+    return thumbnail_key
+
+
+def clear_document_caches(document_id: int) -> None:
+    """
+    Deletes the suggestion, metadata and thumbnail timestamp cache entries
+    of the given document in a single call
+    """
+    document_keys = [
+        get_suggestion_cache_key(document_id),
+        get_metadata_cache_key(document_id),
+        get_thumbnail_modified_key(document_id),
+    ]
+    cache.delete_many(document_keys)
--- a/src/paperless/classifier.py
+++ b/src/paperless/classifier.py
@@ -17,10 +17,10 @@ if TYPE_CHECKING:
 from django.conf import settings
 from django.core.cache import cache

-from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import CLASSIFIER_MODIFIED_KEY
-from documents.caching import CLASSIFIER_VERSION_KEY
+from paperless.caching import CACHE_50_MINUTES
+from paperless.caching import CLASSIFIER_HASH_KEY
+from paperless.caching import CLASSIFIER_MODIFIED_KEY
+from paperless.caching import CLASSIFIER_VERSION_KEY
 from paperless.models import Document
 from paperless.models import MatchingModel

--- a/src/paperless/conditionals.py
+++ b/src/paperless/conditionals.py
@@ -4,12 +4,12 @@ from datetime import timezone
 from django.conf import settings
 from django.core.cache import cache

-from documents.caching import CACHE_5_MINUTES
-from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import CLASSIFIER_MODIFIED_KEY
-from documents.caching import CLASSIFIER_VERSION_KEY
-from documents.caching import get_thumbnail_modified_key
+from paperless.caching import CACHE_5_MINUTES
+from paperless.caching import CACHE_50_MINUTES
+from paperless.caching import CLASSIFIER_HASH_KEY
+from paperless.caching import CLASSIFIER_MODIFIED_KEY
+from paperless.caching import CLASSIFIER_VERSION_KEY
+from paperless.caching import get_thumbnail_modified_key
 from paperless.classifier import DocumentClassifier
 from paperless.models import Document

--- a/src/paperless/views.py
+++ b/src/paperless/views.py
@@ -89,12 +89,6 @@ from rest_framework.viewsets import ModelViewSet
 from rest_framework.viewsets import ReadOnlyModelViewSet
 from rest_framework.viewsets import ViewSet

-from documents.caching import get_metadata_cache
-from documents.caching import get_suggestion_cache
-from documents.caching import refresh_metadata_cache
-from documents.caching import refresh_suggestions_cache
-from documents.caching import set_metadata_cache
-from documents.caching import set_suggestions_cache
 from documents.filters import CorrespondentFilterSet
 from documents.filters import CustomFieldFilterSet
 from documents.filters import DocumentFilterSet
@@ -121,6 +115,12 @@ from paperless import version
 from paperless.bulk_download import ArchiveOnlyStrategy
 from paperless.bulk_download import OriginalAndArchiveStrategy
 from paperless.bulk_download import OriginalsOnlyStrategy
+from paperless.caching import get_metadata_cache
+from paperless.caching import get_suggestion_cache
+from paperless.caching import refresh_metadata_cache
+from paperless.caching import refresh_suggestions_cache
+from paperless.caching import set_metadata_cache
+from paperless.caching import set_suggestions_cache
 from paperless.celery import app as celery_app
 from paperless.classifier import load_classifier
 from paperless.conditionals import metadata_etag