Merge caching

shamoon
2025-04-08 16:36:31 -07:00
parent b92651aad2
commit 730636f38e
7 changed files with 22 additions and 22 deletions

View File

@@ -1,215 +0,0 @@
from __future__ import annotations

import logging
from binascii import hexlify
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Final

from django.core.cache import cache

from paperless.models import Document

if TYPE_CHECKING:
    from paperless.classifier import DocumentClassifier

logger = logging.getLogger("paperless.caching")


@dataclass(frozen=True)
class MetadataCacheData:
    original_checksum: str
    original_metadata: list
    archive_checksum: str | None
    archive_metadata: list | None


@dataclass(frozen=True)
class SuggestionCacheData:
    classifier_version: int
    classifier_hash: str
    suggestions: dict


CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"

CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE


def get_suggestion_cache_key(document_id: int) -> str:
    """
    Returns the basic key for a document's suggestions
    """
    return f"doc_{document_id}_suggest"


def get_suggestion_cache(document_id: int) -> SuggestionCacheData | None:
    """
    If possible, returns the cached suggestions for the given document ID.
    The classifier must match in both format version and hash, and the
    suggestions must have been cached previously.
    """
    from paperless.classifier import DocumentClassifier

    doc_key = get_suggestion_cache_key(document_id)
    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
    # The document suggestions are in the cache
    if doc_key in cache_hits:
        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
        # The classifier format is the same
        # The classifier hash is the same
        # Then the suggestions can be used
        if (
            CLASSIFIER_VERSION_KEY in cache_hits
            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
        ) and (
            CLASSIFIER_HASH_KEY in cache_hits
            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
        ):
            return doc_suggestions
        else:  # pragma: no cover
            # Remove the key because something didn't match
            cache.delete(doc_key)
    return None


def set_suggestions_cache(
    document_id: int,
    suggestions: dict,
    classifier: DocumentClassifier | None,
    *,
    timeout=CACHE_50_MINUTES,
) -> None:
    """
    Caches the given suggestions, which were generated by the given classifier.
    If there is no classifier, this function is a no-op (there won't be
    suggestions then anyway).
    """
    if classifier is not None:
        doc_key = get_suggestion_cache_key(document_id)
        cache.set(
            doc_key,
            SuggestionCacheData(
                classifier.FORMAT_VERSION,
                hexlify(classifier.last_auto_type_hash).decode(),
                suggestions,
            ),
            timeout,
        )


def refresh_suggestions_cache(
    document_id: int,
    *,
    timeout: int = CACHE_50_MINUTES,
) -> None:
    """
    Refreshes the expiration of the suggestions for the given document ID
    to the given timeout
    """
    doc_key = get_suggestion_cache_key(document_id)
    cache.touch(doc_key, timeout)


def get_metadata_cache_key(document_id: int) -> str:
    """
    Returns the basic key for a document's metadata
    """
    return f"doc_{document_id}_metadata"


def get_metadata_cache(document_id: int) -> MetadataCacheData | None:
    """
    Returns the cached document metadata for the given document ID, as long as
    the metadata was cached once and the checksums have not changed
    """
    doc_key = get_metadata_cache_key(document_id)
    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
    # The metadata exists in the cache
    if doc_metadata is not None:
        try:
            doc = Document.objects.only(
                "pk",
                "checksum",
                "archive_checksum",
                "archive_filename",
            ).get(pk=document_id)
            # The original checksums match
            # If it has one, the archive checksums match
            # Then, we can use the metadata
            if doc_metadata.original_checksum == doc.checksum and (
                not doc.has_archive_version
                or (
                    doc_metadata.archive_checksum is not None
                    and doc_metadata.archive_checksum == doc.archive_checksum
                )
            ):
                # Refresh cache
                cache.touch(doc_key, CACHE_50_MINUTES)
                return doc_metadata
            else:  # pragma: no cover
                # Something didn't match, delete the key
                cache.delete(doc_key)
        except Document.DoesNotExist:  # pragma: no cover
            # Basically impossible: the key existed, but the Document didn't
            cache.delete(doc_key)
    return None


def set_metadata_cache(
    document: Document,
    original_metadata: list,
    archive_metadata: list | None,
    *,
    timeout=CACHE_50_MINUTES,
) -> None:
    """
    Sets the metadata into cache for the given Document
    """
    doc_key = get_metadata_cache_key(document.pk)
    cache.set(
        doc_key,
        MetadataCacheData(
            document.checksum,
            original_metadata,
            document.archive_checksum,
            archive_metadata,
        ),
        timeout,
    )


def refresh_metadata_cache(
    document_id: int,
    *,
    timeout: int = CACHE_50_MINUTES,
) -> None:
    """
    Refreshes the expiration of the metadata for the given document ID
    to the given timeout
    """
    doc_key = get_metadata_cache_key(document_id)
    cache.touch(doc_key, timeout)


def get_thumbnail_modified_key(document_id: int) -> str:
    """
    Builds the key to store a thumbnail's timestamp
    """
    return f"doc_{document_id}_thumbnail_modified"


def clear_document_caches(document_id: int) -> None:
    """
    Removes all cached items for the given document
    """
    cache.delete_many(
        [
            get_suggestion_cache_key(document_id),
            get_metadata_cache_key(document_id),
            get_thumbnail_modified_key(document_id),
        ],
    )
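
The helpers above form a look-aside cache: a per-document entry plus global classifier version/hash keys that must all agree before a cached suggestion set is trusted. A minimal sketch of how a caller might tie them together follows; run_classifier is a hypothetical stand-in for whatever actually produces the suggestions, while load_classifier and the caching helpers are the imports this commit relocates.

from paperless.caching import get_suggestion_cache
from paperless.caching import refresh_suggestions_cache
from paperless.caching import set_suggestions_cache
from paperless.classifier import load_classifier


def get_suggestions(document_id: int) -> dict:
    cached = get_suggestion_cache(document_id)
    if cached is not None:
        # Valid hit: extend its lifetime instead of re-running the classifier
        refresh_suggestions_cache(document_id)
        return cached.suggestions

    classifier = load_classifier()
    suggestions = run_classifier(document_id, classifier)  # hypothetical helper
    # set_suggestions_cache is a no-op when classifier is None
    set_suggestions_cache(document_id, suggestions, classifier)
    return suggestions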

View File

@@ -23,10 +23,10 @@ from django.utils import timezone
 from filelock import FileLock
 from guardian.shortcuts import remove_perm
-from documents.caching import clear_document_caches
 from documents.mail import send_email
 from documents.templating.workflows import parse_w_workflow_placeholders
 from paperless import matching
+from paperless.caching import clear_document_caches
 from paperless.file_handling import create_source_path_directory
 from paperless.file_handling import delete_empty_directories
 from paperless.file_handling import generate_unique_filename
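
This hunk (likely a signal-handlers module, judging by its imports) only swaps the import path for clear_document_caches. For context, a minimal sketch of the kind of receiver that would call it on deletion; the receiver name and wiring here are assumed, not taken from this commit:

from django.db.models.signals import post_delete
from django.dispatch import receiver

from paperless.caching import clear_document_caches
from paperless.models import Document


@receiver(post_delete, sender=Document)
def on_document_deleted(sender, instance: Document, **kwargs):
    # Drop the stale suggestion, metadata and thumbnail-modified entries
    clear_document_caches(instance.pk)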

View File

@@ -20,7 +20,6 @@ from filelock import FileLock
 from whoosh.writing import AsyncWriter
 from documents import sanity_checker
-from documents.caching import clear_document_caches
 from documents.plugins.base import ConsumeTaskPlugin
 from documents.plugins.base import ProgressManager
 from documents.plugins.base import StopConsumeTaskError
@@ -31,6 +30,7 @@ from documents.signals.handlers import cleanup_document_deletion
 from documents.signals.handlers import run_workflows
 from paperless import index
 from paperless.barcodes import BarcodePlugin
+from paperless.caching import clear_document_caches
 from paperless.classifier import DocumentClassifier
 from paperless.classifier import load_classifier
 from paperless.consumer import ConsumerPlugin
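
The tasks module likewise only changes its import. A plausible shape for why it needs clear_document_caches: after a task re-parses a document and regenerates its archive file, the cached metadata and suggestions are keyed to the old checksums and must be dropped. The function name and body below are illustrative, not the literal task:

from paperless.caching import clear_document_caches


def update_document(document_id: int) -> None:
    # ... re-parse content, regenerate the archive file, save the Document ...
    # then drop cached entries derived from the old checksums
    clear_document_caches(document_id)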

View File

@@ -24,13 +24,13 @@ from guardian.shortcuts import assign_perm
 from rest_framework import status
 from rest_framework.test import APITestCase
-from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import CLASSIFIER_MODIFIED_KEY
-from documents.caching import CLASSIFIER_VERSION_KEY
 from documents.signals.handlers import run_workflows
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import DocumentConsumeDelayMixin
+from paperless.caching import CACHE_50_MINUTES
+from paperless.caching import CLASSIFIER_HASH_KEY
+from paperless.caching import CLASSIFIER_MODIFIED_KEY
+from paperless.caching import CLASSIFIER_VERSION_KEY
 from paperless.data_models import DocumentSource
 from paperless.models import Correspondent
 from paperless.models import CustomField
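
Since the tests now pull CLASSIFIER_HASH_KEY and the other constants from paperless.caching, a test can drive cache invalidation directly through those keys. A minimal sketch, assuming a stub classifier (FakeClassifier and the test class are invented here; the imports are the ones shown above):

from binascii import hexlify

from django.core.cache import cache
from django.test import TestCase

from paperless.caching import CLASSIFIER_HASH_KEY
from paperless.caching import CLASSIFIER_VERSION_KEY
from paperless.caching import get_suggestion_cache
from paperless.caching import set_suggestions_cache
from paperless.classifier import DocumentClassifier


class FakeClassifier:
    # Mirror the real format version so the cached entry can validate
    FORMAT_VERSION = DocumentClassifier.FORMAT_VERSION
    last_auto_type_hash = b"\x12\x34"


class TestSuggestionCaching(TestCase):
    def test_hash_change_invalidates(self):
        set_suggestions_cache(1, {"tags": [5]}, FakeClassifier())
        cache.set(CLASSIFIER_VERSION_KEY, FakeClassifier.FORMAT_VERSION)
        cache.set(CLASSIFIER_HASH_KEY, hexlify(FakeClassifier.last_auto_type_hash).decode())
        assert get_suggestion_cache(1) is not None

        # A retrained classifier publishes a new hash; the old entry is rejected
        cache.set(CLASSIFIER_HASH_KEY, "somethingelse")
        assert get_suggestion_cache(1) is None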