Merge caching

This commit is contained in:
shamoon
2025-04-08 16:36:31 -07:00
parent b92651aad2
commit 730636f38e
7 changed files with 22 additions and 22 deletions

215
src/paperless/caching.py Normal file
View File

@@ -0,0 +1,215 @@
from __future__ import annotations
import logging
from binascii import hexlify
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Final
from django.core.cache import cache
from paperless.models import Document
if TYPE_CHECKING:
from paperless.classifier import DocumentClassifier
logger = logging.getLogger("paperless.caching")
@dataclass(frozen=True)
class MetadataCacheData:
original_checksum: str
original_metadata: list
archive_checksum: str | None
archive_metadata: list | None
@dataclass(frozen=True)
class SuggestionCacheData:
classifier_version: int
classifier_hash: str
suggestions: dict
CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
def get_suggestion_cache_key(document_id: int) -> str:
"""
Returns the basic key for a document's suggestions
"""
return f"doc_{document_id}_suggest"
def get_suggestion_cache(document_id: int) -> SuggestionCacheData | None:
"""
If possible, return the cached suggestions for the given document ID.
The classifier needs to be matching in format and hash and the suggestions need to
have been cached once.
"""
from paperless.classifier import DocumentClassifier
doc_key = get_suggestion_cache_key(document_id)
cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
# The document suggestions are in the cache
if doc_key in cache_hits:
doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
# The classifier format is the same
# The classifier hash is the same
# Then the suggestions can be used
if (
CLASSIFIER_VERSION_KEY in cache_hits
and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
) and (
CLASSIFIER_HASH_KEY in cache_hits
and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
):
return doc_suggestions
else: # pragma: no cover
# Remove the key because something didn't match
cache.delete(doc_key)
return None
def set_suggestions_cache(
document_id: int,
suggestions: dict,
classifier: DocumentClassifier | None,
*,
timeout=CACHE_50_MINUTES,
) -> None:
"""
Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
this function is a no-op (there won't be suggestions then anyway)
"""
if classifier is not None:
doc_key = get_suggestion_cache_key(document_id)
cache.set(
doc_key,
SuggestionCacheData(
classifier.FORMAT_VERSION,
hexlify(classifier.last_auto_type_hash).decode(),
suggestions,
),
timeout,
)
def refresh_suggestions_cache(
document_id: int,
*,
timeout: int = CACHE_50_MINUTES,
) -> None:
"""
Refreshes the expiration of the suggestions for the given document ID
to the given timeout
"""
doc_key = get_suggestion_cache_key(document_id)
cache.touch(doc_key, timeout)
def get_metadata_cache_key(document_id: int) -> str:
"""
Returns the basic key for a document's metadata
"""
return f"doc_{document_id}_metadata"
def get_metadata_cache(document_id: int) -> MetadataCacheData | None:
"""
Returns the cached document metadata for the given document ID, as long as the metadata
was cached once and the checksums have not changed
"""
doc_key = get_metadata_cache_key(document_id)
doc_metadata: MetadataCacheData | None = cache.get(doc_key)
# The metadata exists in the cache
if doc_metadata is not None:
try:
doc = Document.objects.only(
"pk",
"checksum",
"archive_checksum",
"archive_filename",
).get(pk=document_id)
# The original checksums match
# If it has one, the archive checksums match
# Then, we can use the metadata
if (
doc_metadata.original_checksum == doc.checksum
and doc.has_archive_version
and doc_metadata.archive_checksum is not None
and doc_metadata.archive_checksum == doc.archive_checksum
):
# Refresh cache
cache.touch(doc_key, CACHE_50_MINUTES)
return doc_metadata
else: # pragma: no cover
# Something didn't match, delete the key
cache.delete(doc_key)
except Document.DoesNotExist: # pragma: no cover
# Basically impossible, but the key existed, but the Document didn't
cache.delete(doc_key)
return None
def set_metadata_cache(
document: Document,
original_metadata: list,
archive_metadata: list | None,
*,
timeout=CACHE_50_MINUTES,
) -> None:
"""
Sets the metadata into cache for the given Document
"""
doc_key = get_metadata_cache_key(document.pk)
cache.set(
doc_key,
MetadataCacheData(
document.checksum,
original_metadata,
document.archive_checksum,
archive_metadata,
),
timeout,
)
def refresh_metadata_cache(
document_id: int,
*,
timeout: int = CACHE_50_MINUTES,
) -> None:
"""
Refreshes the expiration of the metadata for the given document ID
to the given timeout
"""
doc_key = get_metadata_cache_key(document_id)
cache.touch(doc_key, timeout)
def get_thumbnail_modified_key(document_id: int) -> str:
"""
Builds the key to store a thumbnail's timestamp
"""
return f"doc_{document_id}_thumbnail_modified"
def clear_document_caches(document_id: int) -> None:
"""
Removes all cached items for the given document
"""
cache.delete_many(
[
get_suggestion_cache_key(document_id),
get_metadata_cache_key(document_id),
get_thumbnail_modified_key(document_id),
],
)

View File

@@ -17,10 +17,10 @@ if TYPE_CHECKING:
from django.conf import settings
from django.core.cache import cache
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from paperless.caching import CACHE_50_MINUTES
from paperless.caching import CLASSIFIER_HASH_KEY
from paperless.caching import CLASSIFIER_MODIFIED_KEY
from paperless.caching import CLASSIFIER_VERSION_KEY
from paperless.models import Document
from paperless.models import MatchingModel

View File

@@ -4,12 +4,12 @@ from datetime import timezone
from django.conf import settings
from django.core.cache import cache
from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import get_thumbnail_modified_key
from paperless.caching import CACHE_5_MINUTES
from paperless.caching import CACHE_50_MINUTES
from paperless.caching import CLASSIFIER_HASH_KEY
from paperless.caching import CLASSIFIER_MODIFIED_KEY
from paperless.caching import CLASSIFIER_VERSION_KEY
from paperless.caching import get_thumbnail_modified_key
from paperless.classifier import DocumentClassifier
from paperless.models import Document

View File

@@ -89,12 +89,6 @@ from rest_framework.viewsets import ModelViewSet
from rest_framework.viewsets import ReadOnlyModelViewSet
from rest_framework.viewsets import ViewSet
from documents.caching import get_metadata_cache
from documents.caching import get_suggestion_cache
from documents.caching import refresh_metadata_cache
from documents.caching import refresh_suggestions_cache
from documents.caching import set_metadata_cache
from documents.caching import set_suggestions_cache
from documents.filters import CorrespondentFilterSet
from documents.filters import CustomFieldFilterSet
from documents.filters import DocumentFilterSet
@@ -121,6 +115,12 @@ from paperless import version
from paperless.bulk_download import ArchiveOnlyStrategy
from paperless.bulk_download import OriginalAndArchiveStrategy
from paperless.bulk_download import OriginalsOnlyStrategy
from paperless.caching import get_metadata_cache
from paperless.caching import get_suggestion_cache
from paperless.caching import refresh_metadata_cache
from paperless.caching import refresh_suggestions_cache
from paperless.caching import set_metadata_cache
from paperless.caching import set_suggestions_cache
from paperless.celery import app as celery_app
from paperless.classifier import load_classifier
from paperless.conditionals import metadata_etag