Compare commits

...

7 Commits

Author   SHA1        Message                                           Date
shamoon  7ea4893e42  Try joblib                                        2025-08-31 14:18:31 -07:00
shamoon  78255d0a99  Add some memory logging to suggestions (revert)   2025-08-31 14:07:48 -07:00
shamoon  fc4cb08bda  Cache classifier loading with lru_cache           2025-08-30 20:06:18 -07:00
shamoon  875dc6602b  Dont fetch doc content on suggestions             2025-08-30 20:06:18 -07:00
shamoon  8084ece274  Just build docker image in ci (revert me too)     2025-08-30 20:06:17 -07:00
shamoon  70b24c056b  Add process to middleware (revert me too)         2025-08-30 20:06:17 -07:00
shamoon  26d2d63c26  MemLogMiddleware for testing (revert me)          2025-08-30 19:03:27 -07:00
7 changed files with 688 additions and 537 deletions

.github/workflows/ci.yml (vendored, 1008 changed lines)

File diff suppressed because it is too large.

pyproject.toml

@@ -53,6 +53,7 @@ dependencies = [
   "ocrmypdf~=16.10.0",
   "pathvalidate~=3.3.1",
   "pdf2image~=1.17.0",
+  "psutil>=7",
   "psycopg-pool",
   "python-dateutil~=2.9.0",
   "python-dotenv~=1.1.0",

src/documents/classifier.py

@@ -4,6 +4,7 @@ import logging
 import pickle
 import re
 import warnings
+from functools import lru_cache
 from hashlib import sha256
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -50,6 +51,7 @@ class ClassifierModelCorruptError(Exception):
     pass


+@lru_cache(maxsize=1)
 def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
     if not settings.MODEL_FILE.is_file():
         logger.debug(
@@ -61,6 +63,11 @@ def load_classifier(*, raise_exception: bool = False) -> DocumentClassifier | None:
     classifier = DocumentClassifier()
     try:
         classifier.load()
+        logger.debug("classifier_id=%s", id(classifier))
+        logger.debug(
+            "classifier_data_vectorizer_hash=%s",
+            classifier.data_vectorizer_hash,
+        )
     except IncompatibleClassifierVersionError as e:
         logger.info(f"Classifier version incompatible: {e.message}, will re-train")
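
Note (not part of the diff): the @lru_cache(maxsize=1) added above memoizes load_classifier() per process, so repeated suggestion requests reuse one deserialized classifier instead of re-reading the model file. A minimal sketch of the pattern, with illustrative names:

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_model():
        # Expensive one-time work runs only on the first call per process;
        # later calls return the exact same cached object.
        print("loading model...")
        return {"weights": [1, 2, 3]}

    a = get_model()   # prints "loading model..."
    b = get_model()   # cache hit, no print
    assert a is b     # same object, not a copy

    # The cache never notices changes to the model file on disk; it must be
    # invalidated explicitly, e.g. get_model.cache_clear(), after a re-train.

One caveat: lru_cache keys on arguments, so load_classifier() and load_classifier(raise_exception=True) occupy different cache slots, and with maxsize=1 alternating call styles would evict each other.
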
@@ -96,7 +103,8 @@ class DocumentClassifier:
# v7 - Updated scikit-learn package version # v7 - Updated scikit-learn package version
# v8 - Added storage path classifier # v8 - Added storage path classifier
# v9 - Changed from hashing to time/ids for re-train check # v9 - Changed from hashing to time/ids for re-train check
FORMAT_VERSION = 9 # v10 - Switch persistence to joblib with memory-mapping to reduce load-time memory spikes
FORMAT_VERSION = 10
def __init__(self) -> None: def __init__(self) -> None:
# last time a document changed and therefore training might be required # last time a document changed and therefore training might be required
@@ -128,32 +136,51 @@ class DocumentClassifier:
         ).hexdigest()

     def load(self) -> None:
+        import joblib
         from sklearn.exceptions import InconsistentVersionWarning

         # Catch warnings for processing
         with warnings.catch_warnings(record=True) as w:
-            with Path(settings.MODEL_FILE).open("rb") as f:
-                schema_version = pickle.load(f)
+            try:
+                state = joblib.load(settings.MODEL_FILE, mmap_mode="r")
+            except Exception as err:
+                # As a fallback, try to detect old pickle-based and mark incompatible
+                try:
+                    with Path(settings.MODEL_FILE).open("rb") as f:
+                        _ = pickle.load(f)
+                    raise IncompatibleClassifierVersionError(
+                        "Cannot load classifier, incompatible versions.",
+                    ) from err
+                except IncompatibleClassifierVersionError:
+                    raise
+                except Exception:
+                    # Not even a readable pickle header
+                    raise ClassifierModelCorruptError from err

-                if schema_version != self.FORMAT_VERSION:
+            try:
+                if (
+                    not isinstance(state, dict)
+                    or state.get("format_version") != self.FORMAT_VERSION
+                ):
                     raise IncompatibleClassifierVersionError(
                         "Cannot load classifier, incompatible versions.",
                     )
-                else:
-                    try:
-                        self.last_doc_change_time = pickle.load(f)
-                        self.last_auto_type_hash = pickle.load(f)
-                        self.data_vectorizer = pickle.load(f)
-                        self._update_data_vectorizer_hash()
-                        self.tags_binarizer = pickle.load(f)
-                        self.tags_classifier = pickle.load(f)
-                        self.correspondent_classifier = pickle.load(f)
-                        self.document_type_classifier = pickle.load(f)
-                        self.storage_path_classifier = pickle.load(f)
-                    except Exception as err:
-                        raise ClassifierModelCorruptError from err
+
+                self.last_doc_change_time = state.get("last_doc_change_time")
+                self.last_auto_type_hash = state.get("last_auto_type_hash")
+
+                self.data_vectorizer = state.get("data_vectorizer")
+                self._update_data_vectorizer_hash()
+                self.tags_binarizer = state.get("tags_binarizer")
+
+                self.tags_classifier = state.get("tags_classifier")
+                self.correspondent_classifier = state.get("correspondent_classifier")
+                self.document_type_classifier = state.get("document_type_classifier")
+                self.storage_path_classifier = state.get("storage_path_classifier")
+            except IncompatibleClassifierVersionError:
+                raise
+            except Exception as err:
+                raise ClassifierModelCorruptError from err

             # Check for the warning about unpickling from differing versions
             # and consider it incompatible
@@ -171,23 +198,24 @@ class DocumentClassifier:
             raise IncompatibleClassifierVersionError("sklearn version update")

     def save(self) -> None:
+        import joblib
+
         target_file: Path = settings.MODEL_FILE
-        target_file_temp: Path = target_file.with_suffix(".pickle.part")
+        target_file_temp: Path = target_file.with_suffix(".joblib.part")

-        with target_file_temp.open("wb") as f:
-            pickle.dump(self.FORMAT_VERSION, f)
-            pickle.dump(self.last_doc_change_time, f)
-            pickle.dump(self.last_auto_type_hash, f)
-            pickle.dump(self.data_vectorizer, f)
-            pickle.dump(self.tags_binarizer, f)
-            pickle.dump(self.tags_classifier, f)
-            pickle.dump(self.correspondent_classifier, f)
-            pickle.dump(self.document_type_classifier, f)
-            pickle.dump(self.storage_path_classifier, f)
+        state = {
+            "format_version": self.FORMAT_VERSION,
+            "last_doc_change_time": self.last_doc_change_time,
+            "last_auto_type_hash": self.last_auto_type_hash,
+            "data_vectorizer": self.data_vectorizer,
+            "tags_binarizer": self.tags_binarizer,
+            "tags_classifier": self.tags_classifier,
+            "correspondent_classifier": self.correspondent_classifier,
+            "document_type_classifier": self.document_type_classifier,
+            "storage_path_classifier": self.storage_path_classifier,
+        }
+
+        joblib.dump(state, target_file_temp, compress=3)

         target_file_temp.rename(target_file)
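
Note (not part of the diff): the rewrite above swaps incremental pickle calls for a single joblib state dict. A standalone sketch of that persistence pattern, with illustrative file name and data:

    import joblib
    import numpy as np

    state = {
        "format_version": 10,
        "weights": np.zeros((1000, 1000)),  # large arrays dominate the file
    }

    # compress=0 stores arrays raw so they can be memory-mapped on load.
    joblib.dump(state, "model.joblib", compress=0)

    # mmap_mode="r" maps raw arrays read-only from disk instead of copying
    # them onto the heap, avoiding a load-time memory spike.
    loaded = joblib.load("model.joblib", mmap_mode="r")
    assert isinstance(loaded["weights"], np.memmap)

Worth flagging: per joblib's documented behavior, memory-mapping is skipped for compressed files, so saving with compress=3 as save() does here would leave the mmap_mode="r" in load() without effect until the dump is made uncompressed.
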

src/documents/views.py

@@ -3,7 +3,9 @@ import logging
 import os
 import platform
 import re
+import resource
 import tempfile
+import time
 import zipfile
 from datetime import datetime
 from pathlib import Path
@@ -190,6 +192,33 @@ if settings.AUDIT_LOG_ENABLED:
 logger = logging.getLogger("paperless.api")

+try:
+    import psutil
+
+    _PS = psutil.Process(os.getpid())
+except Exception:
+    _PS = None
+
+_diag_log = logging.getLogger("paperless")
+
+
+def _mem_mb():
+    rss = _PS.memory_info().rss if _PS else 0
+    peak_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+    return rss / (1024 * 1024), peak_kb / 1024.0
+
+
+def _mark(phase, doc_id, t0):
+    rss, peak = _mem_mb()
+    _diag_log.debug(
+        "sugg doc=%s phase=%s rss=%.1fMB peak=%.1fMB t=%.1fms",
+        doc_id,
+        phase,
+        rss,
+        peak,
+        (time.perf_counter() - t0) * 1000,
+    )
+
+
 class IndexView(TemplateView):
     template_name = "index.html"
@@ -758,7 +787,16 @@ class DocumentViewSet(
         ),
     )
     def suggestions(self, request, pk=None):
-        doc = get_object_or_404(Document.objects.select_related("owner"), pk=pk)
+        t0 = time.perf_counter()
+        # Don't fetch content here
+        doc = get_object_or_404(
+            Document.objects.select_related("owner").only(
+                "id",
+                "owner_id",
+            ),
+            pk=pk,
+        )
+        _mark("start", doc.pk, t0)
         if request.user is not None and not has_perms_owner_aware(
             request.user,
             "view_document",
@@ -769,18 +807,23 @@ class DocumentViewSet(
         document_suggestions = get_suggestion_cache(doc.pk)

         if document_suggestions is not None:
+            _mark("cache_hit_return", doc.pk, t0)
             refresh_suggestions_cache(doc.pk)
             return Response(document_suggestions.suggestions)

         classifier = load_classifier()
+        _mark("loaded_classifier", doc.pk, t0)

         dates = []
         if settings.NUMBER_OF_SUGGESTED_DATES > 0:
             gen = parse_date_generator(doc.filename, doc.content)
+            _mark("before_dates", doc.pk, t0)
             dates = sorted(
                 {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
             )
+            _mark("after_dates", doc.pk, t0)

+        _mark("before_match", doc.pk, t0)
         resp_data = {
             "correspondents": [
                 c.id for c in match_correspondents(doc, classifier, request.user)
@@ -794,9 +837,11 @@ class DocumentViewSet(
             ],
             "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
         }
+        _mark("assembled_resp", doc.pk, t0)

         # Cache the suggestions and the classifier hash for later
         set_suggestions_cache(doc.pk, resp_data, classifier)
+        _mark("cached", doc.pk, t0)

         return Response(resp_data)
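
Note (not part of the diff): the .only("id", "owner_id") call above defers every other column, including the potentially large OCR content, from the initial SELECT. A sketch of the Django behavior it relies on, using a hypothetical minimal model rather than paperless-ngx code:

    from django.db import models

    class Document(models.Model):
        filename = models.CharField(max_length=255)
        content = models.TextField()  # potentially megabytes of text

    # .only() limits the initial SELECT to the named columns (the primary
    # key is always included), so content never crosses the wire here.
    doc = Document.objects.only("id").get(pk=1)

    # Accessing a deferred field is transparent but issues one extra query,
    # roughly SELECT content FROM ... WHERE id = 1, on first access.
    text = doc.content

That second query is why the dates branch, which still reads doc.filename and doc.content, re-fetches those fields on demand despite the .only().
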

src/paperless/middleware.py

@@ -1,7 +1,14 @@
+import logging
+import os
+import resource
+import time
+
 from django.conf import settings

 from paperless import version

+logger = logging.getLogger("middleware")
+

 class ApiVersionMiddleware:
     def __init__(self, get_response):
@@ -15,3 +22,56 @@ class ApiVersionMiddleware:
             response["X-Version"] = version.__full_version_str__

         return response
+
+
+try:
+    import psutil
+
+    _PSUTIL = True
+except Exception:
+    _PSUTIL = False
+
+
+class MemLogMiddleware:
+    def __init__(self, get_response):
+        self.get_response = get_response
+
+    def __call__(self, request):
+        # capture baseline
+        ru_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        if _PSUTIL:
+            p = psutil.Process()
+            rss_before = p.memory_info().rss
+        else:
+            rss_before = 0
+        t0 = time.perf_counter()
+        try:
+            return self.get_response(request)
+        finally:
+            dur_ms = (time.perf_counter() - t0) * 1000.0
+            ru_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+            # ru_maxrss is KB on Linux; convert to MB
+            peak_mb = ru_after / 1024.0
+            peak_delta_mb = (ru_after - ru_before) / 1024.0
+            if _PSUTIL:
+                rss_after = p.memory_info().rss
+                delta_mb = (rss_after - rss_before) / (1024 * 1024)
+                rss_mb = rss_after / (1024 * 1024)
+            else:
+                delta_mb = 0.0
+                rss_mb = 0.0
+            logger.debug(
+                "pid=%s mem rss=%.1fMB Δend=%.1fMB peak=%.1fMB Δpeak=%.1fMB dur=%.1fms %s %s",
+                os.getpid(),
+                rss_mb,
+                delta_mb,
+                peak_mb,
+                peak_delta_mb,
+                dur_ms,
+                request.method,
+                request.path,
+            )
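
Note on the ru_maxrss arithmetic above (not part of the diff): getrusage(2) reports ru_maxrss in kilobytes on Linux but in bytes on macOS, so a fixed /1024.0 overstates the peak on Darwin. A platform-aware sketch, with a helper name of my own choosing:

    import resource
    import sys

    def peak_rss_mb() -> float:
        """Peak resident set size of this process, in MiB."""
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss units: kilobytes on Linux, bytes on macOS.
        if sys.platform == "darwin":
            return peak / (1024 * 1024)
        return peak / 1024

    print(f"peak rss: {peak_rss_mb():.1f} MiB")
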

src/paperless/settings.py

@@ -363,6 +363,7 @@ if DEBUG:
     )

 MIDDLEWARE = [
+    "paperless.middleware.MemLogMiddleware",
     "django.middleware.security.SecurityMiddleware",
     "whitenoise.middleware.WhiteNoiseMiddleware",
     "django.contrib.sessions.middleware.SessionMiddleware",
@@ -833,7 +834,7 @@ LOGGING = {
     "disable_existing_loggers": False,
     "formatters": {
         "verbose": {
-            "format": "[{asctime}] [{levelname}] [{name}] {message}",
+            "format": "[{asctime}] [{levelname}] [{name}] pid={process} {message}",
             "style": "{",
         },
         "simple": {
@@ -878,6 +879,7 @@ LOGGING = {
         "kombu": {"handlers": ["file_celery"], "level": "DEBUG"},
         "_granian": {"handlers": ["file_paperless"], "level": "DEBUG"},
         "granian.access": {"handlers": ["file_paperless"], "level": "DEBUG"},
+        "middleware": {"handlers": ["console"], "level": "DEBUG"},
     },
 }
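
Note (not part of the diff): the two LOGGING tweaks above can be exercised together outside Django with a small dictConfig sketch; the "middleware" logger name matches logging.getLogger("middleware") in the new middleware module:

    import logging
    import logging.config

    logging.config.dictConfig({
        "version": 1,
        "formatters": {
            # {process} stamps each record with the worker PID, which
            # disambiguates output from multiple server workers.
            "verbose": {
                "format": "[{asctime}] [{levelname}] [{name}] pid={process} {message}",
                "style": "{",
            },
        },
        "handlers": {
            "console": {"class": "logging.StreamHandler", "formatter": "verbose"},
        },
        "loggers": {
            "middleware": {"handlers": ["console"], "level": "DEBUG"},
        },
    })

    logging.getLogger("middleware").debug("memory stats would appear here")
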

uv.lock (generated, 15 changed lines)

@@ -2046,6 +2046,7 @@ dependencies = [
     { name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "psutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2182,6 +2183,7 @@ requires-dist = [
     { name = "ocrmypdf", specifier = "~=16.10.0" },
     { name = "pathvalidate", specifier = "~=3.3.1" },
     { name = "pdf2image", specifier = "~=1.17.0" },
+    { name = "psutil", specifier = ">=7.0.0" },
     { name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" },
     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" },
     { name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" },
@@ -2548,6 +2550,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816, upload-time = "2025-01-20T15:55:29.98Z" },
 ]

+[[package]]
+name = "psutil"
+version = "7.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2a/80/336820c1ad9286a4ded7e845b2eccfcb27851ab8ac6abece774a6ff4d3de/psutil-7.0.0.tar.gz", hash = "sha256:7be9c3eba38beccb6495ea33afd982a44074b78f28c434a1f51cc07fd315c456", size = 497003, upload-time = "2025-02-13T21:54:07.946Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ed/e6/2d26234410f8b8abdbf891c9da62bee396583f713fb9f3325a4760875d22/psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25", size = 238051, upload-time = "2025-02-13T21:54:12.36Z" },
+    { url = "https://files.pythonhosted.org/packages/04/8b/30f930733afe425e3cbfc0e1468a30a18942350c1a8816acfade80c005c4/psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da", size = 239535, upload-time = "2025-02-13T21:54:16.07Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/ed/d362e84620dd22876b55389248e522338ed1bf134a5edd3b8231d7207f6d/psutil-7.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fcee592b4c6f146991ca55919ea3d1f8926497a713ed7faaf8225e174581e91", size = 275004, upload-time = "2025-02-13T21:54:18.662Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/b9/b0eb3f3cbcb734d930fdf839431606844a825b23eaf9a6ab371edac8162c/psutil-7.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b1388a4f6875d7e2aff5c4ca1cc16c545ed41dd8bb596cefea80111db353a34", size = 277986, upload-time = "2025-02-13T21:54:21.811Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/a2/709e0fe2f093556c17fbafda93ac032257242cabcc7ff3369e2cb76a97aa/psutil-7.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5f098451abc2828f7dc6b58d44b532b22f2088f4999a937557b603ce72b1993", size = 279544, upload-time = "2025-02-13T21:54:24.68Z" },
+]
+
 [[package]]
 name = "psycopg"
 version = "3.2.9"