Compare commits

..

2 Commits

Author SHA1 Message Date
shamoon
0886627aa8 Oops circular import 2026-01-21 15:31:44 -08:00
shamoon
65b47e86c3 Add LLM index update queuing and improve error handling 2026-01-21 12:57:24 -08:00
14 changed files with 54 additions and 366 deletions

View File

@@ -8,11 +8,6 @@ echo "${log_prefix} Apply database migrations..."
cd "${PAPERLESS_SRC_DIR}"
if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
echo "${log_prefix} Migration mode enabled, skipping migrations."
exit 0
fi
# The whole migrate, with flock, needs to run as the right user
if [[ -n "${USER_IS_NON_ROOT}" ]]; then
exec s6-setlock -n "${data_dir}/migration_lock" python3 manage.py migrate --skip-checks --no-input

View File

@@ -9,15 +9,7 @@ echo "${log_prefix} Running Django checks"
cd "${PAPERLESS_SRC_DIR}"
if [[ -n "${USER_IS_NON_ROOT}" ]]; then
if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
python3 manage_migration.py check
else
python3 manage.py check
fi
python3 manage.py check
else
if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
s6-setuidgid paperless python3 manage_migration.py check
else
s6-setuidgid paperless python3 manage.py check
fi
s6-setuidgid paperless python3 manage.py check
fi

View File

@@ -13,14 +13,8 @@ if [[ -n "${PAPERLESS_FORCE_SCRIPT_NAME}" ]]; then
export GRANIAN_URL_PATH_PREFIX=${PAPERLESS_FORCE_SCRIPT_NAME}
fi
if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
app_module="paperless.migration_asgi:application"
else
app_module="paperless.asgi:application"
fi
if [[ -n "${USER_IS_NON_ROOT}" ]]; then
exec granian --interface asginl --ws --loop uvloop "${app_module}"
exec granian --interface asginl --ws --loop uvloop "paperless.asgi:application"
else
exec s6-setuidgid paperless granian --interface asginl --ws --loop uvloop "${app_module}"
exec s6-setuidgid paperless granian --interface asginl --ws --loop uvloop "paperless.asgi:application"
fi

View File

@@ -1,13 +0,0 @@
#!/usr/bin/env python3
import os
import sys
def main() -> None:
    """Run a Django management command against the migration-mode settings.

    ``DJANGO_SETTINGS_MODULE`` is only defaulted, so a value already present
    in the environment takes precedence.
    """
    os.environ.setdefault(
        "DJANGO_SETTINGS_MODULE",
        "paperless_migration.settings",
    )
    # Imported after the settings module has been selected, as in the
    # standard Django manage.py layout.
    from django.core.management import execute_from_command_line

    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()

View File

@@ -1,7 +0,0 @@
import os
from django.core.asgi import get_asgi_application
# Select the migration-mode settings unless DJANGO_SETTINGS_MODULE is already
# set in the environment; this must happen before the application is built.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
# Module-level ASGI callable that the ASGI server imports.
application = get_asgi_application()

View File

@@ -1,11 +1,14 @@
import logging
import shutil
from datetime import timedelta
from pathlib import Path
import faiss
import llama_index.core.settings as llama_settings
import tqdm
from celery import states
from django.conf import settings
from django.utils import timezone
from llama_index.core import Document as LlamaDocument
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
@@ -21,6 +24,7 @@ from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from documents.models import Document
from documents.models import PaperlessTask
from paperless_ai.embedding import build_llm_index_text
from paperless_ai.embedding import get_embedding_dim
from paperless_ai.embedding import get_embedding_model
@@ -28,6 +32,29 @@ from paperless_ai.embedding import get_embedding_model
logger = logging.getLogger("paperless_ai.indexing")
def queue_llm_index_update_if_needed(*, rebuild: bool, reason: str) -> bool:
    """
    Queue an LLM index update task unless one is already in flight or recent.

    No task is queued when a ``PaperlessTask`` named ``LLMINDEX_UPDATE`` is
    either in the PENDING/STARTED state or was created within the last five
    minutes.

    Args:
        rebuild: Forwarded to the task; also adds " (rebuild)" to the log line.
        reason: Human-readable explanation that is written to the log.

    Returns:
        True if a task was queued, False if queuing was skipped.
    """
    # Imported inside the function rather than at module level — presumably
    # to avoid a circular import with documents.tasks; verify before moving.
    from documents.tasks import llmindex_index

    update_task = PaperlessTask.TaskName.LLMINDEX_UPDATE

    in_flight = PaperlessTask.objects.filter(
        task_name=update_task,
        status__in=[states.PENDING, states.STARTED],
    ).exists()

    recent_cutoff = timezone.now() - timedelta(minutes=5)
    recently_created = PaperlessTask.objects.filter(
        task_name=update_task,
        date_created__gte=recent_cutoff,
    ).exists()

    if in_flight or recently_created:
        return False

    llmindex_index.delay(rebuild=rebuild, scheduled=False, auto=True)
    rebuild_suffix = " (rebuild)" if rebuild else ""
    logger.warning("Queued LLM index update%s: %s", rebuild_suffix, reason)
    return True
def get_or_create_storage_context(*, rebuild=False):
"""
Loads or creates the StorageContext (vector store, docstore, index store).
@@ -93,6 +120,10 @@ def load_or_build_index(nodes=None):
except ValueError as e:
logger.warning("Failed to load index from storage: %s", e)
if not nodes:
queue_llm_index_update_if_needed(
rebuild=vector_store_file_exists(),
reason="LLM index missing or invalid while loading.",
)
logger.info("No nodes provided for index creation.")
raise
return VectorStoreIndex(
@@ -250,7 +281,21 @@ def query_similar_documents(
"""
Runs a similarity query and returns top-k similar Document objects.
"""
index = load_or_build_index()
if not vector_store_file_exists():
queue_llm_index_update_if_needed(
rebuild=False,
reason="LLM index not found for similarity query.",
)
return []
try:
index = load_or_build_index()
except ValueError:
queue_llm_index_update_if_needed(
rebuild=True,
reason="LLM index failed to load for similarity query.",
)
return []
# constrain only the node(s) that match the document IDs, if given
doc_node_ids = (

View File

@@ -299,11 +299,15 @@ def test_query_similar_documents(
with (
patch("paperless_ai.indexing.get_or_create_storage_context") as mock_storage,
patch("paperless_ai.indexing.load_or_build_index") as mock_load_or_build_index,
patch(
"paperless_ai.indexing.vector_store_file_exists",
) as mock_vector_store_exists,
patch("paperless_ai.indexing.VectorIndexRetriever") as mock_retriever_cls,
patch("paperless_ai.indexing.Document.objects.filter") as mock_filter,
):
mock_storage.return_value = MagicMock()
mock_storage.return_value.persist_dir = temp_llm_index_dir
mock_vector_store_exists.return_value = True
mock_index = MagicMock()
mock_load_or_build_index.return_value = mock_index

View File

@@ -1,6 +0,0 @@
from django.apps import AppConfig
class PaperlessMigrationConfig(AppConfig):
    """Django application configuration for the ``paperless_migration`` app."""

    name = "paperless_migration"
    default_auto_field = "django.db.models.BigAutoField"

View File

@@ -1,193 +0,0 @@
"""Settings for migration-mode Django instance."""
from __future__ import annotations
import os
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
# Parent of the directory that contains this settings file.
BASE_DIR = Path(__file__).resolve().parent.parent
DEBUG = False
# "*" makes Django accept any Host header value.
# NOTE(review): confirm this is acceptable for wherever migration mode is exposed.
ALLOWED_HOSTS = ["*"]
# Load environment variables from the first paperless.conf candidate that
# exists; later candidates are ignored once one has been loaded.
for path in (
    os.getenv("PAPERLESS_CONFIGURATION_PATH"),
    "../paperless.conf",
    "/etc/paperless.conf",
    "/usr/local/etc/paperless.conf",
):
    # Skip an unset/empty candidate as well as a path that is not on disk.
    if not path or not Path(path).exists():
        continue
    load_dotenv(path)
    break
def __get_path(
key: str,
default: str | Path,
) -> Path:
if key in os.environ:
return Path(os.environ[key]).resolve()
return Path(default).resolve()
# Data directory: PAPERLESS_DATA_DIR if set, otherwise "data" next to BASE_DIR.
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", BASE_DIR.parent / "data")
def _parse_db_settings() -> dict[str, dict[str, Any]]:
    """Build the Django ``DATABASES`` mapping from ``PAPERLESS_DB*`` variables.

    Without ``PAPERLESS_DBHOST`` the only entry is a SQLite ``default``
    database stored under ``DATA_DIR``. With it, the SQLite definition is kept
    under the ``sqlite`` alias and ``default`` becomes a MariaDB connection
    (``PAPERLESS_DBENGINE == "mariadb"``) or a PostgreSQL connection (any
    other value).

    Returns:
        Mapping of database alias to its Django connection settings.

    Raises:
        ValueError: If ``PAPERLESS_DB_TIMEOUT`` is set but is not an integer.
    """
    sqlite_engine = "django.db.backends.sqlite3"
    sqlite_settings: dict[str, Any] = {
        "ENGINE": sqlite_engine,
        "NAME": DATA_DIR / "db.sqlite3",
        "OPTIONS": {},
    }
    databases: dict[str, dict[str, Any]] = {"default": sqlite_settings}

    db_host = os.getenv("PAPERLESS_DBHOST")
    if db_host:
        # Keep the SQLite definition reachable under its own alias.
        databases["sqlite"] = sqlite_settings.copy()

        server_settings: dict[str, Any] = {
            "HOST": db_host,
            "NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
            "USER": os.getenv("PAPERLESS_DBUSER", "paperless"),
            "PASSWORD": os.getenv("PAPERLESS_DBPASS", "paperless"),
            "OPTIONS": {},
        }
        databases["default"] = server_settings

        db_port = os.getenv("PAPERLESS_DBPORT")
        if db_port:
            server_settings["PORT"] = db_port

        if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
            engine = "django.db.backends.mysql"
            options = {
                "read_default_file": "/etc/mysql/my.cnf",
                "charset": "utf8mb4",
                "ssl_mode": os.getenv("PAPERLESS_DBSSLMODE", "PREFERRED"),
                "ssl": {
                    "ca": os.getenv("PAPERLESS_DBSSLROOTCERT"),
                    "cert": os.getenv("PAPERLESS_DBSSLCERT"),
                    "key": os.getenv("PAPERLESS_DBSSLKEY"),
                },
            }
        else:
            # Any other engine value selects PostgreSQL.
            engine = "django.db.backends.postgresql"
            options = {
                "sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer"),
                "sslrootcert": os.getenv("PAPERLESS_DBSSLROOTCERT"),
                "sslcert": os.getenv("PAPERLESS_DBSSLCERT"),
                "sslkey": os.getenv("PAPERLESS_DBSSLKEY"),
            }

        server_settings["ENGINE"] = engine
        server_settings["OPTIONS"].update(options)

    raw_timeout = os.getenv("PAPERLESS_DB_TIMEOUT")
    if raw_timeout is not None:
        timeout = int(raw_timeout)
        default_options = databases["default"]["OPTIONS"]
        if databases["default"]["ENGINE"] == sqlite_engine:
            default_options["timeout"] = timeout
        else:
            # Server backends get connect_timeout; the SQLite alias still
            # receives the plain timeout option.
            default_options["connect_timeout"] = timeout
            databases["sqlite"]["OPTIONS"]["timeout"] = timeout

    return databases
# Database connections, built from the PAPERLESS_DB* environment variables.
DATABASES = _parse_db_settings()
# NOTE(review): the fallback value below is hard-coded in source; set
# PAPERLESS_SECRET_KEY so that the fallback is never used for a real instance.
SECRET_KEY = os.getenv(
"PAPERLESS_SECRET_KEY",
"e11fl1oa-*ytql8p)(06fbj4ukrlo+n7k&q5+$1md7i+mge=ee",
)
# Django's four stock password validators.
AUTH_PASSWORD_VALIDATORS = [
{
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
},
{
"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
},
{
"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
},
{
"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
},
]
# Internationalisation / time zone handling.
LANGUAGE_CODE = "en-us"
TIME_ZONE = "UTC"
USE_I18N = True
USE_TZ = True
CSRF_TRUSTED_ORIGINS: list[str] = []
# Auth, sessions, messages, static files, django-allauth and the migration app.
INSTALLED_APPS = [
"django.contrib.auth",
"django.contrib.contenttypes",
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
"allauth",
"allauth.account",
"allauth.socialaccount",
"allauth.mfa",
"paperless_migration",
]
MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"django.middleware.clickjacking.XFrameOptionsMiddleware",
"allauth.account.middleware.AccountMiddleware",
]
ROOT_URLCONF = "paperless_migration.urls"
# Templates are discovered inside installed apps only (APP_DIRS, empty DIRS).
TEMPLATES = [
{
"BACKEND": "django.template.backends.django.DjangoTemplates",
"DIRS": [],
"APP_DIRS": True,
"OPTIONS": {
"context_processors": [
"django.template.context_processors.request",
"django.contrib.auth.context_processors.auth",
"django.contrib.messages.context_processors.messages",
],
},
},
]
WSGI_APPLICATION = "paperless_migration.wsgi.application"
AUTHENTICATION_BACKENDS = [
"django.contrib.auth.backends.ModelBackend",
"allauth.account.auth_backends.AuthenticationBackend",
]
STATIC_URL = "/static/"
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
# Login flow: after signing in, users are sent to the migration page.
LOGIN_URL = "/accounts/login/"
LOGIN_REDIRECT_URL = "/migration/"
LOGOUT_REDIRECT_URL = "/accounts/login/?loggedout=1"
# django-allauth configuration (default adapters, social accounts disabled).
ACCOUNT_ADAPTER = "allauth.account.adapter.DefaultAccountAdapter"
ACCOUNT_AUTHENTICATED_LOGIN_REDIRECTS = False
SOCIALACCOUNT_ADAPTER = "allauth.socialaccount.adapter.DefaultSocialAccountAdapter"
SOCIALACCOUNT_ENABLED = False
# Sessions are stored in the database.
SESSION_ENGINE = "django.contrib.sessions.backends.db"
# Location of the export file; overridable via PAPERLESS_MIGRATION_EXPORT_PATH.
MIGRATION_EXPORT_PATH = os.getenv(
"PAPERLESS_MIGRATION_EXPORT_PATH",
"/data/export.json",
)
# Location of the transformed export; overridable via
# PAPERLESS_MIGRATION_TRANSFORMED_PATH.
MIGRATION_TRANSFORMED_PATH = os.getenv(
"PAPERLESS_MIGRATION_TRANSFORMED_PATH",
"/data/export.v3.json",
)

View File

@@ -1,61 +0,0 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Paperless-ngx Migration Mode</title>
</head>
<body>
<main>
<h1>Migration Mode</h1>
<p>
This instance is running in migration mode. Use this interface to run
the v2 → v3 migration.
</p>
{% if messages %}
<ul>
{% for message in messages %}
<li>{{ message }}</li>
{% endfor %}
</ul>
{% endif %}
<section>
<h2>Step 1 — Export (v2)</h2>
<p>Expected export file:</p>
<ul>
<li><strong>Path:</strong> {{ export_path }}</li>
<li><strong>Status:</strong> {{ export_exists|yesno:"Found,Missing" }}</li>
</ul>
<form method="post">
{% csrf_token %}
<button type="submit" name="action" value="check">
Re-check export
</button>
</form>
</section>
<section>
<h2>Step 2 — Transform</h2>
<p>Expected transformed file:</p>
<ul>
<li><strong>Path:</strong> {{ transformed_path }}</li>
<li><strong>Status:</strong> {{ transformed_exists|yesno:"Found,Missing" }}</li>
</ul>
<form method="post">
{% csrf_token %}
<button type="submit" name="action" value="transform">
Transform export
</button>
</form>
</section>
<section>
<h2>Step 3 — Import (v3)</h2>
<form method="post">
{% csrf_token %}
<button type="submit" name="action" value="import">
Import transformed data
</button>
</form>
</section>
</main>
</body>
</html>

View File

@@ -1,9 +0,0 @@
from django.urls import include
from django.urls import path
from paperless_migration import views
# URL routes served by the migration-mode instance.
urlpatterns = [
# Account views provided by django-allauth.
path("accounts/", include("allauth.urls")),
# Migration landing page.
path("migration/", views.migration_home, name="migration_home"),
]

View File

@@ -1,46 +0,0 @@
from pathlib import Path
from django.contrib import messages
from django.contrib.auth.decorators import login_required
from django.http import HttpResponseForbidden
from django.shortcuts import redirect
from django.shortcuts import render
from django.views.decorators.http import require_http_methods
from paperless_migration import settings
@login_required
@require_http_methods(["GET", "POST"])
def migration_home(request):
    """Render the migration landing page and acknowledge its form actions.

    Only superusers may use the view; everyone else gets a 403 response.
    A POST records a message for the submitted ``action`` and redirects back
    to this view. A GET renders the page with the configured export and
    transformed file paths and whether each exists on disk.
    """
    if not request.user.is_superuser:
        return HttpResponseForbidden("Superuser access required")

    export_path = Path(settings.MIGRATION_EXPORT_PATH)
    transformed_path = Path(settings.MIGRATION_TRANSFORMED_PATH)

    if request.method != "POST":
        return render(
            request,
            "paperless_migration/migration_home.html",
            {
                "export_path": export_path,
                "export_exists": export_path.exists(),
                "transformed_path": transformed_path,
                "transformed_exists": transformed_path.exists(),
            },
        )

    # Map each known action to the message level and text it reports.
    responses = {
        "check": (messages.success, "Checked export paths."),
        "transform": (messages.info, "Transform step is not implemented yet."),
        "import": (messages.info, "Import step is not implemented yet."),
    }
    action = request.POST.get("action")
    notify, text = responses.get(action, (messages.error, "Unknown action."))
    notify(request, text)
    # Redirect after POST so a browser refresh does not resubmit the form.
    return redirect("migration_home")

View File

@@ -1,7 +0,0 @@
import os
from django.core.wsgi import get_wsgi_application
# Select the migration-mode settings unless DJANGO_SETTINGS_MODULE is already
# set in the environment; this must happen before the application is built.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
# Module-level WSGI callable that the WSGI server imports.
application = get_wsgi_application()