Initial version hacked up by Opus

Trenton H
2026-01-29 10:06:02 -08:00
parent b44eea6508
commit 1c99e55069
13 changed files with 1128 additions and 479 deletions


@@ -0,0 +1,28 @@
"""ASGI application for migration mode with WebSocket support."""
from __future__ import annotations
import os
from channels.auth import AuthMiddlewareStack
from channels.routing import ProtocolTypeRouter
from channels.routing import URLRouter
from channels.security.websocket import AllowedHostsOriginValidator
from django.core.asgi import get_asgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
# Initialize Django ASGI application early to ensure settings are loaded
django_asgi_app = get_asgi_application()
# Import routing after Django is initialized
from paperless_migration.routing import websocket_urlpatterns # noqa: E402
application = ProtocolTypeRouter(
{
"http": django_asgi_app,
"websocket": AllowedHostsOriginValidator(
AuthMiddlewareStack(URLRouter(websocket_urlpatterns)),
),
},
)
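
A quick sanity check that the router above wires up both protocols (a sketch; run it from the project root so the settings module resolves):

    import os
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
    from channels.routing import ProtocolTypeRouter
    from paperless_migration.asgi import application
    assert isinstance(application, ProtocolTypeRouter)
    print("http and websocket handlers registered")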


@@ -0,0 +1,245 @@
"""WebSocket consumers for migration operations."""
from __future__ import annotations
import json
import logging
import os
import shutil
import tempfile
from pathlib import Path
from typing import Any
from channels.generic.websocket import AsyncWebsocketConsumer
from django.conf import settings
from paperless_migration.services.importer import ImportService
from paperless_migration.services.transform import TransformService
logger = logging.getLogger(__name__)
class MigrationConsumerBase(AsyncWebsocketConsumer):
"""Base consumer with common authentication and messaging logic."""
async def connect(self) -> None:
"""Authenticate and accept or reject the connection."""
user = self.scope.get("user")
session = self.scope.get("session", {})
if not user or not user.is_authenticated:
logger.warning("WebSocket connection rejected: not authenticated")
await self.close(code=4001)
return
if not user.is_superuser:
logger.warning("WebSocket connection rejected: not superuser")
await self.close(code=4003)
return
if not session.get("migration_code_ok"):
logger.warning("WebSocket connection rejected: migration code not verified")
await self.close(code=4002)
return
await self.accept()
logger.info("WebSocket connection accepted for user: %s", user.username)
async def disconnect(self, close_code: int) -> None:
"""Handle disconnection."""
logger.debug("WebSocket disconnected with code: %d", close_code)
async def receive(self, text_data: str | None = None, **kwargs: Any) -> None:
"""Handle incoming messages - triggers the operation."""
if text_data is None:
return
try:
data = json.loads(text_data)
except json.JSONDecodeError:
await self.send_error("Invalid JSON message")
return
action = data.get("action")
if action == "start":
await self.run_operation()
else:
await self.send_error(f"Unknown action: {action}")
async def run_operation(self) -> None:
"""Override in subclasses to run the specific operation."""
raise NotImplementedError
async def send_message(self, msg_type: str, **kwargs: Any) -> None:
"""Send a typed JSON message to the client."""
await self.send(text_data=json.dumps({"type": msg_type, **kwargs}))
async def send_log(self, message: str, level: str = "info") -> None:
"""Send a log message."""
await self.send_message("log", message=message, level=level)
async def send_progress(
self,
current: int,
total: int | None = None,
label: str = "",
) -> None:
"""Send a progress update."""
await self.send_message(
"progress",
current=current,
total=total,
label=label,
)
async def send_stats(self, stats: dict[str, Any]) -> None:
"""Send statistics update."""
await self.send_message("stats", **stats)
async def send_complete(
self,
duration: float,
*,
success: bool,
**kwargs: Any,
) -> None:
"""Send completion message."""
await self.send_message(
"complete",
success=success,
duration=duration,
**kwargs,
)
async def send_error(self, message: str) -> None:
"""Send an error message."""
await self.send_message("error", message=message)
class TransformConsumer(MigrationConsumerBase):
"""WebSocket consumer for transform operations."""
async def run_operation(self) -> None:
"""Run the transform operation."""
input_path = Path(settings.MIGRATION_EXPORT_PATH)
output_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
frequency = settings.MIGRATION_PROGRESS_FREQUENCY
if not input_path.exists():
await self.send_error(f"Export file not found: {input_path}")
return
if output_path.exists():
await self.send_error(
f"Output file already exists: {output_path}. "
"Delete it first to re-run transform.",
)
return
await self.send_log("Starting transform operation...")
service = TransformService(
input_path=input_path,
output_path=output_path,
update_frequency=frequency,
)
try:
async for update in service.run_async():
match update["type"]:
case "progress":
await self.send_progress(
current=update["completed"],
label=f"{update['completed']:,} rows processed",
)
if update.get("stats"):
await self.send_stats({"transformed": update["stats"]})
case "complete":
await self.send_complete(
success=True,
duration=update["duration"],
total_processed=update["total_processed"],
stats=update["stats"],
speed=update["speed"],
)
case "error":
await self.send_error(update["message"])
case "log":
await self.send_log(
update["message"],
update.get("level", "info"),
)
except Exception as exc:
logger.exception("Transform operation failed")
await self.send_error(f"Transform failed: {exc}")
class ImportConsumer(MigrationConsumerBase):
"""WebSocket consumer for import operations."""
async def run_operation(self) -> None:
"""Run the import operation (wipe, migrate, import)."""
export_path = Path(settings.MIGRATION_EXPORT_PATH)
transformed_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
imported_marker = Path(settings.MIGRATION_IMPORTED_PATH)
source_dir = export_path.parent
if not export_path.exists():
await self.send_error("Export file not found. Upload or re-check export.")
return
if not transformed_path.exists():
await self.send_error("Transformed file not found. Run transform first.")
return
await self.send_log("Preparing import operation...")
# Backup original manifest and swap in transformed version
backup_path: Path | None = None
try:
backup_fd, backup_name = tempfile.mkstemp(
prefix="manifest.v2.",
suffix=".json",
dir=source_dir,
)
os.close(backup_fd)
backup_path = Path(backup_name)
shutil.copy2(export_path, backup_path)
shutil.copy2(transformed_path, export_path)
await self.send_log("Manifest files prepared")
except Exception as exc:
if backup_path and backup_path.exists():
backup_path.unlink(missing_ok=True)
await self.send_error(f"Failed to prepare import manifest: {exc}")
return
service = ImportService(
source_dir=source_dir,
imported_marker=imported_marker,
)
try:
async for update in service.run_async():
match update["type"]:
case "phase":
await self.send_log(f"Phase: {update['phase']}", level="info")
case "log":
await self.send_log(
update["message"],
update.get("level", "info"),
)
case "complete":
await self.send_complete(
success=update["success"],
duration=update["duration"],
)
case "error":
await self.send_error(update["message"])
except Exception as exc:
logger.exception("Import operation failed")
await self.send_error(f"Import failed: {exc}")
finally:
# Restore original manifest
if backup_path and backup_path.exists():
try:
shutil.move(str(backup_path), str(export_path))
except Exception as exc:
logger.warning("Failed to restore backup manifest: %s", exc)


@@ -0,0 +1,13 @@
"""WebSocket URL routing for migration operations."""
from __future__ import annotations
from django.urls import path
from paperless_migration.consumers import ImportConsumer
from paperless_migration.consumers import TransformConsumer
websocket_urlpatterns = [
path("ws/migration/transform/", TransformConsumer.as_asgi()),
path("ws/migration/import/", ImportConsumer.as_asgi()),
]


@@ -1,181 +0,0 @@
# /// script
# dependencies = [
# "rich",
# "ijson",
# "typer-slim",
# "websockets",
# ]
# ///
import json
import time
from collections import Counter
from collections.abc import Callable
from pathlib import Path
from typing import Any
from typing import TypedDict
import ijson
import typer
from rich.console import Console
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import SpinnerColumn
from rich.progress import TextColumn
from rich.progress import TimeElapsedColumn
from rich.table import Table
from websockets.sync.client import ClientConnection
from websockets.sync.client import connect
app = typer.Typer(add_completion=False)
console = Console()
class FixtureObject(TypedDict):
model: str
pk: int
fields: dict[str, Any]
TransformFn = Callable[[FixtureObject], FixtureObject]
def transform_documents_document(obj: FixtureObject) -> FixtureObject:
fields: dict[str, Any] = obj["fields"]
fields.pop("storage_type", None)
content: Any = fields.get("content")
fields["content_length"] = len(content) if isinstance(content, str) else 0
return obj
TRANSFORMS: dict[str, TransformFn] = {
"documents.document": transform_documents_document,
}
def validate_output(value: Path) -> Path:
if value.exists():
raise typer.BadParameter(f"Output file '{value}' already exists.")
return value
@app.command()
def migrate(
input_path: Path = typer.Option(
...,
"--input",
"-i",
exists=True,
file_okay=True,
dir_okay=False,
readable=True,
),
output_path: Path = typer.Option(
...,
"--output",
"-o",
callback=validate_output,
),
ws_url: str | None = typer.Option(None, "--ws"),
update_frequency: int = typer.Option(100, "--freq"),
) -> None:
"""
Process JSON fixtures with detailed summary and timing.
"""
if input_path.resolve() == output_path.resolve():
console.print(
"[bold red]Error:[/bold red] Input and output paths cannot be the same file.",
)
raise typer.Exit(code=1)
stats: Counter[str] = Counter()
total_processed: int = 0
start_time: float = time.perf_counter()
ws: ClientConnection | None = None
if ws_url:
try:
ws = connect(ws_url)
except Exception as e:
console.print(
f"[yellow]Warning: Could not connect to WebSocket: {e}[/yellow]",
)
progress = Progress(
SpinnerColumn(),
TextColumn("[bold blue]{task.description}"),
BarColumn(),
TextColumn("{task.completed:,} rows"),
TimeElapsedColumn(),
console=console,
)
try:
with (
progress,
input_path.open("rb") as infile,
output_path.open("w", encoding="utf-8") as outfile,
):
task = progress.add_task("Processing fixture", start=True)
outfile.write("[\n")
first: bool = True
for i, obj in enumerate(ijson.items(infile, "item")):
fixture: FixtureObject = obj
model: str = fixture["model"]
total_processed += 1
transform: TransformFn | None = TRANSFORMS.get(model)
if transform:
fixture = transform(fixture)
stats[model] += 1
if not first:
outfile.write(",\n")
first = False
json.dump(fixture, outfile, ensure_ascii=False)
progress.advance(task, 1)
if ws and (i % update_frequency == 0):
ws.send(
json.dumps(
{
"task": "processing",
"completed": total_processed,
"stats": dict(stats),
},
),
)
outfile.write("\n]\n")
finally:
if ws:
ws.close()
end_time: float = time.perf_counter()
duration: float = end_time - start_time
# Final Statistics Table
console.print("\n[bold green]Processing Complete[/bold green]")
table = Table(show_header=True, header_style="bold magenta")
table.add_column("Metric", style="dim")
table.add_column("Value", justify="right")
table.add_row("Total Time", f"{duration:.2f} seconds")
table.add_row("Total Processed", f"{total_processed:,} rows")
table.add_row(
"Processing Speed",
f"{total_processed / duration:.0f} rows/sec" if duration > 0 else "N/A",
)
for model, count in stats.items():
table.add_row(f"Transformed: {model}", f"{count:,}")
console.print(table)
if __name__ == "__main__":
app()


@@ -1,61 +0,0 @@
import django
from django.apps import apps
from django.db import connection
from django.db.migrations.recorder import MigrationRecorder
def _target_tables() -> list[str]:
tables = {
model._meta.db_table for model in apps.get_models(include_auto_created=True)
}
tables.add(MigrationRecorder.Migration._meta.db_table)
existing = set(connection.introspection.table_names())
return sorted(tables & existing)
def _drop_sqlite_tables() -> None:
tables = _target_tables()
with connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF;")
for table in tables:
cursor.execute(f'DROP TABLE IF EXISTS "{table}";')
cursor.execute("PRAGMA foreign_keys=ON;")
def _drop_postgres_tables() -> None:
tables = _target_tables()
if not tables:
return
with connection.cursor() as cursor:
for table in tables:
cursor.execute(f'DROP TABLE IF EXISTS "{table}" CASCADE;')
def _drop_mysql_tables() -> None:
tables = _target_tables()
with connection.cursor() as cursor:
cursor.execute("SET FOREIGN_KEY_CHECKS=0;")
for table in tables:
cursor.execute(f"DROP TABLE IF EXISTS `{table}`;")
cursor.execute("SET FOREIGN_KEY_CHECKS=1;")
def main() -> None:
django.setup()
vendor = connection.vendor
print(f"Wiping database for {vendor}...") # noqa: T201
if vendor == "sqlite":
_drop_sqlite_tables()
elif vendor == "postgresql":
_drop_postgres_tables()
elif vendor == "mysql":
_drop_mysql_tables()
else:
raise SystemExit(f"Unsupported database vendor: {vendor}")
print("Database wipe complete.") # noqa: T201
if __name__ == "__main__":
main()


@@ -0,0 +1,186 @@
"""Import service for loading transformed data into v3 database."""
from __future__ import annotations
import os
import subprocess
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TypedDict
if TYPE_CHECKING:
from collections.abc import AsyncGenerator
from collections.abc import Generator
class ProgressUpdate(TypedDict, total=False):
"""Progress update message structure."""
type: str
phase: str
message: str
level: str
success: bool
duration: float
return_code: int
@dataclass
class ImportService:
"""Service for importing transformed data into v3 database.
This service orchestrates the three-phase import process:
1. Wipe the existing database
2. Run Django migrations for v3 schema
3. Import the transformed data
"""
source_dir: Path
imported_marker: Path
manage_path: Path | None = None
def __post_init__(self) -> None:
if self.manage_path is None:
# Default to manage.py in the src directory
self.manage_path = (
Path(__file__).resolve().parent.parent.parent / "manage.py"
)
def _get_env(self) -> dict[str, str]:
"""Get environment variables for subprocess calls."""
env = os.environ.copy()
env["DJANGO_SETTINGS_MODULE"] = "paperless.settings"
env["PAPERLESS_MIGRATION_MODE"] = "0"
return env
def _run_command(
self,
args: list[str],
label: str,
) -> Generator[ProgressUpdate, None, int]:
"""Run a command and yield log lines. Returns the return code."""
yield {"type": "log", "message": f"Running: {label}", "level": "info"}
process = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=1,
text=True,
env=self._get_env(),
)
try:
if process.stdout:
for line in process.stdout:
yield {
"type": "log",
"message": line.rstrip(),
"level": "info",
}
process.wait()
return process.returncode
finally:
if process.poll() is None:
process.kill()
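# Note: _run_command is a generator whose *return value* is the exit code;
# callers capture it with ``code = yield from self._run_command(...)`` as
# run_sync does below. The same pattern in miniature:
#
#     def inner():
#         yield "a line of output"
#         return 7
#
#     def outer():
#         code = yield from inner()  # code == 7 once inner is exhausted
#         yield f"exit {code}"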
def run_sync(self) -> Generator[ProgressUpdate, None, None]:
"""Run the import synchronously, yielding progress updates.
This orchestrates:
1. Database wipe
2. Django migrations
3. Document import
"""
start_time = time.perf_counter()
# Phase 1: Wipe database
yield {"type": "phase", "phase": "wipe"}
wipe_cmd = [
sys.executable,
"-m",
"paperless_migration.services.wipe_db",
]
wipe_code = yield from self._run_command(wipe_cmd, "Database wipe")
if wipe_code != 0:
yield {
"type": "error",
"message": f"Database wipe failed with code {wipe_code}",
}
return
yield {"type": "log", "message": "Database wipe complete", "level": "info"}
# Phase 2: Run migrations
yield {"type": "phase", "phase": "migrate"}
migrate_cmd = [
sys.executable,
str(self.manage_path),
"migrate",
"--noinput",
]
migrate_code = yield from self._run_command(migrate_cmd, "Django migrations")
if migrate_code != 0:
yield {
"type": "error",
"message": f"Migrations failed with code {migrate_code}",
}
return
yield {"type": "log", "message": "Migrations complete", "level": "info"}
# Phase 3: Import data
yield {"type": "phase", "phase": "import"}
import_cmd = [
sys.executable,
str(self.manage_path),
"document_importer",
str(self.source_dir),
"--data-only",
]
import_code = yield from self._run_command(import_cmd, "Document import")
if import_code != 0:
yield {
"type": "error",
"message": f"Import failed with code {import_code}",
}
return
# Mark import as complete
try:
self.imported_marker.parent.mkdir(parents=True, exist_ok=True)
self.imported_marker.write_text("ok\n", encoding="utf-8")
except Exception as exc:
yield {
"type": "log",
"message": f"Warning: Could not write import marker: {exc}",
"level": "warning",
}
end_time = time.perf_counter()
duration = end_time - start_time
yield {
"type": "complete",
"success": True,
"duration": duration,
}
async def run_async(self) -> AsyncGenerator[ProgressUpdate, None]:
"""Run the import asynchronously, yielding progress updates.
This wraps the synchronous implementation to work with async consumers.
Note that subprocess I/O still blocks the event loop thread; control is
yielded only between updates.
"""
import asyncio
for update in self.run_sync():
yield update
# Yield control to the event loop
await asyncio.sleep(0)
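
A synchronous driver for the service might look like this (paths are illustrative; the wipe/migrate/import subprocesses configure their own Django environment via _get_env):

    from pathlib import Path
    from paperless_migration.services.importer import ImportService

    service = ImportService(
        source_dir=Path("/export"),
        imported_marker=Path("/export/import.completed"),
    )
    for update in service.run_sync():
        kind = update["type"]
        if kind == "phase":
            print("== phase:", update["phase"])
        elif kind == "log":
            print(update["message"])
        elif kind == "error":
            print("ERROR:", update["message"])
            break
        elif kind == "complete":
            print(f"done in {update['duration']:.1f}s")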


@@ -0,0 +1,173 @@
"""Transform service for converting v2 exports to v3 format."""
from __future__ import annotations
import json
import time
from collections import Counter
from collections.abc import AsyncGenerator
from collections.abc import Callable
from collections.abc import Generator
from dataclasses import dataclass
from dataclasses import field
from typing import TYPE_CHECKING
from typing import Any
from typing import TypedDict
import ijson
if TYPE_CHECKING:
from pathlib import Path
class FixtureObject(TypedDict):
"""Structure of a Django fixture object."""
model: str
pk: int
fields: dict[str, Any]
class ProgressUpdate(TypedDict, total=False):
"""Progress update message structure."""
type: str
completed: int
stats: dict[str, int]
message: str
level: str
duration: float
total_processed: int
speed: float
TransformFn = Callable[[FixtureObject], FixtureObject]
def transform_documents_document(obj: FixtureObject) -> FixtureObject:
"""Transform a documents.document fixture object for v3 schema."""
fields: dict[str, Any] = obj["fields"]
fields.pop("storage_type", None)
content: Any = fields.get("content")
fields["content_length"] = len(content) if isinstance(content, str) else 0
return obj
# Registry of model-specific transforms
TRANSFORMS: dict[str, TransformFn] = {
"documents.document": transform_documents_document,
}
@dataclass
class TransformService:
"""Service for transforming v2 exports to v3 format.
This service processes JSON fixtures incrementally using ijson for
memory-efficient streaming, and yields progress updates suitable
for WebSocket transmission.
"""
input_path: Path
output_path: Path
update_frequency: int = 100
_stats: Counter[str] = field(default_factory=Counter, init=False)
_total_processed: int = field(default=0, init=False)
def validate(self) -> str | None:
"""Validate preconditions for transform. Returns error message or None."""
if not self.input_path.exists():
return f"Input file not found: {self.input_path}"
if self.output_path.exists():
return f"Output file already exists: {self.output_path}"
if self.input_path.resolve() == self.output_path.resolve():
return "Input and output paths cannot be the same file"
return None
def _process_fixture(self, obj: FixtureObject) -> FixtureObject:
"""Apply any registered transforms to a fixture object."""
model: str = obj["model"]
transform: TransformFn | None = TRANSFORMS.get(model)
if transform:
obj = transform(obj)
self._stats[model] += 1
return obj
def run_sync(self) -> Generator[ProgressUpdate, None, None]:
"""Run the transform synchronously, yielding progress updates.
This is the core implementation that processes the JSON file
and yields progress updates at regular intervals.
"""
error = self.validate()
if error:
yield {"type": "error", "message": error}
return
self._stats.clear()
self._total_processed = 0
start_time = time.perf_counter()
yield {"type": "log", "message": "Opening input file...", "level": "info"}
try:
with (
self.input_path.open("rb") as infile,
self.output_path.open("w", encoding="utf-8") as outfile,
):
outfile.write("[\n")
first = True
for i, obj in enumerate(ijson.items(infile, "item")):
fixture: FixtureObject = obj
fixture = self._process_fixture(fixture)
self._total_processed += 1
if not first:
outfile.write(",\n")
first = False
json.dump(fixture, outfile, ensure_ascii=False)
# Yield progress at configured frequency
if i > 0 and i % self.update_frequency == 0:
yield {
"type": "progress",
"completed": self._total_processed,
"stats": dict(self._stats),
}
outfile.write("\n]\n")
except Exception as exc:
# Clean up partial output on error
if self.output_path.exists():
self.output_path.unlink()
yield {"type": "error", "message": str(exc)}
return
end_time = time.perf_counter()
duration = end_time - start_time
speed = self._total_processed / duration if duration > 0 else 0
yield {
"type": "complete",
"duration": duration,
"total_processed": self._total_processed,
"stats": dict(self._stats),
"speed": speed,
}
async def run_async(self) -> AsyncGenerator[ProgressUpdate, None]:
"""Run the transform asynchronously, yielding progress updates.
This wraps the synchronous implementation to work with async consumers.
The actual I/O is done synchronously since ijson doesn't support async,
but we yield control periodically to keep the event loop responsive.
"""
import asyncio
for update in self.run_sync():
yield update
# Yield control to the event loop periodically
await asyncio.sleep(0)
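
The single registered transform is small enough to demonstrate end to end; it drops the v2-only storage_type field and records the content length (field values here are illustrative):

    from paperless_migration.services.transform import transform_documents_document

    fixture = {
        "model": "documents.document",
        "pk": 1,
        "fields": {"content": "hello", "storage_type": "unencrypted"},
    }
    out = transform_documents_document(fixture)
    assert "storage_type" not in out["fields"]
    assert out["fields"]["content_length"] == 5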


@@ -0,0 +1,115 @@
"""Database wipe service for migration import process.
This module can be run as a script via:
python -m paperless_migration.services.wipe_db
It uses the paperless_migration settings to wipe all tables
before running v3 migrations.
"""
from __future__ import annotations
import logging
import sys
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from django.db.backends.base.base import BaseDatabaseWrapper
logger = logging.getLogger(__name__)
def _get_target_tables(connection: BaseDatabaseWrapper) -> list[str]:
"""Get list of tables to drop that exist in the database."""
from django.apps import apps
from django.db.migrations.recorder import MigrationRecorder
model_tables = {
model._meta.db_table for model in apps.get_models(include_auto_created=True)
}
model_tables.add(MigrationRecorder.Migration._meta.db_table)
existing_tables = set(connection.introspection.table_names())
return sorted(model_tables & existing_tables)
def _drop_sqlite_tables(connection: BaseDatabaseWrapper) -> int:
"""Drop tables for SQLite database. Returns count of tables dropped."""
tables = _get_target_tables(connection)
with connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF;")
for table in tables:
cursor.execute(f'DROP TABLE IF EXISTS "{table}";')
cursor.execute("PRAGMA foreign_keys=ON;")
return len(tables)
def _drop_postgres_tables(connection: BaseDatabaseWrapper) -> int:
"""Drop tables for PostgreSQL database. Returns count of tables dropped."""
tables = _get_target_tables(connection)
if not tables:
return 0
with connection.cursor() as cursor:
for table in tables:
cursor.execute(f'DROP TABLE IF EXISTS "{table}" CASCADE;')
return len(tables)
def _drop_mysql_tables(connection: BaseDatabaseWrapper) -> int:
"""Drop tables for MySQL/MariaDB database. Returns count of tables dropped."""
tables = _get_target_tables(connection)
with connection.cursor() as cursor:
cursor.execute("SET FOREIGN_KEY_CHECKS=0;")
for table in tables:
cursor.execute(f"DROP TABLE IF EXISTS `{table}`;")
cursor.execute("SET FOREIGN_KEY_CHECKS=1;")
return len(tables)
def wipe_database() -> tuple[bool, str]:
"""Wipe all application tables from the database.
Returns:
Tuple of (success: bool, message: str)
"""
from django.db import connection
vendor = connection.vendor
logger.info("Wiping database for vendor: %s", vendor)
try:
match vendor:
case "sqlite":
count = _drop_sqlite_tables(connection)
case "postgresql":
count = _drop_postgres_tables(connection)
case "mysql":
count = _drop_mysql_tables(connection)
case _:
return False, f"Unsupported database vendor: {vendor}"
message = f"Dropped {count} tables from {vendor} database"
logger.info(message)
return True, message
except Exception as exc:
message = f"Failed to wipe database: {exc}"
logger.exception(message)
return False, message
def main() -> int:
"""Entry point when run as a script."""
import os
import django
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
django.setup()
success, message = wipe_database()
print(message) # noqa: T201
return 0 if success else 1
if __name__ == "__main__":
sys.exit(main())
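
Besides the python -m entry point described in the module docstring, wipe_database can be driven programmatically once Django is set up (a sketch):

    import os
    import django

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
    django.setup()

    from paperless_migration.services.wipe_db import wipe_database

    ok, message = wipe_database()
    print(message)
    raise SystemExit(0 if ok else 1)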


@@ -4,7 +4,6 @@ from __future__ import annotations
import logging
import os
import secrets
from pathlib import Path
from typing import Any
@@ -41,6 +40,11 @@ DATA_DIR = __get_path("PAPERLESS_DATA_DIR", BASE_DIR.parent / "data")
EXPORT_DIR = __get_path("PAPERLESS_EXPORT_DIR", BASE_DIR.parent / "export")
def _parse_redis_url() -> str:
"""Parse Redis URL from environment with sensible defaults."""
return os.getenv("PAPERLESS_REDIS_URL", "redis://localhost:6379")
def _parse_db_settings() -> dict[str, dict[str, Any]]:
databases: dict[str, dict[str, Any]] = {
"default": {
@@ -97,9 +101,7 @@ def _parse_db_settings() -> dict[str, dict[str, Any]]:
DATABASES = _parse_db_settings()
SECRET_KEY = os.getenv(
"PAPERLESS_SECRET_KEY",
)
SECRET_KEY = os.getenv("PAPERLESS_SECRET_KEY")
AUTH_PASSWORD_VALIDATORS = [
{
@@ -128,6 +130,7 @@ INSTALLED_APPS = [
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
"channels",
"allauth",
"allauth.account",
"allauth.socialaccount",
@@ -166,6 +169,24 @@ TEMPLATES = [
},
]
# ASGI application for Channels
ASGI_APPLICATION = "paperless_migration.asgi.application"
# Channel layers configuration using Redis
REDIS_URL = _parse_redis_url()
CHANNEL_LAYERS = {
"default": {
"BACKEND": "channels_redis.core.RedisChannelLayer",
"CONFIG": {
"hosts": [REDIS_URL],
"capacity": 1500,
"expiry": 10,
},
},
}
# Keep WSGI for compatibility
WSGI_APPLICATION = "paperless_migration.wsgi.application"
AUTHENTICATION_BACKENDS = [
@@ -203,9 +224,16 @@ MIGRATION_TRANSFORMED_PATH = __get_path(
)
MIGRATION_IMPORTED_PATH = Path(EXPORT_DIR / "import.completed").resolve()
# Progress update frequency (rows between WebSocket updates)
MIGRATION_PROGRESS_FREQUENCY = int(
os.getenv("PAPERLESS_MIGRATION_PROGRESS_FREQUENCY", "100"),
)
# One-time access code required for migration logins; stable across autoreload
_code = os.getenv("PAPERLESS_MIGRATION_ACCESS_CODE")
if not _code:
import secrets
_code = secrets.token_urlsafe(12)
os.environ["PAPERLESS_MIGRATION_ACCESS_CODE"] = _code
MIGRATION_ACCESS_CODE = _code
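
For reference, the environment variables this settings hunk consults for migration mode, with the defaults as written above:

    PAPERLESS_REDIS_URL: Redis host for the channel layer, default redis://localhost:6379
    PAPERLESS_MIGRATION_PROGRESS_FREQUENCY: rows between progress updates, default 100
    PAPERLESS_MIGRATION_ACCESS_CODE: generated with secrets.token_urlsafe(12) when unset, then written back to os.environ so it survives autoreload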


@@ -16,12 +16,12 @@
--bs-border-color: #dee2e6;
--bs-link-color: var(--pngx-primary);
--bs-link-color-rgb: 23, 84, 31;
}
}
@media (prefers-color-scheme: dark) { :root { color-scheme: light; } }
.btn-primary:disabled {
--bs-btn-disabled-bg: #4d7352;
--bs-btn-disabled-border-color: #4d7352;
.btn-primary:disabled {
--bs-btn-disabled-bg: #4d7352;
--bs-btn-disabled-border-color: #4d7352;
}
body {
@@ -32,9 +32,9 @@
min-height: 100vh;
}
svg.logo .text {
fill: #161616 !important;
}
svg.logo .text {
fill: #161616 !important;
}
.hero-card,
.card-step {
@@ -106,6 +106,87 @@
color: #fff;
border-color: #17541f;
}
.console-log {
background: #0f1a12;
color: #d1e7d6;
border-radius: 12px;
min-height: 180px;
max-height: 400px;
padding: 12px;
font-size: 0.85rem;
font-family: 'Consolas', 'Monaco', monospace;
overflow: auto;
white-space: pre-wrap;
word-break: break-word;
}
.console-log .log-error { color: #ff6b6b; }
.console-log .log-warning { color: #ffd93d; }
.console-log .log-success { color: #6bcb77; }
.console-log .log-info { color: #4d96ff; }
.progress-bar-container {
height: 24px;
background: rgba(23, 84, 31, 0.1);
border-radius: 12px;
overflow: hidden;
margin-bottom: 0.5rem;
}
.progress-bar-fill {
height: 100%;
background: linear-gradient(90deg, #17541f, #2c7a3c);
border-radius: 12px;
transition: width 0.3s ease;
display: flex;
align-items: center;
justify-content: center;
color: white;
font-size: 0.75rem;
font-weight: 600;
min-width: fit-content;
padding: 0 8px;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 0.5rem;
margin-top: 0.5rem;
}
.stat-item {
background: rgba(23, 84, 31, 0.05);
border-radius: 8px;
padding: 0.5rem;
text-align: center;
}
.stat-value {
font-size: 1.25rem;
font-weight: 700;
color: #17541f;
}
.stat-label {
font-size: 0.75rem;
color: #666;
}
.ws-status {
display: inline-flex;
align-items: center;
gap: 0.5rem;
padding: 0.25rem 0.75rem;
border-radius: 999px;
font-size: 0.8rem;
font-weight: 500;
}
.ws-status.connected { background: #d4edda; color: #155724; }
.ws-status.disconnected { background: #f8d7da; color: #721c24; }
.ws-status.connecting { background: #fff3cd; color: #856404; }
.ws-status .status-dot { width: 8px; height: 8px; border-radius: 50%; background: currentColor; display: inline-block; }
</style>
</head>
<body class="pb-4">
@@ -118,7 +199,7 @@
{% include "paperless-ngx/snippets/svg_logo.html" with extra_attrs="width='280' class='logo'" %}
<div class="ps-2">
<p class="text-uppercase fw-semibold mb-1 text-secondary" style="letter-spacing: 0.12rem;">Migration Mode</p>
<h1 class="h3 mb-2 text-primary">Paperless-ngx v2 v3</h1>
<h1 class="h3 mb-2 text-primary">Paperless-ngx v2 to v3</h1>
<p class="text-muted mb-0">Migrate your data from Paperless-ngx version 2 to version 3.</p>
</div>
</div>
@@ -184,8 +265,8 @@
</div>
</div>
</div>
</div>
</div>
</div>
<div class="row gy-4 justify-content-center">
<div class="col-lg-3 col-md-4">
@@ -219,7 +300,7 @@
<h3 class="h5 mb-1">Transform</h3>
<p class="small text-muted mb-0">Convert the export into the v3-ready structure.</p>
</div>
<div class="mt-auto">
<div class="mt-auto d-grid gap-2">
<form method="post">
{% csrf_token %}
<button
@@ -227,11 +308,20 @@
type="submit"
name="action"
value="transform"
id="btn-transform"
{% if not export_exists or transformed_exists %}disabled aria-disabled="true"{% endif %}
>
Transform export
</button>
</form>
{% if transformed_exists %}
<form method="post">
{% csrf_token %}
<button class="btn btn-outline-danger btn-sm w-100" type="submit" name="action" value="reset_transform">
Reset transform
</button>
</form>
{% endif %}
</div>
</div>
</div>
@@ -253,6 +343,7 @@
type="submit"
name="action"
value="import"
id="btn-import"
{% if not transformed_exists or imported_exists %}disabled aria-disabled="true"{% endif %}
>
Import transformed data
@@ -272,8 +363,8 @@
<div class="small">
Run the v2 export from your Paperless instance, e.g.:
<code>docker run --rm ghcr.io/paperless-ngx/paperless-ngx:2.20.6 document_exporter --data-only</code>
(see <a href="https://docs.paperless-ngx.com/administration/#exporter" target="_blank" rel="noopener noreferrer">documentation</a>). Once the <code>manifest.json</code> is in-place, upload it or (especially for larger files) place it directly at the expected location and click Re-check export.
<p class="mt-2 mb-0 text-danger fst-italic">⚠️ The export must be generated with version Paperless-ngx v2.20.6</p>
(see <a href="https://docs.paperless-ngx.com/administration/#exporter" target="_blank" rel="noopener noreferrer">documentation</a>). Once the <code>manifest.json</code> is in-place, upload it or (especially for larger files) place it directly at the expected location and click "Re-check export".
<p class="mt-2 mb-0 text-danger fst-italic">Warning: The export must be generated with version Paperless-ngx v2.20.6</p>
</div>
</div>
{% endif %}
@@ -281,38 +372,187 @@
<div class="card-body">
<div class="d-flex justify-content-between align-items-center mb-2">
<div class="fw-semibold">Migration console</div>
<span class="badge bg-secondary-subtle text-secondary border border-secondary-subtle">Live output</span>
<span id="ws-status" class="ws-status disconnected">
<span class="status-dot"></span>
<span class="status-text">Ready</span>
</span>
</div>
<pre id="migration-log" class="mb-0" style="background:#0f1a12;color:#d1e7d6;border-radius:12px;min-height:180px;padding:12px;font-size:0.9rem;overflow:auto;">Ready</pre>
<div id="progress-container" class="mb-3" style="display: none;">
<div class="progress-bar-container">
<div id="progress-bar" class="progress-bar-fill" style="width: 0%;">
<span id="progress-text">0 rows</span>
</div>
</div>
<div id="stats-container" class="stats-grid"></div>
</div>
<div id="migration-log" class="console-log">Ready to begin migration...</div>
</div>
</div>
</div>
</div>
</div>
{% if stream_action %}
<script>
(() => {
const logEl = document.getElementById('migration-log');
if (!logEl) return;
const streamUrl = "{% if stream_action == 'import' %}{% url 'import_stream' %}{% else %}{% url 'transform_stream' %}{% endif %}";
const donePrefix = "{{ stream_action|capfirst }} finished";
const evt = new EventSource(streamUrl);
const append = (line) => {
logEl.textContent += `\n${line}`;
logEl.scrollTop = logEl.scrollHeight;
<script>
(function() {
const logEl = document.getElementById('migration-log');
const wsStatusEl = document.getElementById('ws-status');
const progressContainer = document.getElementById('progress-container');
const progressBar = document.getElementById('progress-bar');
const progressText = document.getElementById('progress-text');
const statsContainer = document.getElementById('stats-container');
function setWsStatus(status, text) {
wsStatusEl.className = 'ws-status ' + status;
wsStatusEl.querySelector('.status-text').textContent = text;
}
function appendLog(message, level) {
const line = document.createElement('div');
line.className = 'log-' + (level || 'info');
line.textContent = message;
logEl.appendChild(line);
logEl.scrollTop = logEl.scrollHeight;
}
function clearLog() {
logEl.innerHTML = '';
}
function updateProgress(current, total, label) {
progressContainer.style.display = 'block';
const pct = total ? Math.min(100, (current / total) * 100) : 0;
progressBar.style.width = (total ? pct : 100) + '%';
progressText.textContent = label || (current.toLocaleString() + ' rows');
}
function updateStats(stats) {
if (!stats || Object.keys(stats).length === 0) {
statsContainer.innerHTML = '';
return;
}
let html = '';
for (const [key, value] of Object.entries(stats)) {
const label = key.replace('documents.', '').replace(/_/g, ' ');
html += '<div class="stat-item">' +
'<div class="stat-value">' + (typeof value === 'number' ? value.toLocaleString() : value) + '</div>' +
'<div class="stat-label">' + label + '</div>' +
'</div>';
}
statsContainer.innerHTML = html;
}
function formatDuration(seconds) {
if (seconds < 60) return seconds.toFixed(1) + 's';
const mins = Math.floor(seconds / 60);
const secs = (seconds % 60).toFixed(0);
return mins + 'm ' + secs + 's';
}
function startWebSocket(action) {
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = protocol + '//' + window.location.host + '/ws/migration/' + action + '/';
clearLog();
appendLog('Connecting to ' + action + ' service...', 'info');
setWsStatus('connecting', 'Connecting...');
progressContainer.style.display = 'none';
statsContainer.innerHTML = '';
const ws = new WebSocket(wsUrl);
ws.onopen = function() {
setWsStatus('connected', 'Connected');
appendLog('Connected. Starting ' + action + '...', 'success');
ws.send(JSON.stringify({ action: 'start' }));
};
evt.onmessage = (e) => {
append(e.data);
if (e.data.startsWith(donePrefix)) {
setTimeout(() => window.location.reload(), 500);
ws.onmessage = function(event) {
try {
const data = JSON.parse(event.data);
switch (data.type) {
case 'log':
appendLog(data.message, data.level || 'info');
break;
case 'progress':
updateProgress(data.current, data.total, data.label);
break;
case 'stats':
if (data.transformed) {
updateStats(data.transformed);
} else {
updateStats(data);
}
break;
case 'complete': {
const status = data.success ? 'success' : 'error';
const msg = data.success
? 'Completed successfully in ' + formatDuration(data.duration)
: 'Operation failed';
appendLog(msg, status);
if (data.total_processed) {
appendLog('Total processed: ' + data.total_processed.toLocaleString() + ' rows', 'info');
}
if (data.speed) {
appendLog('Speed: ' + Math.round(data.speed).toLocaleString() + ' rows/sec', 'info');
}
if (data.stats) {
updateStats(data.stats);
}
setWsStatus('disconnected', 'Complete');
ws.close();
if (data.success) {
setTimeout(function() { window.location.reload(); }, 1500);
}
break;
}
case 'error':
appendLog('Error: ' + data.message, 'error');
setWsStatus('disconnected', 'Error');
break;
default:
appendLog(JSON.stringify(data), 'info');
}
} catch (e) {
appendLog('Received: ' + event.data, 'info');
}
};
evt.onerror = () => {
append('[connection closed]');
evt.close();
ws.onerror = function(error) {
appendLog('WebSocket error occurred', 'error');
setWsStatus('disconnected', 'Error');
};
})();
</script>
{% endif %}
ws.onclose = function(event) {
if (event.code !== 1000) {
const reason = event.code === 4001 ? 'Not authenticated'
: event.code === 4002 ? 'Migration code not verified'
: event.code === 4003 ? 'Superuser access required'
: 'Connection closed (code: ' + event.code + ')';
appendLog(reason, 'error');
}
setWsStatus('disconnected', 'Disconnected');
};
}
// Check if we should auto-start a WebSocket action
{% if ws_action %}
startWebSocket('{{ ws_action }}');
{% endif %}
// Expose for manual triggering if needed
window.startMigrationWs = startWebSocket;
})();
</script>
</body>
</html>


@@ -1,5 +1,8 @@
"""URL configuration for migration mode."""
from __future__ import annotations
from django.conf import settings
from django.conf.urls.static import static
from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.urls import include
from django.urls import path
@@ -10,12 +13,9 @@ urlpatterns = [
path("accounts/login/", views.migration_login, name="account_login"),
path("accounts/", include("allauth.urls")),
path("migration/", views.migration_home, name="migration_home"),
path("migration/transform/stream", views.transform_stream, name="transform_stream"),
path("migration/import/stream", views.import_stream, name="import_stream"),
# redirect root to migration home
path("", views.migration_home, name="migration_home"),
# Redirect root to migration home
path("", views.migration_home, name="home"),
]
if settings.DEBUG:
urlpatterns += staticfiles_urlpatterns()
urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)


@@ -1,30 +1,41 @@
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
"""Views for migration mode web interface."""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from django.conf import settings
from django.contrib import messages
from django.contrib.auth import authenticate
from django.contrib.auth import login
from django.contrib.auth.decorators import login_required
from django.http import HttpResponseForbidden
from django.http import StreamingHttpResponse
from django.shortcuts import redirect
from django.shortcuts import render
from django.views.decorators.http import require_http_methods
from paperless_migration import settings
if TYPE_CHECKING:
from django.http import HttpRequest
from django.http import HttpResponse
@login_required
@require_http_methods(["GET", "POST"])
def migration_home(request):
def _check_migration_access(request: HttpRequest) -> HttpResponse | None:
"""Check if user has migration access. Returns error response or None."""
if not request.session.get("migration_code_ok"):
return HttpResponseForbidden("Access code required")
if not request.user.is_superuser:
return HttpResponseForbidden("Superuser access required")
return None
@login_required
@require_http_methods(["GET", "POST"])
def migration_home(request: HttpRequest) -> HttpResponse:
"""Main migration dashboard view."""
error_response = _check_migration_access(request)
if error_response:
return error_response
export_path = Path(settings.MIGRATION_EXPORT_PATH)
transformed_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
@@ -32,13 +43,10 @@ def migration_home(request):
if request.method == "POST":
action = request.POST.get("action")
if action == "check":
messages.success(request, "Checked export paths.")
elif action == "transform":
messages.info(request, "Starting transform… live output below.")
request.session["start_stream_action"] = "transform"
if imported_marker.exists():
imported_marker.unlink()
elif action == "upload":
upload = request.FILES.get("export_file")
if not upload:
@@ -52,27 +60,53 @@ def migration_home(request):
messages.success(request, f"Uploaded to {export_path}.")
except Exception as exc:
messages.error(request, f"Failed to save file: {exc}")
elif action == "transform":
if imported_marker.exists():
imported_marker.unlink()
# Signal to start WebSocket connection for transform
request.session["start_ws_action"] = "transform"
messages.info(request, "Starting transform via WebSocket...")
elif action == "import":
messages.info(request, "Starting import… live output below.")
request.session["start_stream_action"] = "import"
# Signal to start WebSocket connection for import
request.session["start_ws_action"] = "import"
messages.info(request, "Starting import via WebSocket...")
elif action == "reset_transform":
if transformed_path.exists():
try:
transformed_path.unlink()
messages.success(request, "Transformed file deleted.")
except Exception as exc:
messages.error(request, f"Failed to delete transformed file: {exc}")
if imported_marker.exists():
try:
imported_marker.unlink()
except Exception:
pass
else:
messages.error(request, "Unknown action.")
return redirect("migration_home")
stream_action = request.session.pop("start_stream_action", None)
ws_action = request.session.pop("start_ws_action", None)
context = {
"export_path": export_path,
"export_exists": export_path.exists(),
"transformed_path": transformed_path,
"transformed_exists": transformed_path.exists(),
"imported_exists": imported_marker.exists(),
"stream_action": stream_action,
"ws_action": ws_action,
}
return render(request, "paperless_migration/migration_home.html", context)
@require_http_methods(["GET", "POST"])
def migration_login(request):
def migration_login(request: HttpRequest) -> HttpResponse:
"""Migration-specific login view requiring access code."""
if request.method == "POST":
username = request.POST.get("login", "")
password = request.POST.get("password", "")
@@ -96,174 +130,3 @@ def migration_login(request):
return redirect(settings.LOGIN_REDIRECT_URL)
return render(request, "account/login.html")
@login_required
@require_http_methods(["GET"])
def transform_stream(request):
if not request.session.get("migration_code_ok"):
return HttpResponseForbidden("Access code required")
if not request.user.is_superuser:
return HttpResponseForbidden("Superuser access required")
input_path = Path(settings.MIGRATION_EXPORT_PATH)
output_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
cmd = [
sys.executable,
"-m",
"paperless_migration.scripts.transform",
"--input",
str(input_path),
"--output",
str(output_path),
]
def event_stream():
process = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=1,
text=True,
)
try:
yield "data: Starting transform...\n\n"
if process.stdout:
for line in process.stdout:
yield f"data: {line.rstrip()}\n\n"
process.wait()
yield f"data: Transform finished with code {process.returncode}\n\n"
finally:
if process and process.poll() is None:
process.kill()
return StreamingHttpResponse(
event_stream(),
content_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
},
)
@login_required
@require_http_methods(["GET"])
def import_stream(request):
if not request.session.get("migration_code_ok"):
return HttpResponseForbidden("Access code required")
if not request.user.is_superuser:
return HttpResponseForbidden("Superuser access required")
export_path = Path(settings.MIGRATION_EXPORT_PATH)
transformed_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
imported_marker = Path(settings.MIGRATION_IMPORTED_PATH)
manage_path = Path(settings.BASE_DIR) / "manage.py"
source_dir = export_path.parent
env = os.environ.copy()
env["DJANGO_SETTINGS_MODULE"] = "paperless.settings"
env["PAPERLESS_MIGRATION_MODE"] = "0"
def event_stream():
if not export_path.exists():
yield "data: Missing export manifest.json; upload or re-check export.\n\n"
return
if not transformed_path.exists():
yield "data: Missing transformed manifest.v3.json; run transform first.\n\n"
return
backup_path: Path | None = None
try:
backup_fd, backup_name = tempfile.mkstemp(
prefix="manifest.v2.",
suffix=".json",
dir=source_dir,
)
os.close(backup_fd)
backup_path = Path(backup_name)
shutil.copy2(export_path, backup_path)
shutil.copy2(transformed_path, export_path)
except Exception as exc:
yield f"data: Failed to prepare import manifest: {exc}\n\n"
return
def run_cmd(args, label):
yield f"data: {label}\n\n"
process = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=1,
text=True,
env=env,
)
try:
if process.stdout:
for line in process.stdout:
yield f"data: {line.rstrip()}\n\n"
process.wait()
return process.returncode
finally:
if process and process.poll() is None:
process.kill()
wipe_cmd = [
sys.executable,
"-m",
"paperless_migration.scripts.wipe_db",
]
migrate_cmd = [
sys.executable,
str(manage_path),
"migrate",
"--noinput",
]
import_cmd = [
sys.executable,
str(manage_path),
"document_importer",
str(source_dir),
"--data-only",
]
try:
wipe_code = yield from run_cmd(
wipe_cmd,
"Wiping database...",
)
if wipe_code != 0:
yield f"data: Wipe finished with code {wipe_code}\n\n"
return
migrate_code = yield from run_cmd(
migrate_cmd,
"Running migrations...",
)
if migrate_code != 0:
yield f"data: Migrate finished with code {migrate_code}\n\n"
return
import_code = yield from run_cmd(
import_cmd,
"Starting import...",
)
if import_code == 0:
imported_marker.parent.mkdir(parents=True, exist_ok=True)
imported_marker.write_text("ok\n", encoding="utf-8")
yield f"data: Import finished with code {import_code}\n\n"
finally:
if backup_path and backup_path.exists():
try:
shutil.move(backup_path, export_path)
except Exception:
pass
return StreamingHttpResponse(
event_stream(),
content_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
},
)
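
The POST-then-redirect flow above is easy to exercise with Django's test client; a sketch, assuming an existing superuser object and the migration_code_ok session flag:

    from django.test import Client

    client = Client()
    client.force_login(superuser)  # `superuser` assumed to exist
    session = client.session
    session["migration_code_ok"] = True
    session.save()
    response = client.post("/migration/", {"action": "transform"}, follow=True)
    assert response.status_code == 200  # back on the dashboard with ws_action set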