Compare commits


38 Commits

Author SHA1 Message Date
Trenton H
38df71b71a Missing ijson 2026-01-29 10:39:11 -08:00
Trenton H
7bae6b7f6d Fixes the merge conflicts I missed 2026-01-29 10:35:10 -08:00
Trenton H
1c99e55069 Initial version hacked up by Opus 2026-01-29 10:06:02 -08:00
Trenton H
b44eea6508 The websocket included script 2026-01-29 09:39:52 -08:00
Trenton H
b8af971652 Merge remote-tracking branch 'origin/dev' into feature-migrator-application 2026-01-29 09:28:24 -08:00
shamoon
e1655045ca Ready
[ci skip]
2026-01-23 22:04:58 -08:00
shamoon
1a638d8cc0 drop, migrate, then import 2026-01-23 22:04:44 -08:00
shamoon
b21ff75a30 Run importer 2026-01-23 21:50:01 -08:00
shamoon
58f1a186d4 2.20.6 2026-01-23 21:38:02 -08:00
shamoon
2a1c06c047 Merge branch 'feature/migrator' of https://github.com/paperless-ngx/paperless-ngx into feature/migrator 2026-01-23 21:33:27 -08:00
shamoon
770dc02833 Add root URL redirect to migration home 2026-01-23 21:31:51 -08:00
shamoon
af9d75dfcf Fix static files again 2026-01-23 21:31:34 -08:00
shamoon
7b23cdc0c1 Opacify complete steps 2026-01-23 15:44:07 -08:00
shamoon
09892809f9 Tweak instructions 2026-01-23 15:37:53 -08:00
shamoon
94c6108006 Nice, upload button 2026-01-23 15:29:19 -08:00
shamoon
33c5d5bab0 Update migration_home.html
[ci skip]
2026-01-23 08:54:00 -08:00
shamoon
9beb508f1d Auto-step after transform
[ci skip]
2026-01-23 08:40:37 -08:00
shamoon
a290fcfe6f Sick, run transform as subprocess 2026-01-23 08:39:57 -08:00
shamoon
0846fe9845 Script deps 2026-01-23 08:33:12 -08:00
shamoon
910d16374b Stumpylog's current version of the transform script
[ci skip]

Co-Authored-By: Trenton H <797416+stumpylog@users.noreply.github.com>
2026-01-23 08:22:34 -08:00
shamoon
35d77b144d Small startup detection thing 2026-01-23 08:11:34 -08:00
shamoon
5987e35101 Dummy console thing
[ci skip]
2026-01-22 23:28:26 -08:00
shamoon
96259ce441 Export instructions 2026-01-22 23:27:30 -08:00
shamoon
283afb265d Update settings.py
[ci skip]
2026-01-22 23:12:47 -08:00
shamoon
67564dd573 more light mode shit 2026-01-22 23:12:47 -08:00
shamoon
046d65c2ba Just light mode
[ci skip]
2026-01-22 22:59:27 -08:00
shamoon
8761816635 Update urls.py 2026-01-22 22:55:30 -08:00
shamoon
a1cdc45f1a one-time code 2026-01-22 22:39:11 -08:00
shamoon
190e42e722 Oh nice, reuse existing 2026-01-22 22:14:09 -08:00
shamoon
75c6ffe01f fix export dir
[ci skip]
2026-01-22 22:09:59 -08:00
shamoon
2964b4b256 Update migration_home.html
[ci skip]
2026-01-22 22:07:12 -08:00
shamoon
f52f9dd325 Basic login styling 2026-01-22 21:59:13 -08:00
shamoon
5827a0ec25 Disable unusable buttons 2026-01-22 21:59:12 -08:00
shamoon
990ef05d99 Some prettiness 2026-01-22 21:59:12 -08:00
shamoon
9f48b8e6e1 Some styling 2026-01-22 21:40:08 -08:00
shamoon
42689070b3 Still support conf 2026-01-22 21:40:08 -08:00
shamoon
09f3cfdb93 Start in migrator 2026-01-22 21:40:07 -08:00
shamoon
84f408fa43 save this, it does work 2026-01-22 21:40:07 -08:00
38 changed files with 2736 additions and 1236 deletions

View File

@@ -8,6 +8,11 @@ echo "${log_prefix} Apply database migrations..."
 cd "${PAPERLESS_SRC_DIR}"

+if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
+    echo "${log_prefix} Migration mode enabled, skipping migrations."
+    exit 0
+fi
+
 # The whole migrate, with flock, needs to run as the right user
 if [[ -n "${USER_IS_NON_ROOT}" ]]; then
     exec s6-setlock -n "${data_dir}/migration_lock" python3 manage.py migrate --skip-checks --no-input

View File

@@ -9,7 +9,15 @@ echo "${log_prefix} Running Django checks"
 cd "${PAPERLESS_SRC_DIR}"

 if [[ -n "${USER_IS_NON_ROOT}" ]]; then
-    python3 manage.py check
+    if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
+        python3 manage_migration.py check
+    else
+        python3 manage.py check
+    fi
 else
-    s6-setuidgid paperless python3 manage.py check
+    if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
+        s6-setuidgid paperless python3 manage_migration.py check
+    else
+        s6-setuidgid paperless python3 manage.py check
+    fi
 fi

View File

@@ -13,8 +13,14 @@ if [[ -n "${PAPERLESS_FORCE_SCRIPT_NAME}" ]]; then
     export GRANIAN_URL_PATH_PREFIX=${PAPERLESS_FORCE_SCRIPT_NAME}
 fi

-if [[ -n "${USER_IS_NON_ROOT}" ]]; then
-    exec granian --interface asginl --ws --loop uvloop "paperless.asgi:application"
+if [[ "${PAPERLESS_MIGRATION_MODE:-0}" == "1" ]]; then
+    app_module="paperless.migration_asgi:application"
 else
-    exec s6-setuidgid paperless granian --interface asginl --ws --loop uvloop "paperless.asgi:application"
+    app_module="paperless.asgi:application"
+fi
+
+if [[ -n "${USER_IS_NON_ROOT}" ]]; then
+    exec granian --interface asginl --ws --loop uvloop "${app_module}"
+else
+    exec s6-setuidgid paperless granian --interface asginl --ws --loop uvloop "${app_module}"
 fi

View File

@@ -481,147 +481,3 @@ To get started:
5. The project is ready for debugging: start either the fullstack debug or the individual debug
processes. To spin up the project without debugging, run the task **Project Start: Run all Services**
## Developing Date Parser Plugins
Paperless-ngx uses a plugin system for date parsing, allowing you to extend or replace the default date parsing behavior. Plugins are discovered using [Python entry points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
### Creating a Date Parser Plugin
To create a custom date parser plugin, you need to:
1. Create a class that inherits from `DateParserPluginBase`
2. Implement the required abstract method
3. Register your plugin via an entry point
#### 1. Implementing the Parser Class
Your parser must extend `documents.plugins.date_parsing.DateParserPluginBase` and implement the `parse` method:
```python
from collections.abc import Iterator
import datetime
from documents.plugins.date_parsing import DateParserPluginBase
class MyDateParserPlugin(DateParserPluginBase):
"""
Custom date parser implementation.
"""
def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
"""
Parse dates from the document's filename and content.
Args:
filename: The original filename of the document
content: The extracted text content of the document
Yields:
datetime.datetime: Valid datetime objects found in the document
"""
# Your parsing logic here
# Use self.config to access configuration settings
# Example: parse dates from filename first
if self.config.filename_date_order:
# Your filename parsing logic
yield some_datetime
# Then parse dates from content
# Your content parsing logic
yield another_datetime
```
#### 2. Configuration and Helper Methods
Your parser instance is initialized with a `DateParserConfig` object accessible via `self.config`. This provides:
- `languages: list[str]` - List of language codes for date parsing
- `timezone_str: str` - Timezone string for date localization
- `ignore_dates: set[datetime.date]` - Dates that should be filtered out
- `reference_time: datetime.datetime` - Current time for filtering future dates
- `filename_date_order: str | None` - Date order preference for filenames (e.g., "DMY", "MDY")
- `content_date_order: str` - Date order preference for content
The base class provides two helper methods you can use:
```python
def _parse_string(
self,
date_string: str,
date_order: str,
) -> datetime.datetime | None:
"""
Parse a single date string using dateparser with configured settings.
"""
def _filter_date(
self,
date: datetime.datetime | None,
) -> datetime.datetime | None:
"""
Validate a parsed datetime against configured rules.
Filters out dates before 1900, future dates, and ignored dates.
"""
```
#### 3. Resource Management (Optional)
If your plugin needs to acquire or release resources (database connections, API clients, etc.), override the context manager methods. Paperless-ngx will always use plugins as context managers, ensuring resources can be released even in the event of errors.
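As a rough, illustrative sketch (the in-memory SQLite handle below is just a stand-in for whatever resource your plugin actually manages):
```python
import datetime
import sqlite3
from collections.abc import Iterator
from types import TracebackType

from documents.plugins.date_parsing import DateParserPluginBase


class ResourceHoldingParserPlugin(DateParserPluginBase):
    """
    Illustrative plugin that acquires a resource on __enter__ and
    releases it on __exit__, even if parsing raises.
    """

    def __enter__(self):
        # Acquire the resource when Paperless-ngx enters the context.
        self.conn = sqlite3.connect(":memory:")
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        # Release the resource on the way out, error or not.
        self.conn.close()

    def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
        # Delegate string parsing and validation to the base class helpers.
        date = self._filter_date(
            self._parse_string(content, self.config.content_date_order),
        )
        if date is not None:
            yield date
```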
#### 4. Registering Your Plugin
Register your plugin using a setuptools entry point in your package's `pyproject.toml`:
```toml
[project.entry-points."paperless_ngx.date_parsers"]
my_parser = "my_package.parsers:MyDateParserPlugin"
```
The entry point name (e.g., `"my_parser"`) is used for sorting when multiple plugins are found. Paperless-ngx will use the first plugin alphabetically by name if multiple plugins are discovered.
### Plugin Discovery
Paperless-ngx automatically discovers and loads date parser plugins at runtime. The discovery process:
1. Queries the `paperless_ngx.date_parsers` entry point group
2. Validates that each plugin is a subclass of `DateParserPluginBase`
3. Sorts valid plugins alphabetically by entry point name
4. Uses the first valid plugin, or falls back to the default `RegexDateParserPlugin` if none are found
If multiple plugins are installed, a warning is logged indicating which plugin was selected.
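Since plugins are always driven through their context-manager protocol, calling code looks roughly like this (mirroring how the consumer retrieves a single date):
```python
from documents.plugins.date_parsing import get_date_parser

# get_date_parser() discovers, configures, and instantiates the active
# plugin; the context manager handles resource acquisition and cleanup.
with get_date_parser() as date_parser:
    date = next(date_parser.parse("scan.pdf", "Invoice dated 15.01.2024"), None)
```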
### Example: Simple Date Parser
Here's a minimal example that only looks for ISO 8601 dates:
```python
import datetime
import re
from collections.abc import Iterator
from documents.plugins.date_parsing.base import DateParserPluginBase
class ISODateParserPlugin(DateParserPluginBase):
"""
Parser that only matches ISO 8601 formatted dates (YYYY-MM-DD).
"""
ISO_REGEX = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
# Combine filename and content for searching
text = f"{filename} {content}"
for match in self.ISO_REGEX.finditer(text):
date_string = match.group(1)
# Use helper method to parse with configured timezone
date = self._parse_string(date_string, "YMD")
# Use helper method to validate the date
filtered_date = self._filter_date(date)
if filtered_date is not None:
yield filtered_date
```

View File

@@ -49,6 +49,8 @@ dependencies = [
     "flower~=2.0.1",
     "gotenberg-client~=0.13.1",
     "httpx-oauth~=0.16",
+    "ijson",
+    "ijson~=3.3",
     "imap-tools~=1.11.0",
     "jinja2~=3.1.5",
     "langdetect~=1.0.9",
@@ -72,6 +74,7 @@ dependencies = [
     "rapidfuzz~=3.14.0",
     "redis[hiredis]~=5.2.1",
     "regex>=2025.9.18",
+    "rich~=14.1.0",
     "scikit-learn~=1.7.0",
     "sentence-transformers>=4.1",
     "setproctitle~=1.3.4",
@@ -306,7 +309,6 @@ markers = [
     "gotenberg: Tests requiring Gotenberg service",
     "tika: Tests requiring Tika service",
     "greenmail: Tests requiring Greenmail service",
-    "date_parsing: Tests which cover date parsing from content or filename",
 ]

 [tool.pytest_env]
@@ -333,10 +335,6 @@ exclude_also = [

 [tool.mypy]
 mypy_path = "src"
-files = [
-    "src/documents/plugins/date_parsing",
-    "src/documents/tests/date_parsing",
-]
 plugins = [
     "mypy_django_plugin.main",
     "mypy_drf_plugin.main",
@@ -348,28 +346,5 @@ disallow_untyped_defs = true
 warn_redundant_casts = true
 warn_unused_ignores = true

-# This prevents errors from imports, but allows type-checking logic to work
-follow_imports = "silent"
-
-[[tool.mypy.overrides]]
-module = [
-    "documents.*",
-    "paperless.*",
-    "paperless_ai.*",
-    "paperless_mail.*",
-    "paperless_tesseract.*",
-    "paperless_remote.*",
-    "paperless_text.*",
-    "paperless_tika.*",
-]
-ignore_errors = true
-
-[[tool.mypy.overrides]]
-module = [
-    "documents.plugins.date_parsing.*",
-    "documents.tests.date_parsing.*",
-]
-ignore_errors = false
-
 [tool.django-stubs]
 django_settings_module = "paperless.settings"

View File

@@ -32,12 +32,12 @@ from documents.models import WorkflowTrigger
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import get_parser_class_for_mime_type
+from documents.parsers import parse_date
 from documents.permissions import set_permissions_for_object
 from documents.plugins.base import AlwaysRunPluginMixin
 from documents.plugins.base import ConsumeTaskPlugin
 from documents.plugins.base import NoCleanupPluginMixin
 from documents.plugins.base import NoSetupPluginMixin
-from documents.plugins.date_parsing import get_date_parser
 from documents.plugins.helpers import ProgressManager
 from documents.plugins.helpers import ProgressStatusOptions
 from documents.signals import document_consumption_finished
@@ -426,8 +426,7 @@ class ConsumerPlugin(
             ProgressStatusOptions.WORKING,
             ConsumerStatusShortMessage.PARSE_DATE,
         )
-        with get_date_parser() as date_parser:
-            date = next(date_parser.parse(self.filename, text), None)
+        date = parse_date(self.filename, text)

         archive_path = document_parser.get_archive_path()
         page_count = document_parser.get_page_count(self.working_copy, mime_type)

View File

@@ -9,17 +9,22 @@ import subprocess
 import tempfile
 from functools import lru_cache
 from pathlib import Path
+from re import Match
 from typing import TYPE_CHECKING

 from django.conf import settings
+from django.utils import timezone

 from documents.loggers import LoggingMixin
 from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.config import OcrConfig
+from paperless.utils import ocr_to_dateparser_languages

 if TYPE_CHECKING:
     import datetime
+    from collections.abc import Iterator

 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@@ -254,6 +259,75 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -
     return out_path


+def parse_date(filename, text) -> datetime.datetime | None:
+    return next(parse_date_generator(filename, text), None)
+
+
+def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
+    """
+    Returns the date of the document.
+    """
+
+    def __parser(ds: str, date_order: str) -> datetime.datetime:
+        """
+        Call dateparser.parse with a particular date ordering
+        """
+        import dateparser
+
+        ocr_config = OcrConfig()
+        languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
+            ocr_config.language,
+        )
+        return dateparser.parse(
+            ds,
+            settings={
+                "DATE_ORDER": date_order,
+                "PREFER_DAY_OF_MONTH": "first",
+                "RETURN_AS_TIMEZONE_AWARE": True,
+                "TIMEZONE": settings.TIME_ZONE,
+            },
+            locales=languages,
+        )
+
+    def __filter(date: datetime.datetime) -> datetime.datetime | None:
+        if (
+            date is not None
+            and date.year > 1900
+            and date <= timezone.now()
+            and date.date() not in settings.IGNORE_DATES
+        ):
+            return date
+        return None
+
+    def __process_match(
+        match: Match[str],
+        date_order: str,
+    ) -> datetime.datetime | None:
+        date_string = match.group(0)
+
+        try:
+            date = __parser(date_string, date_order)
+        except Exception:
+            # Skip all matches that do not parse to a proper date
+            date = None
+
+        return __filter(date)
+
+    def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
+        for m in re.finditer(DATE_REGEX, content):
+            date = __process_match(m, date_order)
+            if date is not None:
+                yield date
+
+    # if filename date parsing is enabled, search there first:
+    if settings.FILENAME_DATE_ORDER:
+        yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
+
+    # Iterate through all regex matches in text and try to parse the date
+    yield from __process_content(text, settings.DATE_ORDER)
+
+
 class ParseError(Exception):
     pass

View File

@@ -1,100 +0,0 @@
import logging
from functools import lru_cache
from importlib.metadata import EntryPoint
from importlib.metadata import entry_points
from typing import Final
from django.conf import settings
from django.utils import timezone
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.base import DateParserPluginBase
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
from paperless.utils import ocr_to_dateparser_languages
logger = logging.getLogger(__name__)
DATE_PARSER_ENTRY_POINT_GROUP: Final = "paperless_ngx.date_parsers"
@lru_cache(maxsize=1)
def _discover_parser_class() -> type[DateParserPluginBase]:
"""
Discovers the date parser plugin class to use.
- If one or more plugins are found, sorts them by name and returns the first.
- If no plugins are found, returns the default RegexDateParser.
"""
eps: tuple[EntryPoint, ...]
try:
eps = entry_points(group=DATE_PARSER_ENTRY_POINT_GROUP)
except Exception as e:
# Log a warning
logger.warning(f"Could not query entry points for date parsers: {e}")
eps = ()
valid_plugins: list[EntryPoint] = []
for ep in eps:
try:
plugin_class = ep.load()
if plugin_class and issubclass(plugin_class, DateParserPluginBase):
valid_plugins.append(ep)
else:
logger.warning(f"Plugin {ep.name} does not subclass DateParser.")
except Exception as e:
logger.error(f"Unable to load date parser plugin {ep.name}: {e}")
if not valid_plugins:
return RegexDateParserPlugin
valid_plugins.sort(key=lambda ep: ep.name)
if len(valid_plugins) > 1:
logger.warning(
f"Multiple date parsers found: "
f"{[ep.name for ep in valid_plugins]}. "
f"Using the first one by name: '{valid_plugins[0].name}'.",
)
return valid_plugins[0].load()
def get_date_parser() -> DateParserPluginBase:
"""
Factory function to get an initialized date parser instance.
This function is responsible for:
1. Discovering the correct parser class (plugin or default).
2. Loading configuration from Django settings.
3. Instantiating the parser with the configuration.
"""
# 1. Discover the class (this is cached)
parser_class = _discover_parser_class()
# 2. Load configuration from settings
# TODO: Get the language from the settings and/or configuration object, depending
languages = (
settings.DATE_PARSER_LANGUAGES
or ocr_to_dateparser_languages(settings.OCR_LANGUAGE)
)
config = DateParserConfig(
languages=languages,
timezone_str=settings.TIME_ZONE,
ignore_dates=settings.IGNORE_DATES,
reference_time=timezone.now(),
filename_date_order=settings.FILENAME_DATE_ORDER,
content_date_order=settings.DATE_ORDER,
)
# 3. Instantiate the discovered class with the config
return parser_class(config=config)
__all__ = [
"DateParserConfig",
"DateParserPluginBase",
"RegexDateParserPlugin",
"get_date_parser",
]

View File

@@ -1,124 +0,0 @@
import datetime
import logging
from abc import ABC
from abc import abstractmethod
from collections.abc import Iterator
from dataclasses import dataclass
from types import TracebackType
try:
from typing import Self
except ImportError:
from typing_extensions import Self
import dateparser
logger = logging.getLogger(__name__)
@dataclass(frozen=True, slots=True)
class DateParserConfig:
"""
Configuration for a DateParser instance.
This object is created by the factory and passed to the
parser's constructor, decoupling the parser from settings.
"""
languages: list[str]
timezone_str: str
ignore_dates: set[datetime.date]
# A "now" timestamp for filtering future dates.
# Passed in by the factory.
reference_time: datetime.datetime
# Settings for the default RegexDateParser
# Other plugins should use or consider these, but it is not required
filename_date_order: str | None
content_date_order: str
class DateParserPluginBase(ABC):
"""
Abstract base class for date parsing strategies.
Instances are configured via a DateParserConfig object.
"""
def __init__(self, config: DateParserConfig):
"""
Initializes the parser with its configuration.
"""
self.config = config
def __enter__(self) -> Self:
"""
Enter the runtime context related to this object.
Subclasses can override this to acquire resources (connections, handles).
"""
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_val: BaseException | None,
exc_tb: TracebackType | None,
) -> None:
"""
Exit the runtime context related to this object.
Subclasses can override this to release resources.
"""
# Default implementation does nothing.
# Returning None implies exceptions are propagated.
def _parse_string(
self,
date_string: str,
date_order: str,
) -> datetime.datetime | None:
"""
Helper method to parse a single date string using dateparser.
Uses configuration from `self.config`.
"""
try:
return dateparser.parse(
date_string,
settings={
"DATE_ORDER": date_order,
"PREFER_DAY_OF_MONTH": "first",
"RETURN_AS_TIMEZONE_AWARE": True,
"TIMEZONE": self.config.timezone_str,
},
locales=self.config.languages,
)
except Exception as e:
logger.error(f"Error while parsing date string '{date_string}': {e}")
return None
def _filter_date(
self,
date: datetime.datetime | None,
) -> datetime.datetime | None:
"""
Helper method to validate a parsed datetime object.
Uses configuration from `self.config`.
"""
if (
date is not None
and date.year > 1900
and date <= self.config.reference_time
and date.date() not in self.config.ignore_dates
):
return date
return None
@abstractmethod
def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
"""
Parses a document's filename and content, yielding valid datetime objects.
"""

View File

@@ -1,65 +0,0 @@
import datetime
import re
from collections.abc import Iterator
from re import Match
from documents.plugins.date_parsing.base import DateParserPluginBase
class RegexDateParserPlugin(DateParserPluginBase):
"""
The default date parser, using a series of regular expressions.
It is configured entirely by the DateParserConfig object
passed to its constructor.
"""
DATE_REGEX = re.compile(
r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞğü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))(\d{1,2}[^ 0-9]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞğü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
re.IGNORECASE,
)
def _process_match(
self,
match: Match[str],
date_order: str,
) -> datetime.datetime | None:
"""
Processes a single regex match using the base class helpers.
"""
date_string = match.group(0)
date = self._parse_string(date_string, date_order)
return self._filter_date(date)
def _process_content(
self,
content: str,
date_order: str,
) -> Iterator[datetime.datetime]:
"""
Finds all regex matches in content and yields valid dates.
"""
for m in re.finditer(self.DATE_REGEX, content):
date = self._process_match(m, date_order)
if date is not None:
yield date
def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
"""
Implementation of the abstract parse method.
Reads its configuration from `self.config`.
"""
if self.config.filename_date_order:
yield from self._process_content(
filename,
self.config.filename_date_order,
)
yield from self._process_content(content, self.config.content_date_order)

View File

@@ -1,82 +0,0 @@
import datetime
from collections.abc import Generator
from typing import Any
import pytest
import pytest_django
from documents.plugins.date_parsing import _discover_parser_class
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
@pytest.fixture
def base_config() -> DateParserConfig:
"""Basic configuration for date parser testing."""
return DateParserConfig(
languages=["en"],
timezone_str="UTC",
ignore_dates=set(),
reference_time=datetime.datetime(
2024,
1,
15,
12,
0,
0,
tzinfo=datetime.timezone.utc,
),
filename_date_order="YMD",
content_date_order="DMY",
)
@pytest.fixture
def config_with_ignore_dates() -> DateParserConfig:
"""Configuration with dates to ignore."""
return DateParserConfig(
languages=["en", "de"],
timezone_str="America/New_York",
ignore_dates={datetime.date(2024, 1, 1), datetime.date(2024, 12, 25)},
reference_time=datetime.datetime(
2024,
1,
15,
12,
0,
0,
tzinfo=datetime.timezone.utc,
),
filename_date_order="DMY",
content_date_order="MDY",
)
@pytest.fixture
def regex_parser(base_config: DateParserConfig) -> RegexDateParserPlugin:
"""Instance of RegexDateParser with base config."""
return RegexDateParserPlugin(base_config)
@pytest.fixture
def clear_lru_cache() -> Generator[None, None, None]:
"""
Ensure the LRU cache for _discover_parser_class is cleared
before and after any test that depends on it.
"""
_discover_parser_class.cache_clear()
yield
_discover_parser_class.cache_clear()
@pytest.fixture
def mock_date_parser_settings(settings: pytest_django.fixtures.SettingsWrapper) -> Any:
"""
Override Django settings for the duration of date parser tests.
"""
settings.DATE_PARSER_LANGUAGES = ["en", "de"]
settings.TIME_ZONE = "UTC"
settings.IGNORE_DATES = [datetime.date(1900, 1, 1)]
settings.FILENAME_DATE_ORDER = "YMD"
settings.DATE_ORDER = "DMY"
return settings

View File

@@ -1,228 +0,0 @@
import datetime
import logging
from collections.abc import Iterator
from importlib.metadata import EntryPoint
import pytest
import pytest_mock
from django.utils import timezone
from documents.plugins.date_parsing import DATE_PARSER_ENTRY_POINT_GROUP
from documents.plugins.date_parsing import _discover_parser_class
from documents.plugins.date_parsing import get_date_parser
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.base import DateParserPluginBase
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
class AlphaParser(DateParserPluginBase):
def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
yield timezone.now()
class BetaParser(DateParserPluginBase):
def parse(self, filename: str, content: str) -> Iterator[datetime.datetime]:
yield timezone.now()
@pytest.mark.date_parsing
@pytest.mark.usefixtures("clear_lru_cache")
class TestDiscoverParserClass:
"""Tests for the _discover_parser_class() function."""
def test_returns_default_when_no_plugins_found(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
mocker.patch(
"documents.plugins.date_parsing.entry_points",
return_value=(),
)
result = _discover_parser_class()
assert result is RegexDateParserPlugin
def test_returns_default_when_entrypoint_query_fails(
self,
mocker: pytest_mock.MockerFixture,
caplog: pytest.LogCaptureFixture,
) -> None:
mocker.patch(
"documents.plugins.date_parsing.entry_points",
side_effect=RuntimeError("boom"),
)
result = _discover_parser_class()
assert result is RegexDateParserPlugin
assert "Could not query entry points" in caplog.text
def test_filters_out_invalid_plugins(
self,
mocker: pytest_mock.MockerFixture,
caplog: pytest.LogCaptureFixture,
) -> None:
fake_ep = mocker.MagicMock(spec=EntryPoint)
fake_ep.name = "bad_plugin"
fake_ep.load.return_value = object # not subclass of DateParser
mocker.patch(
"documents.plugins.date_parsing.entry_points",
return_value=(fake_ep,),
)
result = _discover_parser_class()
assert result is RegexDateParserPlugin
assert "does not subclass DateParser" in caplog.text
def test_skips_plugins_that_fail_to_load(
self,
mocker: pytest_mock.MockerFixture,
caplog: pytest.LogCaptureFixture,
) -> None:
fake_ep = mocker.MagicMock(spec=EntryPoint)
fake_ep.name = "failing_plugin"
fake_ep.load.side_effect = ImportError("cannot import")
mocker.patch(
"documents.plugins.date_parsing.entry_points",
return_value=(fake_ep,),
)
result = _discover_parser_class()
assert result is RegexDateParserPlugin
assert "Unable to load date parser plugin failing_plugin" in caplog.text
def test_returns_single_valid_plugin_without_warning(
self,
mocker: pytest_mock.MockerFixture,
caplog: pytest.LogCaptureFixture,
) -> None:
"""If exactly one valid plugin is discovered, it should be returned without logging a warning."""
ep = mocker.MagicMock(spec=EntryPoint)
ep.name = "alpha"
ep.load.return_value = AlphaParser
mock_entry_points = mocker.patch(
"documents.plugins.date_parsing.entry_points",
return_value=(ep,),
)
with caplog.at_level(
logging.WARNING,
logger="documents.plugins.date_parsing",
):
result = _discover_parser_class()
# It should have called entry_points with the correct group
mock_entry_points.assert_called_once_with(group=DATE_PARSER_ENTRY_POINT_GROUP)
# The discovered class should be exactly our AlphaParser
assert result is AlphaParser
# No warnings should have been logged
assert not any(
"Multiple date parsers found" in record.message for record in caplog.records
), "Unexpected warning logged when only one plugin was found"
def test_returns_first_valid_plugin_by_name(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
ep_a = mocker.MagicMock(spec=EntryPoint)
ep_a.name = "alpha"
ep_a.load.return_value = AlphaParser
ep_b = mocker.MagicMock(spec=EntryPoint)
ep_b.name = "beta"
ep_b.load.return_value = BetaParser
mocker.patch(
"documents.plugins.date_parsing.entry_points",
return_value=(ep_b, ep_a),
)
result = _discover_parser_class()
assert result is AlphaParser
def test_logs_warning_if_multiple_plugins_found(
self,
mocker: pytest_mock.MockerFixture,
caplog: pytest.LogCaptureFixture,
) -> None:
ep1 = mocker.MagicMock(spec=EntryPoint)
ep1.name = "a"
ep1.load.return_value = AlphaParser
ep2 = mocker.MagicMock(spec=EntryPoint)
ep2.name = "b"
ep2.load.return_value = BetaParser
mocker.patch(
"documents.plugins.date_parsing.entry_points",
return_value=(ep1, ep2),
)
with caplog.at_level(
logging.WARNING,
logger="documents.plugins.date_parsing",
):
result = _discover_parser_class()
# Should select alphabetically first plugin ("a")
assert result is AlphaParser
# Should log a warning mentioning multiple parsers
assert any(
"Multiple date parsers found" in record.message for record in caplog.records
), "Expected a warning about multiple date parsers"
def test_cache_behavior_only_runs_once(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
mock_entry_points = mocker.patch(
"documents.plugins.date_parsing.entry_points",
return_value=(),
)
# First call populates cache
_discover_parser_class()
# Second call should not re-invoke entry_points
_discover_parser_class()
mock_entry_points.assert_called_once()
@pytest.mark.date_parsing
@pytest.mark.usefixtures("mock_date_parser_settings")
class TestGetDateParser:
"""Tests for the get_date_parser() factory function."""
def test_returns_instance_of_discovered_class(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
mocker.patch(
"documents.plugins.date_parsing._discover_parser_class",
return_value=AlphaParser,
)
parser = get_date_parser()
assert isinstance(parser, AlphaParser)
assert isinstance(parser.config, DateParserConfig)
assert parser.config.languages == ["en", "de"]
assert parser.config.timezone_str == "UTC"
assert parser.config.ignore_dates == [datetime.date(1900, 1, 1)]
assert parser.config.filename_date_order == "YMD"
assert parser.config.content_date_order == "DMY"
# Check reference_time near now
delta = abs((parser.config.reference_time - timezone.now()).total_seconds())
assert delta < 2
def test_uses_default_regex_parser_when_no_plugins(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
mocker.patch(
"documents.plugins.date_parsing._discover_parser_class",
return_value=RegexDateParserPlugin,
)
parser = get_date_parser()
assert isinstance(parser, RegexDateParserPlugin)

View File

@@ -1,433 +0,0 @@
import datetime
import logging
from typing import Any
import pytest
import pytest_mock
from documents.plugins.date_parsing.base import DateParserConfig
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
@pytest.mark.date_parsing
class TestParseString:
"""Tests for DateParser._parse_string method via RegexDateParser."""
@pytest.mark.parametrize(
("date_string", "date_order", "expected_year"),
[
pytest.param("15/01/2024", "DMY", 2024, id="dmy_slash"),
pytest.param("01/15/2024", "MDY", 2024, id="mdy_slash"),
pytest.param("2024/01/15", "YMD", 2024, id="ymd_slash"),
pytest.param("January 15, 2024", "DMY", 2024, id="month_name_comma"),
pytest.param("15 Jan 2024", "DMY", 2024, id="day_abbr_month_year"),
pytest.param("15.01.2024", "DMY", 2024, id="dmy_dot"),
pytest.param("2024-01-15", "YMD", 2024, id="ymd_dash"),
],
)
def test_parse_string_valid_formats(
self,
regex_parser: RegexDateParserPlugin,
date_string: str,
date_order: str,
expected_year: int,
) -> None:
"""Should correctly parse various valid date formats."""
result = regex_parser._parse_string(date_string, date_order)
assert result is not None
assert result.year == expected_year
@pytest.mark.parametrize(
"invalid_string",
[
pytest.param("not a date", id="plain_text"),
pytest.param("32/13/2024", id="invalid_day_month"),
pytest.param("", id="empty_string"),
pytest.param("abc123xyz", id="alphanumeric_gibberish"),
pytest.param("99/99/9999", id="out_of_range"),
],
)
def test_parse_string_invalid_input(
self,
regex_parser: RegexDateParserPlugin,
invalid_string: str,
) -> None:
"""Should return None for invalid date strings."""
result = regex_parser._parse_string(invalid_string, "DMY")
assert result is None
def test_parse_string_handles_exceptions(
self,
caplog: pytest.LogCaptureFixture,
mocker: pytest_mock.MockerFixture,
regex_parser: RegexDateParserPlugin,
) -> None:
"""Should handle and log exceptions from dateparser gracefully."""
with caplog.at_level(
logging.ERROR,
logger="documents.plugins.date_parsing.base",
):
# We still need to mock dateparser.parse to force the exception
mocker.patch(
"documents.plugins.date_parsing.base.dateparser.parse",
side_effect=ValueError(
"Parsing error: 01/01/2024",
),
)
# 1. Execute the function under test
result = regex_parser._parse_string("01/01/2024", "DMY")
assert result is None
# Check if an error was logged
assert len(caplog.records) == 1
assert caplog.records[0].levelname == "ERROR"
# Check if the specific error message is present
assert "Error while parsing date string" in caplog.text
# Optional: Check for the exact exception message if it's included in the log
assert "Parsing error: 01/01/2024" in caplog.text
@pytest.mark.date_parsing
class TestFilterDate:
"""Tests for DateParser._filter_date method via RegexDateParser."""
@pytest.mark.parametrize(
("date", "expected_output"),
[
# Valid Dates
pytest.param(
datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
datetime.datetime(2024, 1, 10, tzinfo=datetime.timezone.utc),
id="valid_past_date",
),
pytest.param(
datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
datetime.datetime(2024, 1, 15, 12, 0, 0, tzinfo=datetime.timezone.utc),
id="exactly_at_reference",
),
pytest.param(
datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
datetime.datetime(1901, 1, 1, tzinfo=datetime.timezone.utc),
id="year_1901_valid",
),
# Date is > reference_time
pytest.param(
datetime.datetime(2024, 1, 16, tzinfo=datetime.timezone.utc),
None,
id="future_date_day_after",
),
# date.date() in ignore_dates
pytest.param(
datetime.datetime(2024, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc),
None,
id="ignored_date_midnight_jan1",
),
pytest.param(
datetime.datetime(2024, 1, 1, 10, 30, 0, tzinfo=datetime.timezone.utc),
None,
id="ignored_date_midday_jan1",
),
pytest.param(
datetime.datetime(2024, 12, 25, 15, 0, 0, tzinfo=datetime.timezone.utc),
None,
id="ignored_date_dec25_future",
),
# date.year <= 1900
pytest.param(
datetime.datetime(1899, 12, 31, tzinfo=datetime.timezone.utc),
None,
id="year_1899",
),
pytest.param(
datetime.datetime(1900, 1, 1, tzinfo=datetime.timezone.utc),
None,
id="year_1900_boundary",
),
# date is None
pytest.param(None, None, id="none_input"),
],
)
def test_filter_date_validation_rules(
self,
config_with_ignore_dates: DateParserConfig,
date: datetime.datetime | None,
expected_output: datetime.datetime | None,
) -> None:
"""Should correctly validate dates against various rules."""
parser = RegexDateParserPlugin(config_with_ignore_dates)
result = parser._filter_date(date)
assert result == expected_output
def test_filter_date_respects_ignore_dates(
self,
config_with_ignore_dates: DateParserConfig,
) -> None:
"""Should filter out dates in the ignore_dates set."""
parser = RegexDateParserPlugin(config_with_ignore_dates)
ignored_date = datetime.datetime(
2024,
1,
1,
12,
0,
tzinfo=datetime.timezone.utc,
)
another_ignored = datetime.datetime(
2024,
12,
25,
15,
30,
tzinfo=datetime.timezone.utc,
)
allowed_date = datetime.datetime(
2024,
1,
2,
12,
0,
tzinfo=datetime.timezone.utc,
)
assert parser._filter_date(ignored_date) is None
assert parser._filter_date(another_ignored) is None
assert parser._filter_date(allowed_date) == allowed_date
def test_filter_date_timezone_aware(
self,
regex_parser: RegexDateParserPlugin,
) -> None:
"""Should work with timezone-aware datetimes."""
date_utc = datetime.datetime(2024, 1, 10, 12, 0, tzinfo=datetime.timezone.utc)
result = regex_parser._filter_date(date_utc)
assert result is not None
assert result.tzinfo is not None
@pytest.mark.date_parsing
class TestRegexDateParser:
@pytest.mark.parametrize(
("filename", "content", "expected"),
[
pytest.param(
"report-2023-12-25.txt",
"Event recorded on 25/12/2022.",
[
datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc),
datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
],
id="filename-y-m-d_and_content-d-m-y",
),
pytest.param(
"img_2023.01.02.jpg",
"Taken on 01/02/2023",
[
datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc),
datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc),
],
id="ambiguous-dates-respect-orders",
),
pytest.param(
"notes.txt",
"bad date 99/99/9999 and 25/12/2022",
[
datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
],
id="parse-exception-skips-bad-and-yields-good",
),
],
)
def test_parse_returns_expected_dates(
self,
base_config: DateParserConfig,
mocker: pytest_mock.MockerFixture,
filename: str,
content: str,
expected: list[datetime.datetime],
) -> None:
"""
High-level tests that exercise RegexDateParser.parse only.
dateparser.parse is mocked so tests are deterministic.
"""
parser = RegexDateParserPlugin(base_config)
# Patch the dateparser.parse
target = "documents.plugins.date_parsing.base.dateparser.parse"
def fake_parse(
date_string: str,
settings: dict[str, Any] | None = None,
locales: None = None,
) -> datetime.datetime | None:
date_order = settings.get("DATE_ORDER") if settings else None
# Filename-style YYYY-MM-DD / YYYY.MM.DD
if "2023-12-25" in date_string or "2023.12.25" in date_string:
return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
# content DMY 25/12/2022
if "25/12/2022" in date_string or "25-12-2022" in date_string:
return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
# filename YMD 2023.01.02
if "2023.01.02" in date_string or "2023-01-02" in date_string:
return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
# ambiguous 01/02/2023 -> respect DATE_ORDER setting
if "01/02/2023" in date_string:
if date_order == "DMY":
return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
if date_order == "YMD":
return datetime.datetime(2023, 1, 2, tzinfo=datetime.timezone.utc)
# fallback
return datetime.datetime(2023, 2, 1, tzinfo=datetime.timezone.utc)
# simulate parse failure for malformed input
if "99/99/9999" in date_string or "bad date" in date_string:
raise Exception("parse failed for malformed date")
return None
mocker.patch(target, side_effect=fake_parse)
results = list(parser.parse(filename, content))
assert results == expected
for dt in results:
assert dt.tzinfo is not None
def test_parse_filters_future_and_ignored_dates(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
"""
Ensure parser filters out:
- dates after reference_time
- dates whose .date() are in ignore_dates
"""
cfg = DateParserConfig(
languages=["en"],
timezone_str="UTC",
ignore_dates={datetime.date(2023, 12, 10)},
reference_time=datetime.datetime(
2024,
1,
15,
12,
0,
0,
tzinfo=datetime.timezone.utc,
),
filename_date_order="YMD",
content_date_order="DMY",
)
parser = RegexDateParserPlugin(cfg)
target = "documents.plugins.date_parsing.base.dateparser.parse"
def fake_parse(
date_string: str,
settings: dict[str, Any] | None = None,
locales: None = None,
) -> datetime.datetime | None:
if "10/12/2023" in date_string or "10-12-2023" in date_string:
# ignored date
return datetime.datetime(2023, 12, 10, tzinfo=datetime.timezone.utc)
if "01/02/2024" in date_string or "01-02-2024" in date_string:
# future relative to reference_time -> filtered
return datetime.datetime(2024, 2, 1, tzinfo=datetime.timezone.utc)
if "05/01/2023" in date_string or "05-01-2023" in date_string:
# valid
return datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)
return None
mocker.patch(target, side_effect=fake_parse)
content = "Ignored: 10/12/2023, Future: 01/02/2024, Keep: 05/01/2023"
results = list(parser.parse("whatever.txt", content))
assert results == [datetime.datetime(2023, 1, 5, tzinfo=datetime.timezone.utc)]
def test_parse_handles_no_matches_and_returns_empty_list(
self,
base_config: DateParserConfig,
) -> None:
"""
When there are no matching date-like substrings, parse should yield nothing.
"""
parser = RegexDateParserPlugin(base_config)
results = list(
parser.parse("no-dates.txt", "this has no dates whatsoever"),
)
assert results == []
def test_parse_skips_filename_when_filename_date_order_none(
self,
mocker: pytest_mock.MockerFixture,
) -> None:
"""
When filename_date_order is None the parser must not attempt to parse the filename.
Only dates found in the content should be passed to dateparser.parse.
"""
cfg = DateParserConfig(
languages=["en"],
timezone_str="UTC",
ignore_dates=set(),
reference_time=datetime.datetime(
2024,
1,
15,
12,
0,
0,
tzinfo=datetime.timezone.utc,
),
filename_date_order=None,
content_date_order="DMY",
)
parser = RegexDateParserPlugin(cfg)
# Patch the module's dateparser.parse so we can inspect calls
target = "documents.plugins.date_parsing.base.dateparser.parse"
def fake_parse(
date_string: str,
settings: dict[str, Any] | None = None,
locales: None = None,
) -> datetime.datetime | None:
# return distinct datetimes so we can tell which source was parsed
if "25/12/2022" in date_string:
return datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc)
if "2023-12-25" in date_string:
return datetime.datetime(2023, 12, 25, tzinfo=datetime.timezone.utc)
return None
mock = mocker.patch(target, side_effect=fake_parse)
filename = "report-2023-12-25.txt"
content = "Event recorded on 25/12/2022."
results = list(parser.parse(filename, content))
# Only the content date should have been parsed -> one call
assert mock.call_count == 1
# first call, first positional arg
called_date_string = mock.call_args_list[0][0][0]
assert "25/12/2022" in called_date_string
# And the parser should have yielded the corresponding datetime
assert results == [
datetime.datetime(2022, 12, 25, tzinfo=datetime.timezone.utc),
]

View File

@@ -1978,11 +1978,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
         self.assertEqual(response.status_code, status.HTTP_200_OK)

-    @mock.patch("documents.views.get_date_parser")
+    @mock.patch("documents.parsers.parse_date_generator")
     @override_settings(NUMBER_OF_SUGGESTED_DATES=0)
     def test_get_suggestions_dates_disabled(
         self,
-        mock_get_date_parser: mock.MagicMock,
+        parse_date_generator,
     ):
         """
         GIVEN:
@@ -1999,8 +1999,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         )

         self.client.get(f"/api/documents/{doc.pk}/suggestions/")
-        mock_get_date_parser.assert_not_called()
+        self.assertFalse(parse_date_generator.called)

     def test_saved_views(self):
         u1 = User.objects.create_superuser("user1")

View File

@@ -0,0 +1,538 @@
import datetime
from zoneinfo import ZoneInfo
import pytest
from pytest_django.fixtures import SettingsWrapper
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
@pytest.mark.django_db()
class TestDate:
def test_date_format_1(self):
text = "lorem ipsum 130218 lorem ipsum"
assert parse_date("", text) is None
def test_date_format_2(self):
text = "lorem ipsum 2018 lorem ipsum"
assert parse_date("", text) is None
def test_date_format_3(self):
text = "lorem ipsum 20180213 lorem ipsum"
assert parse_date("", text) is None
def test_date_format_4(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 13.02.2018 lorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
def test_date_format_5(self, settings_timezone: ZoneInfo):
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
def test_date_format_6(self):
text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum"
)
assert parse_date("", text) is None
def test_date_format_7(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
date = parse_date("", text)
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
def test_date_format_8(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = (
"lorem ipsum\n"
"Wohnort\n"
"3100\n"
"IBAN\n"
"AT87 4534\n"
"1234\n"
"1234 5678\n"
"BIC\n"
"lorem ipsum\n"
"März 2020"
)
assert parse_date("", text) == datetime.datetime(
2020,
3,
1,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_9(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["de"]
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
assert parse_date("", text) == datetime.datetime(
2020,
3,
1,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_10(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_11(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_12(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_13(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_14(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_15(self):
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_16(self):
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_17(self):
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_18(self):
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
assert parse_date("", text) is None
def test_date_format_19(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
21,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_20(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
22,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_21(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
2,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_22(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
23,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_23(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
24,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_24(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
21,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_25(self, settings_timezone: ZoneInfo):
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
assert parse_date("", text) == datetime.datetime(
2022,
3,
25,
0,
0,
tzinfo=settings_timezone,
)
def test_date_format_26(self, settings_timezone: ZoneInfo):
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
assert parse_date("", text) == datetime.datetime(
2019,
9,
25,
0,
0,
tzinfo=settings_timezone,
)
def test_crazy_date_past(self):
assert parse_date("", "01-07-0590 00:00:00") is None
def test_crazy_date_future(self):
assert parse_date("", "01-07-2350 00:00:00") is None
def test_crazy_date_with_spaces(self):
assert parse_date("", "20 408000l 2475") is None
def test_utf_month_names(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
assert parse_date("", "13 décembre 2023") == datetime.datetime(
2023,
12,
13,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "13 août 2022") == datetime.datetime(
2022,
8,
13,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "11 März 2020") == datetime.datetime(
2020,
3,
11,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
2018,
3,
17,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "1. veljače 2016.") == datetime.datetime(
2016,
2,
1,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "15. února 1985") == datetime.datetime(
1985,
2,
15,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "30. září 2011") == datetime.datetime(
2011,
9,
30,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "28. května 1990") == datetime.datetime(
1990,
5,
28,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "1. grudzień 1997") == datetime.datetime(
1997,
12,
1,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "17 Şubat 2024") == datetime.datetime(
2024,
2,
17,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
2012,
8,
30,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "17 Eylül 2000") == datetime.datetime(
2000,
9,
17,
0,
0,
tzinfo=settings_timezone,
)
assert parse_date("", "5. október 1992") == datetime.datetime(
1992,
10,
5,
0,
0,
tzinfo=settings_timezone,
)
def test_multiple_dates(self, settings_timezone: ZoneInfo):
text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and December 2021.
But not 24-12-9999 because it's in the future..."""
dates = list(parse_date_generator("", text))
assert dates == [
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
datetime.datetime(
2022,
7,
22,
0,
0,
tzinfo=settings_timezone,
),
datetime.datetime(
2021,
12,
1,
0,
0,
tzinfo=settings_timezone,
),
]
def test_filename_date_parse_valid_ymd(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Year Month Day (YMD)
- Filename contains date matching the format
THEN:
- Should parse the date from the filename
"""
settings.FILENAME_DATE_ORDER = "YMD"
assert parse_date(
"/tmp/Scan-2022-04-01.pdf",
"No date in here",
) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
def test_filename_date_parse_valid_dmy(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Day Month Year (DMY)
- Filename contains date matching the format
THEN:
- Should parse the date from the filename
"""
settings.FILENAME_DATE_ORDER = "DMY"
assert parse_date(
"/tmp/Scan-10.01.2021.pdf",
"No date in here",
) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename includes no date
- File content includes no date
THEN:
- No date is parsed
"""
settings.FILENAME_DATE_ORDER = "YMD"
assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
def test_filename_date_ignored_use_content(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Year Month Day (YMD)
- Date order is Day Month Year (DMY, the default)
- Filename contains date matching the format
- Filename date is an ignored date
- File content includes a date
THEN:
- Should parse the date from the content not filename
"""
settings.FILENAME_DATE_ORDER = "YMD"
settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
assert parse_date(
"/tmp/Scan-2022-04-01.pdf",
"The matching date is 24.03.2022",
) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
def test_ignored_dates_default_order(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Ignore dates have been set
- File content includes ignored dates
- File content includes 1 non-ignored date
THEN:
- Should parse the date non-ignored date from content
"""
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
assert parse_date("", text) == datetime.datetime(
2018,
2,
13,
0,
0,
tzinfo=settings_timezone,
)
def test_ignored_dates_order_ymd(
self,
settings: SettingsWrapper,
settings_timezone: ZoneInfo,
):
"""
GIVEN:
- Ignore dates have been set
- Date order is Year Month Date (YMD)
- File content includes ignored dates
- File content includes 1 non-ignored date
THEN:
- Should parse the date non-ignored date from content
"""
settings.FILENAME_DATE_ORDER = "YMD"
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
assert parse_date("", text) == datetime.datetime(
2018,
2,
13,
0,
0,
tzinfo=settings_timezone,
)

View File

@@ -148,6 +148,7 @@ from documents.models import Workflow
 from documents.models import WorkflowAction
 from documents.models import WorkflowTrigger
 from documents.parsers import get_parser_class_for_mime_type
+from documents.parsers import parse_date_generator
 from documents.permissions import AcknowledgeTasksPermissions
 from documents.permissions import PaperlessAdminPermissions
 from documents.permissions import PaperlessNotePermissions
@@ -157,7 +158,6 @@ from documents.permissions import get_document_count_filter_for_user
 from documents.permissions import get_objects_for_user_owner_aware
 from documents.permissions import has_perms_owner_aware
 from documents.permissions import set_permissions_for_object
-from documents.plugins.date_parsing import get_date_parser
 from documents.schema import generate_object_with_permissions_schema
 from documents.serialisers import AcknowledgeTasksViewSerializer
 from documents.serialisers import BulkDownloadSerializer
@@ -1023,17 +1023,16 @@ class DocumentViewSet(
         dates = []
         if settings.NUMBER_OF_SUGGESTED_DATES > 0:
-            with get_date_parser() as date_parser:
-                gen = date_parser.parse(doc.filename, doc.content)
-                dates = sorted(
-                    {
-                        i
-                        for i in itertools.islice(
-                            gen,
-                            settings.NUMBER_OF_SUGGESTED_DATES,
-                        )
-                    },
-                )
+            gen = parse_date_generator(doc.filename, doc.content)
+            dates = sorted(
+                {
+                    i
+                    for i in itertools.islice(
+                        gen,
+                        settings.NUMBER_OF_SUGGESTED_DATES,
+                    )
+                },
+            )

         resp_data = {
             "correspondents": [
View File

@@ -3,7 +3,12 @@ import os
 import sys
 if __name__ == "__main__":
-    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
+    try:
+        from paperless_migration.detect import choose_settings_module
+        os.environ.setdefault("DJANGO_SETTINGS_MODULE", choose_settings_module())
+    except Exception:
+        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
     from django.core.management import execute_from_command_line

13
src/manage_migration.py Executable file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
import os
import sys
if __name__ == "__main__":
os.environ.setdefault(
"DJANGO_SETTINGS_MODULE",
"paperless_migration.settings",
)
from django.core.management import execute_from_command_line
execute_from_command_line(sys.argv)

View File

@@ -1,12 +1,18 @@
 import os
+try:
+    from paperless_migration.detect import choose_settings_module
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", choose_settings_module())
+except Exception:
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
 from django.core.asgi import get_asgi_application
 # Fetch Django ASGI application early to ensure AppRegistry is populated
 # before importing consumers and AuthMiddlewareStack that may import ORM
 # models.
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
 django_asgi_app = get_asgi_application()
 from channels.auth import AuthMiddlewareStack  # noqa: E402

View File

@@ -0,0 +1,7 @@
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
application = get_asgi_application()

View File

@@ -9,9 +9,14 @@ https://docs.djangoproject.com/en/1.10/howto/deployment/wsgi/
 import os
-from django.core.wsgi import get_wsgi_application
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
+try:
+    from paperless_migration.detect import choose_settings_module
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", choose_settings_module())
+except Exception:
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
+from django.core.wsgi import get_wsgi_application
 application = get_wsgi_application()

View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class PaperlessMigrationConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "paperless_migration"

View File

@@ -0,0 +1,28 @@
"""ASGI application for migration mode with WebSocket support."""
from __future__ import annotations
import os
from channels.auth import AuthMiddlewareStack
from channels.routing import ProtocolTypeRouter
from channels.routing import URLRouter
from channels.security.websocket import AllowedHostsOriginValidator
from django.core.asgi import get_asgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
# Initialize Django ASGI application early to ensure settings are loaded
django_asgi_app = get_asgi_application()
# Import routing after Django is initialized
from paperless_migration.routing import websocket_urlpatterns # noqa: E402
application = ProtocolTypeRouter(
{
"http": django_asgi_app,
"websocket": AllowedHostsOriginValidator(
AuthMiddlewareStack(URLRouter(websocket_urlpatterns)),
),
},
)

View File

@@ -0,0 +1,245 @@
"""WebSocket consumers for migration operations."""
from __future__ import annotations
import json
import logging
import os
import shutil
import tempfile
from pathlib import Path
from typing import Any
from channels.generic.websocket import AsyncWebsocketConsumer
from django.conf import settings
from paperless_migration.services.importer import ImportService
from paperless_migration.services.transform import TransformService
logger = logging.getLogger(__name__)
class MigrationConsumerBase(AsyncWebsocketConsumer):
"""Base consumer with common authentication and messaging logic."""
async def connect(self) -> None:
"""Authenticate and accept or reject the connection."""
user = self.scope.get("user")
session = self.scope.get("session", {})
if not user or not user.is_authenticated:
logger.warning("WebSocket connection rejected: not authenticated")
await self.close(code=4001)
return
if not user.is_superuser:
logger.warning("WebSocket connection rejected: not superuser")
await self.close(code=4003)
return
if not session.get("migration_code_ok"):
logger.warning("WebSocket connection rejected: migration code not verified")
await self.close(code=4002)
return
await self.accept()
logger.info("WebSocket connection accepted for user: %s", user.username)
async def disconnect(self, close_code: int) -> None:
"""Handle disconnection."""
logger.debug("WebSocket disconnected with code: %d", close_code)
async def receive(self, text_data: str | None = None, **kwargs: Any) -> None:
"""Handle incoming messages - triggers the operation."""
if text_data is None:
return
try:
data = json.loads(text_data)
except json.JSONDecodeError:
await self.send_error("Invalid JSON message")
return
action = data.get("action")
if action == "start":
await self.run_operation()
else:
await self.send_error(f"Unknown action: {action}")
async def run_operation(self) -> None:
"""Override in subclasses to run the specific operation."""
raise NotImplementedError
async def send_message(self, msg_type: str, **kwargs: Any) -> None:
"""Send a typed JSON message to the client."""
await self.send(text_data=json.dumps({"type": msg_type, **kwargs}))
async def send_log(self, message: str, level: str = "info") -> None:
"""Send a log message."""
await self.send_message("log", message=message, level=level)
async def send_progress(
self,
current: int,
total: int | None = None,
label: str = "",
) -> None:
"""Send a progress update."""
await self.send_message(
"progress",
current=current,
total=total,
label=label,
)
async def send_stats(self, stats: dict[str, Any]) -> None:
"""Send statistics update."""
await self.send_message("stats", **stats)
async def send_complete(
self,
duration: float,
*,
success: bool,
**kwargs: Any,
) -> None:
"""Send completion message."""
await self.send_message(
"complete",
success=success,
duration=duration,
**kwargs,
)
async def send_error(self, message: str) -> None:
"""Send an error message."""
await self.send_message("error", message=message)
class TransformConsumer(MigrationConsumerBase):
"""WebSocket consumer for transform operations."""
async def run_operation(self) -> None:
"""Run the transform operation."""
input_path = Path(settings.MIGRATION_EXPORT_PATH)
output_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
frequency = settings.MIGRATION_PROGRESS_FREQUENCY
if not input_path.exists():
await self.send_error(f"Export file not found: {input_path}")
return
if output_path.exists():
await self.send_error(
f"Output file already exists: {output_path}. "
"Delete it first to re-run transform.",
)
return
await self.send_log("Starting transform operation...")
service = TransformService(
input_path=input_path,
output_path=output_path,
update_frequency=frequency,
)
try:
async for update in service.run_async():
match update["type"]:
case "progress":
await self.send_progress(
current=update["completed"],
label=f"{update['completed']:,} rows processed",
)
if update.get("stats"):
await self.send_stats({"transformed": update["stats"]})
case "complete":
await self.send_complete(
success=True,
duration=update["duration"],
total_processed=update["total_processed"],
stats=update["stats"],
speed=update["speed"],
)
case "error":
await self.send_error(update["message"])
case "log":
await self.send_log(
update["message"],
update.get("level", "info"),
)
except Exception as exc:
logger.exception("Transform operation failed")
await self.send_error(f"Transform failed: {exc}")
class ImportConsumer(MigrationConsumerBase):
"""WebSocket consumer for import operations."""
async def run_operation(self) -> None:
"""Run the import operation (wipe, migrate, import)."""
export_path = Path(settings.MIGRATION_EXPORT_PATH)
transformed_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
imported_marker = Path(settings.MIGRATION_IMPORTED_PATH)
source_dir = export_path.parent
if not export_path.exists():
await self.send_error("Export file not found. Upload or re-check export.")
return
if not transformed_path.exists():
await self.send_error("Transformed file not found. Run transform first.")
return
await self.send_log("Preparing import operation...")
# Backup original manifest and swap in transformed version
backup_path: Path | None = None
try:
backup_fd, backup_name = tempfile.mkstemp(
prefix="manifest.v2.",
suffix=".json",
dir=source_dir,
)
os.close(backup_fd)
backup_path = Path(backup_name)
shutil.copy2(export_path, backup_path)
shutil.copy2(transformed_path, export_path)
await self.send_log("Manifest files prepared")
except Exception as exc:
await self.send_error(f"Failed to prepare import manifest: {exc}")
return
service = ImportService(
source_dir=source_dir,
imported_marker=imported_marker,
)
try:
async for update in service.run_async():
match update["type"]:
case "phase":
await self.send_log(f"Phase: {update['phase']}", level="info")
case "log":
await self.send_log(
update["message"],
update.get("level", "info"),
)
case "complete":
await self.send_complete(
success=update["success"],
duration=update["duration"],
)
case "error":
await self.send_error(update["message"])
except Exception as exc:
logger.exception("Import operation failed")
await self.send_error(f"Import failed: {exc}")
finally:
# Restore original manifest
if backup_path and backup_path.exists():
try:
shutil.move(str(backup_path), str(export_path))
except Exception as exc:
logger.warning("Failed to restore backup manifest: %s", exc)

View File

@@ -0,0 +1,150 @@
"""Lightweight detection to decide if we should boot migration mode."""
from __future__ import annotations
import logging
import os
import sqlite3
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
BASE_DIR = Path(__file__).resolve().parent.parent
_DOC_EXISTS_QUERY = "SELECT 1 FROM documents_document LIMIT 1;"
def _get_db_config() -> dict[str, Any]:
data_dir = Path(os.getenv("PAPERLESS_DATA_DIR", BASE_DIR.parent / "data")).resolve()
if not os.getenv("PAPERLESS_DBHOST"):
return {
"ENGINE": "sqlite",
"NAME": data_dir / "db.sqlite3",
}
engine = "mariadb" if os.getenv("PAPERLESS_DBENGINE") == "mariadb" else "postgres"
cfg = {
"ENGINE": engine,
"HOST": os.getenv("PAPERLESS_DBHOST"),
"PORT": os.getenv("PAPERLESS_DBPORT"),
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
"USER": os.getenv("PAPERLESS_DBUSER", "paperless"),
"PASSWORD": os.getenv("PAPERLESS_DBPASS", "paperless"),
}
return cfg
def _probe_sqlite(path: Path) -> bool:
if not path.exists():
return False
try:
conn = sqlite3.connect(path, timeout=1)
cur = conn.cursor()
cur.execute(_DOC_EXISTS_QUERY)
cur.fetchone()
return True
except sqlite3.Error:
return False
finally:
try:
conn.close()
except Exception:
pass
def _probe_postgres(cfg: dict[str, Any]) -> bool:
try:
import psycopg
except ImportError: # pragma: no cover
logger.debug("psycopg not installed; skipping postgres probe")
return False
try:
conn = psycopg.connect(
host=cfg["HOST"],
port=cfg["PORT"],
dbname=cfg["NAME"],
user=cfg["USER"],
password=cfg["PASSWORD"],
connect_timeout=2,
)
with conn, conn.cursor() as cur:
cur.execute(_DOC_EXISTS_QUERY)
cur.fetchone()
return True
except Exception:
return False
finally:
try:
conn.close()
except Exception:
pass
def _probe_mariadb(cfg: dict[str, Any]) -> bool:
try:
import MySQLdb # type: ignore
except ImportError: # pragma: no cover
logger.debug("mysqlclient not installed; skipping mariadb probe")
return False
try:
conn = MySQLdb.connect(
host=cfg["HOST"],
port=int(cfg["PORT"] or 3306),
user=cfg["USER"],
passwd=cfg["PASSWORD"],
db=cfg["NAME"],
connect_timeout=2,
)
cur = conn.cursor()
cur.execute("SELECT 1 FROM documents_document LIMIT 1;")
cur.fetchone()
return True
except Exception:
return False
finally:
try:
conn.close()
except Exception:
pass
def is_v2_database() -> bool:
cfg = _get_db_config()
if cfg["ENGINE"] == "sqlite":
return _probe_sqlite(cfg["NAME"])
if cfg["ENGINE"] == "postgres":
return _probe_postgres(cfg)
if cfg["ENGINE"] == "mariadb":
return _probe_mariadb(cfg)
return False
def choose_settings_module() -> str:
# ENV override
toggle = os.getenv("PAPERLESS_MIGRATION_MODE")
if toggle is not None:
chosen = (
"paperless_migration.settings"
if str(toggle).lower() in ("1", "true", "yes", "on")
else "paperless.settings"
)
os.environ["PAPERLESS_MIGRATION_MODE"] = "1" if "migration" in chosen else "0"
return chosen
# Auto-detect via DB probe
if is_v2_database():
logger.warning("Detected v2 schema; booting migration mode.")
os.environ["PAPERLESS_MIGRATION_MODE"] = "1"
return "paperless_migration.settings"
os.environ["PAPERLESS_MIGRATION_MODE"] = "0"
return "paperless.settings"
if __name__ == "__main__": # pragma: no cover
logger.info(
"v2 database detected" if is_v2_database() else "v2 database not detected",
)
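
A small usage sketch of the toggle precedence implemented above: the env override wins, and only in its absence does the module fall back to probing the configured database. Values are illustrative.

import os

from paperless_migration.detect import choose_settings_module

# Explicit override: no database probe is attempted.
os.environ["PAPERLESS_MIGRATION_MODE"] = "true"
assert choose_settings_module() == "paperless_migration.settings"
assert os.environ["PAPERLESS_MIGRATION_MODE"] == "1"  # normalized for child processes

# Without the override, choose_settings_module() probes the configured
# database for a reachable documents_document table (the v2 marker).
del os.environ["PAPERLESS_MIGRATION_MODE"]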

View File

@@ -0,0 +1,13 @@
"""WebSocket URL routing for migration operations."""
from __future__ import annotations
from django.urls import path
from paperless_migration.consumers import ImportConsumer
from paperless_migration.consumers import TransformConsumer
websocket_urlpatterns = [
path("ws/migration/transform/", TransformConsumer.as_asgi()),
path("ws/migration/import/", ImportConsumer.as_asgi()),
]

View File

@@ -0,0 +1,186 @@
"""Import service for loading transformed data into v3 database."""
from __future__ import annotations
import subprocess
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TypedDict
if TYPE_CHECKING:
from collections.abc import AsyncGenerator
from collections.abc import Generator
class ProgressUpdate(TypedDict, total=False):
"""Progress update message structure."""
type: str
phase: str
message: str
level: str
success: bool
duration: float
return_code: int
@dataclass
class ImportService:
"""Service for importing transformed data into v3 database.
This service orchestrates the three-phase import process:
1. Wipe the existing database
2. Run Django migrations for v3 schema
3. Import the transformed data
"""
source_dir: Path
imported_marker: Path
manage_path: Path | None = None
def __post_init__(self) -> None:
if self.manage_path is None:
# Default to manage.py in the src directory
self.manage_path = (
Path(__file__).resolve().parent.parent.parent / "manage.py"
)
def _get_env(self) -> dict[str, str]:
"""Get environment variables for subprocess calls."""
import os
env = os.environ.copy()
env["DJANGO_SETTINGS_MODULE"] = "paperless.settings"
env["PAPERLESS_MIGRATION_MODE"] = "0"
return env
def _run_command(
self,
args: list[str],
label: str,
) -> Generator[ProgressUpdate, None, int]:
"""Run a command and yield log lines. Returns the return code."""
yield {"type": "log", "message": f"Running: {label}", "level": "info"}
process = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=1,
text=True,
env=self._get_env(),
)
try:
if process.stdout:
for line in process.stdout:
yield {
"type": "log",
"message": line.rstrip(),
"level": "info",
}
process.wait()
return process.returncode
finally:
if process.poll() is None:
process.kill()
def run_sync(self) -> Generator[ProgressUpdate, None, None]:
"""Run the import synchronously, yielding progress updates.
This orchestrates:
1. Database wipe
2. Django migrations
3. Document import
"""
start_time = time.perf_counter()
# Phase 1: Wipe database
yield {"type": "phase", "phase": "wipe"}
wipe_cmd = [
sys.executable,
"-m",
"paperless_migration.services.wipe_db",
]
wipe_code = yield from self._run_command(wipe_cmd, "Database wipe")
if wipe_code != 0:
yield {
"type": "error",
"message": f"Database wipe failed with code {wipe_code}",
}
return
yield {"type": "log", "message": "Database wipe complete", "level": "info"}
# Phase 2: Run migrations
yield {"type": "phase", "phase": "migrate"}
migrate_cmd = [
sys.executable,
str(self.manage_path),
"migrate",
"--noinput",
]
migrate_code = yield from self._run_command(migrate_cmd, "Django migrations")
if migrate_code != 0:
yield {
"type": "error",
"message": f"Migrations failed with code {migrate_code}",
}
return
yield {"type": "log", "message": "Migrations complete", "level": "info"}
# Phase 3: Import data
yield {"type": "phase", "phase": "import"}
import_cmd = [
sys.executable,
str(self.manage_path),
"document_importer",
str(self.source_dir),
"--data-only",
]
import_code = yield from self._run_command(import_cmd, "Document import")
if import_code != 0:
yield {
"type": "error",
"message": f"Import failed with code {import_code}",
}
return
# Mark import as complete
try:
self.imported_marker.parent.mkdir(parents=True, exist_ok=True)
self.imported_marker.write_text("ok\n", encoding="utf-8")
except Exception as exc:
yield {
"type": "log",
"message": f"Warning: Could not write import marker: {exc}",
"level": "warning",
}
end_time = time.perf_counter()
duration = end_time - start_time
yield {
"type": "complete",
"success": True,
"duration": duration,
}
async def run_async(self) -> AsyncGenerator[ProgressUpdate, None]:
"""Run the import asynchronously, yielding progress updates.
This wraps the synchronous implementation to work with async consumers.
"""
import asyncio
for update in self.run_sync():
yield update
# Yield control to the event loop
await asyncio.sleep(0)
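
Driving the service outside a consumer looks roughly like this sketch; the paths are placeholders, not fixed locations from this PR.

from pathlib import Path

from paperless_migration.services.importer import ImportService

service = ImportService(
    source_dir=Path("/usr/src/paperless/export"),  # placeholder path
    imported_marker=Path("/usr/src/paperless/export/import.completed"),
)
for update in service.run_sync():
    if update["type"] == "phase":
        print(f"--- {update['phase']} ---")
    elif update["type"] == "log":
        print(update["message"])
    elif update["type"] == "error":
        raise SystemExit(update["message"])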

View File

@@ -0,0 +1,173 @@
"""Transform service for converting v2 exports to v3 format."""
from __future__ import annotations
import json
import time
from collections import Counter
from collections.abc import AsyncGenerator
from collections.abc import Callable
from collections.abc import Generator
from dataclasses import dataclass
from dataclasses import field
from typing import TYPE_CHECKING
from typing import Any
from typing import TypedDict
import ijson
if TYPE_CHECKING:
from pathlib import Path
class FixtureObject(TypedDict):
"""Structure of a Django fixture object."""
model: str
pk: int
fields: dict[str, Any]
class ProgressUpdate(TypedDict, total=False):
"""Progress update message structure."""
type: str
completed: int
stats: dict[str, int]
message: str
level: str
duration: float
total_processed: int
speed: float
TransformFn = Callable[[FixtureObject], FixtureObject]
def transform_documents_document(obj: FixtureObject) -> FixtureObject:
"""Transform a documents.document fixture object for v3 schema."""
fields: dict[str, Any] = obj["fields"]
fields.pop("storage_type", None)
content: Any = fields.get("content")
fields["content_length"] = len(content) if isinstance(content, str) else 0
return obj
# Registry of model-specific transforms
TRANSFORMS: dict[str, TransformFn] = {
"documents.document": transform_documents_document,
}
@dataclass
class TransformService:
"""Service for transforming v2 exports to v3 format.
This service processes JSON fixtures incrementally using ijson for
memory-efficient streaming, and yields progress updates suitable
for WebSocket transmission.
"""
input_path: Path
output_path: Path
update_frequency: int = 100
_stats: Counter[str] = field(default_factory=Counter, init=False)
_total_processed: int = field(default=0, init=False)
def validate(self) -> str | None:
"""Validate preconditions for transform. Returns error message or None."""
if not self.input_path.exists():
return f"Input file not found: {self.input_path}"
if self.output_path.exists():
return f"Output file already exists: {self.output_path}"
if self.input_path.resolve() == self.output_path.resolve():
return "Input and output paths cannot be the same file"
return None
def _process_fixture(self, obj: FixtureObject) -> FixtureObject:
"""Apply any registered transforms to a fixture object."""
model: str = obj["model"]
transform: TransformFn | None = TRANSFORMS.get(model)
if transform:
obj = transform(obj)
self._stats[model] += 1
return obj
def run_sync(self) -> Generator[ProgressUpdate, None, None]:
"""Run the transform synchronously, yielding progress updates.
This is the core implementation that processes the JSON file
and yields progress updates at regular intervals.
"""
error = self.validate()
if error:
yield {"type": "error", "message": error}
return
self._stats.clear()
self._total_processed = 0
start_time = time.perf_counter()
yield {"type": "log", "message": "Opening input file...", "level": "info"}
try:
with (
self.input_path.open("rb") as infile,
self.output_path.open("w", encoding="utf-8") as outfile,
):
outfile.write("[\n")
first = True
for i, obj in enumerate(ijson.items(infile, "item")):
fixture: FixtureObject = obj
fixture = self._process_fixture(fixture)
self._total_processed += 1
if not first:
outfile.write(",\n")
first = False
json.dump(fixture, outfile, ensure_ascii=False)
# Yield progress at configured frequency
if i > 0 and i % self.update_frequency == 0:
yield {
"type": "progress",
"completed": self._total_processed,
"stats": dict(self._stats),
}
outfile.write("\n]\n")
except Exception as exc:
# Clean up partial output on error
if self.output_path.exists():
self.output_path.unlink()
yield {"type": "error", "message": str(exc)}
return
end_time = time.perf_counter()
duration = end_time - start_time
speed = self._total_processed / duration if duration > 0 else 0
yield {
"type": "complete",
"duration": duration,
"total_processed": self._total_processed,
"stats": dict(self._stats),
"speed": speed,
}
async def run_async(self) -> AsyncGenerator[ProgressUpdate, None]:
"""Run the transform asynchronously, yielding progress updates.
This wraps the synchronous implementation to work with async consumers.
The actual I/O is done synchronously since ijson doesn't support async,
but we yield control periodically to keep the event loop responsive.
"""
import asyncio
for update in self.run_sync():
yield update
# Yield control to the event loop periodically
await asyncio.sleep(0)
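
A quick sanity check of the registered documents.document transform above; the fixture values are made up.

from paperless_migration.services.transform import transform_documents_document

fixture = {
    "model": "documents.document",
    "pk": 1,
    "fields": {"content": "hello", "storage_type": "unencrypted"},
}
out = transform_documents_document(fixture)
assert "storage_type" not in out["fields"]   # dropped for the v3 schema
assert out["fields"]["content_length"] == 5  # derived from content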

View File

@@ -0,0 +1,115 @@
"""Database wipe service for migration import process.
This module can be run as a script via:
python -m paperless_migration.services.wipe_db
It uses the paperless_migration settings to wipe all tables
before running v3 migrations.
"""
from __future__ import annotations
import logging
import sys
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from django.db.backends.base.base import BaseDatabaseWrapper
logger = logging.getLogger(__name__)
def _get_target_tables(connection: BaseDatabaseWrapper) -> list[str]:
"""Get list of tables to drop that exist in the database."""
from django.apps import apps
from django.db.migrations.recorder import MigrationRecorder
model_tables = {
model._meta.db_table for model in apps.get_models(include_auto_created=True)
}
model_tables.add(MigrationRecorder.Migration._meta.db_table)
existing_tables = set(connection.introspection.table_names())
return sorted(model_tables & existing_tables)
def _drop_sqlite_tables(connection: BaseDatabaseWrapper) -> int:
"""Drop tables for SQLite database. Returns count of tables dropped."""
tables = _get_target_tables(connection)
with connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF;")
for table in tables:
cursor.execute(f'DROP TABLE IF EXISTS "{table}";')
cursor.execute("PRAGMA foreign_keys=ON;")
return len(tables)
def _drop_postgres_tables(connection: BaseDatabaseWrapper) -> int:
"""Drop tables for PostgreSQL database. Returns count of tables dropped."""
tables = _get_target_tables(connection)
if not tables:
return 0
with connection.cursor() as cursor:
for table in tables:
cursor.execute(f'DROP TABLE IF EXISTS "{table}" CASCADE;')
return len(tables)
def _drop_mysql_tables(connection: BaseDatabaseWrapper) -> int:
"""Drop tables for MySQL/MariaDB database. Returns count of tables dropped."""
tables = _get_target_tables(connection)
with connection.cursor() as cursor:
cursor.execute("SET FOREIGN_KEY_CHECKS=0;")
for table in tables:
cursor.execute(f"DROP TABLE IF EXISTS `{table}`;")
cursor.execute("SET FOREIGN_KEY_CHECKS=1;")
return len(tables)
def wipe_database() -> tuple[bool, str]:
"""Wipe all application tables from the database.
Returns:
Tuple of (success: bool, message: str)
"""
from django.db import connection
vendor = connection.vendor
logger.info("Wiping database for vendor: %s", vendor)
try:
match vendor:
case "sqlite":
count = _drop_sqlite_tables(connection)
case "postgresql":
count = _drop_postgres_tables(connection)
case "mysql":
count = _drop_mysql_tables(connection)
case _:
return False, f"Unsupported database vendor: {vendor}"
message = f"Dropped {count} tables from {vendor} database"
logger.info(message)
return True, message
except Exception as exc:
message = f"Failed to wipe database: {exc}"
logger.exception(message)
return False, message
def main() -> int:
"""Entry point when run as a script."""
import os
import django
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")
django.setup()
success, message = wipe_database()
print(message) # noqa: T201
return 0 if success else 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,245 @@
"""Settings for migration-mode Django instance."""
from __future__ import annotations
import logging
import os
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
BASE_DIR = Path(__file__).resolve().parent.parent
DEBUG = os.getenv("PAPERLESS_DEBUG", "false").lower() == "true"
ALLOWED_HOSTS = ["*"]
# Tap paperless.conf if it's available
for path in [
os.getenv("PAPERLESS_CONFIGURATION_PATH"),
"../paperless.conf",
"/etc/paperless.conf",
"/usr/local/etc/paperless.conf",
]:
if path and Path(path).exists():
load_dotenv(path)
break
def __get_path(
key: str,
default: str | Path,
) -> Path:
if key in os.environ:
return Path(os.environ[key]).resolve()
return Path(default).resolve()
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", BASE_DIR.parent / "data")
EXPORT_DIR = __get_path("PAPERLESS_EXPORT_DIR", BASE_DIR.parent / "export")
def _parse_redis_url() -> str:
"""Parse Redis URL from environment with sensible defaults."""
return os.getenv("PAPERLESS_REDIS_URL", "redis://localhost:6379")
def _parse_db_settings() -> dict[str, dict[str, Any]]:
databases: dict[str, dict[str, Any]] = {
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": DATA_DIR / "db.sqlite3",
"OPTIONS": {},
},
}
if os.getenv("PAPERLESS_DBHOST"):
databases["sqlite"] = databases["default"].copy()
databases["default"] = {
"HOST": os.getenv("PAPERLESS_DBHOST"),
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
"USER": os.getenv("PAPERLESS_DBUSER", "paperless"),
"PASSWORD": os.getenv("PAPERLESS_DBPASS", "paperless"),
"OPTIONS": {},
}
if os.getenv("PAPERLESS_DBPORT"):
databases["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT")
if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
engine = "django.db.backends.mysql"
options = {
"read_default_file": "/etc/mysql/my.cnf",
"charset": "utf8mb4",
"ssl_mode": os.getenv("PAPERLESS_DBSSLMODE", "PREFERRED"),
"ssl": {
"ca": os.getenv("PAPERLESS_DBSSLROOTCERT"),
"cert": os.getenv("PAPERLESS_DBSSLCERT"),
"key": os.getenv("PAPERLESS_DBSSLKEY"),
},
}
else:
engine = "django.db.backends.postgresql"
options = {
"sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer"),
"sslrootcert": os.getenv("PAPERLESS_DBSSLROOTCERT"),
"sslcert": os.getenv("PAPERLESS_DBSSLCERT"),
"sslkey": os.getenv("PAPERLESS_DBSSLKEY"),
}
databases["default"]["ENGINE"] = engine
databases["default"]["OPTIONS"].update(options)
if os.getenv("PAPERLESS_DB_TIMEOUT") is not None:
timeout = int(os.getenv("PAPERLESS_DB_TIMEOUT"))
if databases["default"]["ENGINE"] == "django.db.backends.sqlite3":
databases["default"]["OPTIONS"].update({"timeout": timeout})
else:
databases["default"]["OPTIONS"].update({"connect_timeout": timeout})
databases["sqlite"]["OPTIONS"].update({"timeout": timeout})
return databases
DATABASES = _parse_db_settings()
SECRET_KEY = os.getenv("PAPERLESS_SECRET_KEY")
AUTH_PASSWORD_VALIDATORS = [
{
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
},
{
"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
},
{
"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
},
{
"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
},
]
LANGUAGE_CODE = "en-us"
TIME_ZONE = "UTC"
USE_I18N = True
USE_TZ = True
CSRF_TRUSTED_ORIGINS: list[str] = []
INSTALLED_APPS = [
"django.contrib.auth",
"django.contrib.contenttypes",
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
"channels",
"allauth",
"allauth.account",
"allauth.socialaccount",
"allauth.mfa",
"paperless_migration",
]
MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"django.middleware.clickjacking.XFrameOptionsMiddleware",
"allauth.account.middleware.AccountMiddleware",
]
ROOT_URLCONF = "paperless_migration.urls"
TEMPLATES = [
{
"BACKEND": "django.template.backends.django.DjangoTemplates",
"DIRS": [
BASE_DIR / "paperless_migration" / "templates",
BASE_DIR / "documents" / "templates",
],
"APP_DIRS": True,
"OPTIONS": {
"context_processors": [
"django.template.context_processors.request",
"django.contrib.auth.context_processors.auth",
"django.contrib.messages.context_processors.messages",
],
},
},
]
# ASGI application for Channels
ASGI_APPLICATION = "paperless_migration.asgi.application"
# Channel layers configuration using Redis
REDIS_URL = _parse_redis_url()
CHANNEL_LAYERS = {
"default": {
"BACKEND": "channels_redis.core.RedisChannelLayer",
"CONFIG": {
"hosts": [REDIS_URL],
"capacity": 1500,
"expiry": 10,
},
},
}
# Keep WSGI for compatibility
WSGI_APPLICATION = "paperless_migration.wsgi.application"
AUTHENTICATION_BACKENDS = [
"django.contrib.auth.backends.ModelBackend",
"allauth.account.auth_backends.AuthenticationBackend",
]
STATIC_URL = "/static/"
STATICFILES_DIRS = [
BASE_DIR / ".." / "static",
BASE_DIR / "static",
BASE_DIR / "documents" / "static",
]
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
LOGIN_URL = "/accounts/login/"
LOGIN_REDIRECT_URL = "/migration/"
LOGOUT_REDIRECT_URL = "/accounts/login/?loggedout=1"
ACCOUNT_ADAPTER = "allauth.account.adapter.DefaultAccountAdapter"
ACCOUNT_AUTHENTICATED_LOGIN_REDIRECTS = False
SOCIALACCOUNT_ADAPTER = "allauth.socialaccount.adapter.DefaultSocialAccountAdapter"
SOCIALACCOUNT_ENABLED = False
SESSION_ENGINE = "django.contrib.sessions.backends.db"
MIGRATION_EXPORT_PATH = __get_path(
"PAPERLESS_MIGRATION_EXPORT_PATH",
EXPORT_DIR / "manifest.json",
)
MIGRATION_TRANSFORMED_PATH = __get_path(
"PAPERLESS_MIGRATION_TRANSFORMED_PATH",
EXPORT_DIR / "manifest.v3.json",
)
MIGRATION_IMPORTED_PATH = Path(EXPORT_DIR / "import.completed").resolve()
# Progress update frequency (rows between WebSocket updates)
MIGRATION_PROGRESS_FREQUENCY = int(
os.getenv("PAPERLESS_MIGRATION_PROGRESS_FREQUENCY", "100"),
)
# One-time access code required for migration logins; stable across autoreload
_code = os.getenv("PAPERLESS_MIGRATION_ACCESS_CODE")
if not _code:
import secrets
_code = secrets.token_urlsafe(12)
os.environ["PAPERLESS_MIGRATION_ACCESS_CODE"] = _code
MIGRATION_ACCESS_CODE = _code
if os.environ.get("PAPERLESS_MIGRATION_CODE_LOGGED") != "1":
logging.getLogger(__name__).warning(
"Migration one-time access code: %s",
MIGRATION_ACCESS_CODE,
)
os.environ["PAPERLESS_MIGRATION_CODE_LOGGED"] = "1"

View File

@@ -0,0 +1,77 @@
{% load i18n static %}
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="author" content="Paperless-ngx project and contributors">
<meta name="robots" content="noindex,nofollow">
<meta name="color-scheme" content="light">
<title>{% translate "Paperless-ngx sign in" %}</title>
<link href="{% static 'bootstrap.min.css' %}" rel="stylesheet">
<link href="{% static 'base.css' %}" rel="stylesheet">
<style>
:root, body, .form-control, .form-floating {
color-scheme: light;
--bs-body-bg: #f5f5f5;
--bs-body-color: #212529;
--bs-body-color-rgb: 33, 37, 41;
--bs-border-color: #dee2e6;
--bs-link-color: #17541f;
--bs-link-color-rgb: 23, 84, 31;
}
@media (prefers-color-scheme: dark) { :root { color-scheme: light; } }
body {
min-height: 100vh;
background:
radial-gradient(circle at 20% 20%, #eef5ef, #f7fbf7),
linear-gradient(120deg, rgba(23, 84, 31, 0.05) 0%, rgba(0,0,0,0) 30%),
linear-gradient(300deg, rgba(15, 54, 20, 0.06) 0%, rgba(0,0,0,0) 40%);
}
</style>
</head>
<body class="d-flex align-items-center justify-content-center text-center p-3">
<main class="w-100" style="max-width: 360px;">
<form class="form-accounts p-4 rounded-4" id="form-account" method="post">
{% csrf_token %}
{% include "paperless-ngx/snippets/svg_logo.html" with extra_attrs="width='240' class='logo mb-3'" %}
<p class="text-uppercase fw-semibold mb-1 text-secondary small" style="letter-spacing: 0.12rem;">{% translate "Migration Mode" %}</p>
{% for message in messages %}
<div class="alert alert-{{ message.level_tag }} mb-2" role="alert">{{ message }}</div>
{% endfor %}
<p class="mb-3">{% translate "Login with a superuser account to proceed." %}</p>
{% if form.errors %}
<div class="alert alert-danger" role="alert">
{% for field, errors in form.errors.items %}
{% for error in errors %}
{{ error }}
{% endfor %}
{% endfor %}
</div>
{% endif %}
{% translate "Username" as i18n_username %}
{% translate "Password" as i18n_password %}
<div class="form-floating form-stacked-top">
<input type="text" name="login" id="inputUsername" placeholder="{{ i18n_username }}" class="form-control" autocorrect="off" autocapitalize="none" required autofocus>
<label for="inputUsername">{{ i18n_username }}</label>
</div>
<div class="form-floating form-stacked-middle">
<input type="password" name="password" id="inputPassword" placeholder="{{ i18n_password }}" class="form-control" required>
<label for="inputPassword">{{ i18n_password }}</label>
</div>
<div class="form-floating form-stacked-bottom">
<input type="text" name="code" id="inputCode" placeholder="One-time code" class="form-control" required>
<label for="inputCode">One-time code</label>
</div>
<p class="mt-2 small fst-italic">{% translate "Code can be found in the startup logs." %}</p>
<div class="d-grid mt-3">
<button class="btn btn-lg btn-primary" type="submit">{% translate "Sign in" %}</button>
</div>
</form>
</main>
</body>
</html>

View File

@@ -0,0 +1,558 @@
<!doctype html>
{% load static %}
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Paperless-ngx Migration Mode</title>
<link rel="stylesheet" href="{% static 'bootstrap.min.css' %}" />
<link rel="stylesheet" href="{% static 'base.css' %}" />
<style>
:root, .form-control {
color-scheme: light;
--bs-body-bg: #f5f5f5;
--bs-body-color: #212529;
--bs-body-color-rgb: 33, 37, 41;
--bs-border-color: #dee2e6;
--bs-link-color: var(--pngx-primary);
--bs-link-color-rgb: 23, 84, 31;
}
@media (prefers-color-scheme: dark) { :root { color-scheme: light; } }
.btn-primary:disabled {
--bs-btn-disabled-bg: #4d7352;
--bs-btn-disabled-border-color: #4d7352;
}
body {
background:
radial-gradient(circle at 20% 20%, #eef5ef, #f7fbf7),
linear-gradient(120deg, rgba(23, 84, 31, 0.05) 0%, rgba(0,0,0,0) 30%),
linear-gradient(300deg, rgba(15, 54, 20, 0.06) 0%, rgba(0,0,0,0) 40%);
min-height: 100vh;
}
svg.logo .text {
fill: #161616 !important;
}
.hero-card,
.card-step {
background: #fff;
backdrop-filter: blur(6px);
border: 1px solid rgba(23, 84, 31, 0.08);
box-shadow: 0 16px 40px rgba(0, 0, 0, 0.06);
border-radius: 18px;
}
.status-dot {
width: 10px;
height: 10px;
border-radius: 50%;
display: inline-block;
}
.card-step {
border-radius: 16px;
transition: transform 0.15s ease, box-shadow 0.15s ease;
}
.card-step.done-step {
opacity: 0.4;
}
.path-pill {
background: rgba(23, 84, 31, 0.08);
color: var(--bs-body-color);
border-radius: 12px;
padding: 0.4rem 0.75rem;
font-size: 0.9rem;
}
.step-rail {
position: relative;
height: 4px;
background: rgba(23, 84, 31, 0.12);
border-radius: 999px;
}
.step-rail .fill {
position: absolute;
left: 0;
top: 0;
bottom: 0;
width: calc({{ export_exists|yesno:'33,0' }}% + {{ transformed_exists|yesno:'33,0' }}% + {{ imported_exists|yesno:'34,0' }}%);
max-width: 100%;
background: linear-gradient(90deg, #17541f, #2c7a3c);
border-radius: 999px;
transition: width 0.3s ease;
}
.step-chip {
width: 38px;
height: 38px;
border-radius: 50%;
display: grid;
place-items: center;
font-weight: 700;
background: #fff;
border: 2px solid rgba(23, 84, 31, 0.25);
color: #17541f;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
}
.step-chip.done {
background: #17541f;
color: #fff;
border-color: #17541f;
}
.console-log {
background: #0f1a12;
color: #d1e7d6;
border-radius: 12px;
min-height: 180px;
max-height: 400px;
padding: 12px;
font-size: 0.85rem;
font-family: 'Consolas', 'Monaco', monospace;
overflow: auto;
white-space: pre-wrap;
word-break: break-word;
}
.console-log .log-error { color: #ff6b6b; }
.console-log .log-warning { color: #ffd93d; }
.console-log .log-success { color: #6bcb77; }
.console-log .log-info { color: #4d96ff; }
.progress-bar-container {
height: 24px;
background: rgba(23, 84, 31, 0.1);
border-radius: 12px;
overflow: hidden;
margin-bottom: 0.5rem;
}
.progress-bar-fill {
height: 100%;
background: linear-gradient(90deg, #17541f, #2c7a3c);
border-radius: 12px;
transition: width 0.3s ease;
display: flex;
align-items: center;
justify-content: center;
color: white;
font-size: 0.75rem;
font-weight: 600;
min-width: fit-content;
padding: 0 8px;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 0.5rem;
margin-top: 0.5rem;
}
.stat-item {
background: rgba(23, 84, 31, 0.05);
border-radius: 8px;
padding: 0.5rem;
text-align: center;
}
.stat-value {
font-size: 1.25rem;
font-weight: 700;
color: #17541f;
}
.stat-label {
font-size: 0.75rem;
color: #666;
}
.ws-status {
display: inline-flex;
align-items: center;
gap: 0.5rem;
padding: 0.25rem 0.75rem;
border-radius: 999px;
font-size: 0.8rem;
font-weight: 500;
}
.ws-status.connected { background: #d4edda; color: #155724; }
.ws-status.disconnected { background: #f8d7da; color: #721c24; }
.ws-status.connecting { background: #fff3cd; color: #856404; }
</style>
</head>
<body class="pb-4">
<div class="container py-4">
<div class="row justify-content-center mb-4">
<div class="col-lg-9">
<div class="hero-card p-4">
<div class="d-flex flex-wrap align-items-center justify-content-between gap-3">
<div class="d-flex align-items-center gap-3">
{% include "paperless-ngx/snippets/svg_logo.html" with extra_attrs="width='280' class='logo'" %}
<div class="ps-2">
<p class="text-uppercase fw-semibold mb-1 text-secondary" style="letter-spacing: 0.12rem;">Migration Mode</p>
<h1 class="h3 mb-2 text-primary">Paperless-ngx v2 to v3</h1>
<p class="text-muted mb-0">Migrate your data from Paperless-ngx version 2 to version 3.</p>
</div>
</div>
<div class="text-end">
<span class="badge bg-success-subtle text-success border border-success-subtle px-3 py-2">Online</span>
</div>
</div>
<div class="mt-4">
<div class="d-flex justify-content-between align-items-center mb-2">
<div class="d-flex align-items-center gap-2">
<span class="step-chip {% if export_exists %}done{% endif %}">1</span>
<div>
<div class="fw-semibold mb-0">Export</div>
<small class="text-muted">v2 data</small>
</div>
</div>
<div class="d-flex align-items-center gap-2">
<span class="step-chip {% if transformed_exists %}done{% endif %}">2</span>
<div>
<div class="fw-semibold mb-0">Transform</div>
<small class="text-muted">to v3 schema</small>
</div>
</div>
<div class="d-flex align-items-center gap-2">
<span class="step-chip {% if imported_exists %}done{% endif %}">3</span>
<div>
<div class="fw-semibold mb-0">Import</div>
<small class="text-muted">into v3</small>
</div>
</div>
</div>
<div class="step-rail">
<div class="fill"></div>
</div>
</div>
{% if messages %}
<div class="mt-4">
{% for message in messages %}
<div class="alert alert-{{ message.level_tag }} mb-2" role="alert">{{ message }}</div>
{% endfor %}
</div>
{% endif %}
<div class="row g-3 mt-2">
<div class="col-md-6">
<div class="d-flex align-items-center gap-2">
<span class="status-dot bg-{{ export_exists|yesno:'success,danger' }}"></span>
<div>
<div class="fw-semibold">Export file</div>
<div class="small text-muted">{{ export_exists|yesno:"Ready,Missing" }}</div>
</div>
</div>
<div class="path-pill mt-2 text-truncate" title="{{ export_path }}">{{ export_path }}</div>
</div>
<div class="col-md-6">
<div class="d-flex align-items-center gap-2">
<span class="status-dot bg-{{ transformed_exists|yesno:'success,warning' }}"></span>
<div>
<div class="fw-semibold">Transformed file</div>
<div class="small text-muted">{{ transformed_exists|yesno:"Ready,Pending" }}</div>
</div>
</div>
<div class="path-pill mt-2 text-truncate" title="{{ transformed_path }}">{{ transformed_path }}</div>
</div>
</div>
</div>
</div>
</div>
<div class="row gy-4 justify-content-center">
<div class="col-lg-3 col-md-4">
<div class="card card-step h-100 {% if export_exists %}done-step{% endif %}">
<div class="card-body d-flex flex-column gap-3">
<div>
<p class="text-uppercase text-muted mb-1 fw-semibold" style="letter-spacing: 0.08rem;">Step 1</p>
<h3 class="h5 mb-1">Export (v2)</h3>
<p class="small text-muted mb-0">Generate and upload the v2 export file.</p>
</div>
<div class="mt-auto d-grid gap-2">
<form method="post" enctype="multipart/form-data" class="d-flex gap-2 align-items-center">
{% csrf_token %}
<input class="form-control form-control-sm" type="file" name="export_file" accept=".json" {% if export_exists %}disabled{% endif %} required>
<button class="btn btn-outline-secondary btn-sm" type="submit" name="action" value="upload" {% if export_exists %}disabled aria-disabled="true"{% endif %}>Upload</button>
</form>
<form method="post">
{% csrf_token %}
<button class="btn btn-primary w-100" type="submit" name="action" value="check" {% if export_exists %}disabled aria-disabled="true"{% endif %}>Re-check export</button>
</form>
</div>
</div>
</div>
</div>
<div class="col-lg-3 col-md-4">
<div class="card card-step h-100 {% if transformed_exists %}done-step{% endif %}">
<div class="card-body d-flex flex-column gap-3">
<div>
<p class="text-uppercase text-muted mb-1 fw-semibold" style="letter-spacing: 0.08rem;">Step 2</p>
<h3 class="h5 mb-1">Transform</h3>
<p class="small text-muted mb-0">Convert the export into the v3-ready structure.</p>
</div>
<div class="mt-auto d-grid gap-2">
<form method="post">
{% csrf_token %}
<button
class="btn btn-outline-primary w-100"
type="submit"
name="action"
value="transform"
id="btn-transform"
{% if not export_exists or transformed_exists %}disabled aria-disabled="true"{% endif %}
>
Transform export
</button>
</form>
{% if transformed_exists %}
<form method="post">
{% csrf_token %}
<button class="btn btn-outline-danger btn-sm w-100" type="submit" name="action" value="reset_transform">
Reset transform
</button>
</form>
{% endif %}
</div>
</div>
</div>
</div>
<div class="col-lg-3 col-md-4">
<div class="card card-step h-100 {% if imported_exists %}done-step{% endif %}">
<div class="card-body d-flex flex-column gap-3">
<div>
<p class="text-uppercase text-muted mb-1 fw-semibold" style="letter-spacing: 0.08rem;">Step 3</p>
<h3 class="h5 mb-1">Import (v3)</h3>
<p class="small text-muted mb-0">Load the transformed data into your v3 instance.</p>
</div>
<div class="mt-auto">
<form method="post">
{% csrf_token %}
<button
class="btn btn-outline-secondary w-100"
type="submit"
name="action"
value="import"
id="btn-import"
{% if not transformed_exists or imported_exists %}disabled aria-disabled="true"{% endif %}
>
Import transformed data
</button>
</form>
</div>
</div>
</div>
</div>
</div>
<div class="row justify-content-center mt-4">
<div class="col-lg-9">
{% if not export_exists %}
<div class="alert alert-info mb-3">
<div class="fw-semibold mb-1">Export file not found</div>
<div class="small">
Run the v2 export from your Paperless instance, e.g.:
<code>docker run --rm ghcr.io/paperless-ngx/paperless-ngx:2.20.6 document_exporter --data-only</code>
(see <a href="https://docs.paperless-ngx.com/administration/#exporter" target="_blank" rel="noopener noreferrer">documentation</a>). Once the <code>manifest.json</code> is in place, upload it or (especially for larger files) place it directly at the expected location and click "Re-check export".
<p class="mt-2 mb-0 text-danger fst-italic">Warning: The export must be generated with Paperless-ngx v2.20.6.</p>
</div>
</div>
{% endif %}
<div class="card card-step">
<div class="card-body">
<div class="d-flex justify-content-between align-items-center mb-2">
<div class="fw-semibold">Migration console</div>
<span id="ws-status" class="ws-status disconnected">
<span class="status-dot"></span>
<span class="status-text">Ready</span>
</span>
</div>
<div id="progress-container" class="mb-3" style="display: none;">
<div class="progress-bar-container">
<div id="progress-bar" class="progress-bar-fill" style="width: 0%;">
<span id="progress-text">0 rows</span>
</div>
</div>
<div id="stats-container" class="stats-grid"></div>
</div>
<div id="migration-log" class="console-log">Ready to begin migration...</div>
</div>
</div>
</div>
</div>
</div>
<script>
(function() {
const logEl = document.getElementById('migration-log');
const wsStatusEl = document.getElementById('ws-status');
const progressContainer = document.getElementById('progress-container');
const progressBar = document.getElementById('progress-bar');
const progressText = document.getElementById('progress-text');
const statsContainer = document.getElementById('stats-container');
function setWsStatus(status, text) {
wsStatusEl.className = 'ws-status ' + status;
wsStatusEl.querySelector('.status-text').textContent = text;
}
function appendLog(message, level) {
const line = document.createElement('div');
line.className = 'log-' + (level || 'info');
line.textContent = message;
logEl.appendChild(line);
logEl.scrollTop = logEl.scrollHeight;
}
function clearLog() {
logEl.innerHTML = '';
}
function updateProgress(current, total, label) {
progressContainer.style.display = 'block';
const pct = total ? Math.min(100, (current / total) * 100) : 0;
progressBar.style.width = (total ? pct : 100) + '%';
progressText.textContent = label || (current.toLocaleString() + ' rows');
}
function updateStats(stats) {
if (!stats || Object.keys(stats).length === 0) {
statsContainer.innerHTML = '';
return;
}
let html = '';
for (const [key, value] of Object.entries(stats)) {
const label = key.replace('documents.', '').replace('_', ' ');
html += '<div class="stat-item">' +
'<div class="stat-value">' + (typeof value === 'number' ? value.toLocaleString() : value) + '</div>' +
'<div class="stat-label">' + label + '</div>' +
'</div>';
}
statsContainer.innerHTML = html;
}
function formatDuration(seconds) {
if (seconds < 60) return seconds.toFixed(1) + 's';
const mins = Math.floor(seconds / 60);
const secs = (seconds % 60).toFixed(0);
return mins + 'm ' + secs + 's';
}
function startWebSocket(action) {
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = protocol + '//' + window.location.host + '/ws/migration/' + action + '/';
clearLog();
appendLog('Connecting to ' + action + ' service...', 'info');
setWsStatus('connecting', 'Connecting...');
progressContainer.style.display = 'none';
statsContainer.innerHTML = '';
const ws = new WebSocket(wsUrl);
ws.onopen = function() {
setWsStatus('connected', 'Connected');
appendLog('Connected. Starting ' + action + '...', 'success');
ws.send(JSON.stringify({ action: 'start' }));
};
ws.onmessage = function(event) {
try {
const data = JSON.parse(event.data);
switch (data.type) {
case 'log':
appendLog(data.message, data.level || 'info');
break;
case 'progress':
updateProgress(data.current, data.total, data.label);
break;
case 'stats':
if (data.transformed) {
updateStats(data.transformed);
} else {
updateStats(data);
}
break;
case 'complete':
const status = data.success ? 'success' : 'error';
const msg = data.success
? 'Completed successfully in ' + formatDuration(data.duration)
: 'Operation failed';
appendLog(msg, status);
if (data.total_processed) {
appendLog('Total processed: ' + data.total_processed.toLocaleString() + ' rows', 'info');
}
if (data.speed) {
appendLog('Speed: ' + Math.round(data.speed).toLocaleString() + ' rows/sec', 'info');
}
if (data.stats) {
updateStats(data.stats);
}
setWsStatus('disconnected', 'Complete');
ws.close();
if (data.success) {
setTimeout(function() { window.location.reload(); }, 1500);
}
break;
case 'error':
appendLog('Error: ' + data.message, 'error');
setWsStatus('disconnected', 'Error');
break;
default:
appendLog(JSON.stringify(data), 'info');
}
} catch (e) {
appendLog('Received: ' + event.data, 'info');
}
};
ws.onerror = function(error) {
appendLog('WebSocket error occurred', 'error');
setWsStatus('disconnected', 'Error');
};
ws.onclose = function(event) {
if (event.code !== 1000) {
const reason = event.code === 4001 ? 'Not authenticated'
: event.code === 4002 ? 'Migration code not verified'
: event.code === 4003 ? 'Superuser access required'
: 'Connection closed (code: ' + event.code + ')';
appendLog(reason, 'error');
}
setWsStatus('disconnected', 'Disconnected');
};
}
// Check if we should auto-start a WebSocket action
{% if ws_action %}
startWebSocket('{{ ws_action }}');
{% endif %}
// Expose for manual triggering if needed
window.startMigrationWs = startWebSocket;
})();
</script>
</body>
</html>

View File

@@ -0,0 +1,21 @@
"""URL configuration for migration mode."""
from __future__ import annotations
from django.conf import settings
from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.urls import include
from django.urls import path
from paperless_migration import views
urlpatterns = [
path("accounts/login/", views.migration_login, name="account_login"),
path("accounts/", include("allauth.urls")),
path("migration/", views.migration_home, name="migration_home"),
# Redirect root to migration home
path("", views.migration_home, name="home"),
]
if settings.DEBUG:
urlpatterns += staticfiles_urlpatterns()

View File

@@ -0,0 +1,132 @@
"""Views for migration mode web interface."""
from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING
from django.conf import settings
from django.contrib import messages
from django.contrib.auth import authenticate
from django.contrib.auth import login
from django.contrib.auth.decorators import login_required
from django.http import HttpResponseForbidden
from django.shortcuts import redirect
from django.shortcuts import render
from django.views.decorators.http import require_http_methods
if TYPE_CHECKING:
from django.http import HttpRequest
from django.http import HttpResponse
def _check_migration_access(request: HttpRequest) -> HttpResponse | None:
"""Check if user has migration access. Returns error response or None."""
if not request.session.get("migration_code_ok"):
return HttpResponseForbidden("Access code required")
if not request.user.is_superuser:
return HttpResponseForbidden("Superuser access required")
return None
@login_required
@require_http_methods(["GET", "POST"])
def migration_home(request: HttpRequest) -> HttpResponse:
"""Main migration dashboard view."""
error_response = _check_migration_access(request)
if error_response:
return error_response
export_path = Path(settings.MIGRATION_EXPORT_PATH)
transformed_path = Path(settings.MIGRATION_TRANSFORMED_PATH)
imported_marker = Path(settings.MIGRATION_IMPORTED_PATH)
if request.method == "POST":
action = request.POST.get("action")
if action == "check":
messages.success(request, "Checked export paths.")
elif action == "upload":
upload = request.FILES.get("export_file")
if not upload:
messages.error(request, "No file selected.")
else:
try:
export_path.parent.mkdir(parents=True, exist_ok=True)
with export_path.open("wb") as dest:
for chunk in upload.chunks():
dest.write(chunk)
messages.success(request, f"Uploaded to {export_path}.")
except Exception as exc:
messages.error(request, f"Failed to save file: {exc}")
elif action == "transform":
if imported_marker.exists():
imported_marker.unlink()
# Signal to start WebSocket connection for transform
request.session["start_ws_action"] = "transform"
messages.info(request, "Starting transform via WebSocket...")
elif action == "import":
# Signal to start WebSocket connection for import
request.session["start_ws_action"] = "import"
messages.info(request, "Starting import via WebSocket...")
elif action == "reset_transform":
if transformed_path.exists():
try:
transformed_path.unlink()
messages.success(request, "Transformed file deleted.")
except Exception as exc:
messages.error(request, f"Failed to delete transformed file: {exc}")
if imported_marker.exists():
try:
imported_marker.unlink()
except Exception:
pass
else:
messages.error(request, "Unknown action.")
return redirect("migration_home")
ws_action = request.session.pop("start_ws_action", None)
context = {
"export_path": export_path,
"export_exists": export_path.exists(),
"transformed_path": transformed_path,
"transformed_exists": transformed_path.exists(),
"imported_exists": imported_marker.exists(),
"ws_action": ws_action,
}
return render(request, "paperless_migration/migration_home.html", context)
@require_http_methods(["GET", "POST"])
def migration_login(request: HttpRequest) -> HttpResponse:
"""Migration-specific login view requiring access code."""
if request.method == "POST":
username = request.POST.get("login", "")
password = request.POST.get("password", "")
code = request.POST.get("code", "")
if not code or code != settings.MIGRATION_ACCESS_CODE:
messages.error(request, "One-time code is required.")
return redirect("account_login")
user = authenticate(request, username=username, password=password)
if user is None:
messages.error(request, "Invalid username or password.")
return redirect("account_login")
if not user.is_superuser:
messages.error(request, "Superuser access required.")
return redirect("account_login")
login(request, user)
request.session["migration_code_ok"] = True
return redirect(settings.LOGIN_REDIRECT_URL)
return render(request, "account/login.html")
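
For manual testing, a rough sketch of the login handshake this view expects, using the field names from the POST handling above; the `requests` dependency, host, and credentials are assumptions.

import requests

base = "http://localhost:8000"  # placeholder host
s = requests.Session()
s.get(f"{base}/accounts/login/")  # primes the csrftoken cookie
s.post(
    f"{base}/accounts/login/",
    data={
        "login": "admin",                    # must be a superuser
        "password": "secret",
        "code": "<code from startup logs>",  # MIGRATION_ACCESS_CODE
        "csrfmiddlewaretoken": s.cookies.get("csrftoken"),
    },
    headers={"Referer": f"{base}/accounts/login/"},
)
# On success the session gains migration_code_ok and is redirected to
# settings.LOGIN_REDIRECT_URL ("/migration/").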

View File

@@ -0,0 +1,7 @@
import os

from django.core.wsgi import get_wsgi_application

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless_migration.settings")

application = get_wsgi_application()
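
Any WSGI server can load this module. As a rough local sketch, assuming paperless_migration.wsgi is importable and using the stdlib reference server rather than the project's actual entrypoint:

# Minimal sketch: serve the migration app locally with the stdlib WSGI server.
# Good enough for a one-shot migration UI; a real deployment would use a
# production server instead.
from wsgiref.simple_server import make_server

from paperless_migration.wsgi import application  # assumed import path

if __name__ == "__main__":
    with make_server("127.0.0.1", 8000, application) as httpd:
        httpd.serve_forever()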

89
uv.lock generated
View File

@@ -1305,7 +1305,7 @@ name = "exceptiongroup"
version = "1.3.1" version = "1.3.1"
source = { registry = "https://pypi.org/simple" } source = { registry = "https://pypi.org/simple" }
dependencies = [ dependencies = [
{ name = "typing-extensions", marker = "(python_full_version < '3.13' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or (python_full_version < '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.12' and platform_machine == 'x86_64' and sys_platform == 'linux') or (python_full_version < '3.13' and sys_platform == 'darwin')" }, { name = "typing-extensions", marker = "(python_full_version < '3.11' and sys_platform == 'darwin') or (python_full_version < '3.11' and sys_platform == 'linux')" },
] ]
sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
wheels = [ wheels = [
@@ -1933,6 +1933,82 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" },
] ]
[[package]]
name = "ijson"
version = "3.4.0.post0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/2d/30/7ab4b9e88e7946f6beef419f74edcc541df3ea562c7882257b4eaa82417d/ijson-3.4.0.post0.tar.gz", hash = "sha256:9aa02dc70bb245670a6ca7fba737b992aeeb4895360980622f7e568dbf23e41e", size = 67216, upload-time = "2025-10-10T05:29:25.62Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b5/15/4f4921ed9ab94032fd0b03ecb211ff9dbd5cc9953463f5b5c4ddeab406fc/ijson-3.4.0.post0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8f904a405b58a04b6ef0425f1babbc5c65feb66b0a4cc7f214d4ad7de106f77d", size = 88244, upload-time = "2025-10-10T05:27:42.001Z" },
{ url = "https://files.pythonhosted.org/packages/af/d6/b85d4da1752362a789bc3e0fc4b55e812a374a50d2fe1c06cab2e2bcb170/ijson-3.4.0.post0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a07dcc1a8a1ddd76131a7c7528cbd12951c2e34eb3c3d63697b905069a2d65b1", size = 59880, upload-time = "2025-10-10T05:27:44.791Z" },
{ url = "https://files.pythonhosted.org/packages/c3/96/e1027e6d0efb5b9192bdc9f0af5633c20a56999cce4cf7ad35427f823138/ijson-3.4.0.post0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ab3be841b8c430c1883b8c0775eb551f21b5500c102c7ee828afa35ddd701bdd", size = 59939, upload-time = "2025-10-10T05:27:45.66Z" },
{ url = "https://files.pythonhosted.org/packages/e3/71/b9ca0a19afb2f36be35c6afa2c4d1c19950dc45f6a50b483b56082b3e165/ijson-3.4.0.post0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:43059ae0d657b11c5ddb11d149bc400c44f9e514fb8663057e9b2ea4d8d44c1f", size = 125894, upload-time = "2025-10-10T05:27:46.551Z" },
{ url = "https://files.pythonhosted.org/packages/02/1b/f7356de078d85564829c5e2a2a31473ee0ad1876258ceecf550b582e57b7/ijson-3.4.0.post0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0d3e82963096579d1385c06b2559570d7191e225664b7fa049617da838e1a4a4", size = 132385, upload-time = "2025-10-10T05:27:48Z" },
{ url = "https://files.pythonhosted.org/packages/57/7b/08f86eed5df0849b673260dd2943b6a7367a55b5a4b6e73ddbfbdf4206f1/ijson-3.4.0.post0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:461ce4e87a21a261b60c0a68a2ad17c7dd214f0b90a0bec7e559a66b6ae3bd7e", size = 129567, upload-time = "2025-10-10T05:27:49.188Z" },
{ url = "https://files.pythonhosted.org/packages/96/e1/69672d95b1a16e7c6bf89cef6c892b228cc84b484945a731786a425700d2/ijson-3.4.0.post0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:890cf6610c9554efcb9765a93e368efeb5bb6135f59ce0828d92eaefff07fde5", size = 132821, upload-time = "2025-10-10T05:27:50.342Z" },
{ url = "https://files.pythonhosted.org/packages/0b/15/9ed4868e2e92db2454508f7ea1282bec0b039bd344ac0cbac4a2de16786d/ijson-3.4.0.post0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:6793c29a5728e7751a7df01be58ba7da9b9690c12bf79d32094c70a908fa02b9", size = 127757, upload-time = "2025-10-10T05:27:51.203Z" },
{ url = "https://files.pythonhosted.org/packages/5b/aa/08a308d3aaa6e98511f3100f8a1e4e8ff8c853fa4ec3f18b71094ac36bbe/ijson-3.4.0.post0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a56b6674d7feec0401c91f86c376f4e3d8ff8129128a8ad21ca43ec0b1242f79", size = 130439, upload-time = "2025-10-10T05:27:52.123Z" },
{ url = "https://files.pythonhosted.org/packages/a7/ac/3d57249d4acba66a33eaef794edb5b2a2222ca449ae08800f8abe9286645/ijson-3.4.0.post0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0b473112e72c0c506da425da3278367b6680f340ecc093084693a1e819d28435", size = 88278, upload-time = "2025-10-10T05:27:55.403Z" },
{ url = "https://files.pythonhosted.org/packages/12/fb/2d068d23d1a665f500282ceb6f2473952a95fc7107d739fd629b4ab41959/ijson-3.4.0.post0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:043f9b7cf9cc744263a78175e769947733710d2412d25180df44b1086b23ebd5", size = 59898, upload-time = "2025-10-10T05:27:56.361Z" },
{ url = "https://files.pythonhosted.org/packages/26/3d/8b14589dfb0e5dbb7bcf9063e53d3617c041cf315ff3dfa60945382237ce/ijson-3.4.0.post0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b55e49045f4c8031f3673f56662fd828dc9e8d65bd3b03a9420dda0d370e64ba", size = 59945, upload-time = "2025-10-10T05:27:57.581Z" },
{ url = "https://files.pythonhosted.org/packages/77/57/086a75094397d4b7584698a540a279689e12905271af78cdfc903bf9eaf8/ijson-3.4.0.post0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:11f13b73194ea2a5a8b4a2863f25b0b4624311f10db3a75747b510c4958179b0", size = 131318, upload-time = "2025-10-10T05:27:58.453Z" },
{ url = "https://files.pythonhosted.org/packages/df/35/7f61e9ce4a9ff1306ec581eb851f8a660439126d92ee595c6dc8084aac97/ijson-3.4.0.post0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:659acb2843433e080c271ecedf7d19c71adde1ee5274fc7faa2fec0a793f9f1c", size = 137990, upload-time = "2025-10-10T05:27:59.328Z" },
{ url = "https://files.pythonhosted.org/packages/59/bf/590bbc3c3566adce5e2f43ba5894520cbaf19a3e7f38c1250926ba67eee4/ijson-3.4.0.post0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:deda4cfcaafa72ca3fa845350045b1d0fef9364ec9f413241bb46988afbe6ee6", size = 134416, upload-time = "2025-10-10T05:28:00.317Z" },
{ url = "https://files.pythonhosted.org/packages/24/c1/fb719049851979df71f3e039d6f1a565d349c9cb1b29c0f8775d9db141b4/ijson-3.4.0.post0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47352563e8c594360bacee2e0753e97025f0861234722d02faace62b1b6d2b2a", size = 138034, upload-time = "2025-10-10T05:28:01.627Z" },
{ url = "https://files.pythonhosted.org/packages/10/ce/ccda891f572876aaf2c43f0b2079e31d5b476c3ae53196187eab1a788eff/ijson-3.4.0.post0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5a48b9486242d1295abe7fd0fbb6308867da5ca3f69b55c77922a93c2b6847aa", size = 132510, upload-time = "2025-10-10T05:28:03.141Z" },
{ url = "https://files.pythonhosted.org/packages/11/b5/ca8e64ab7cf5252f358e467be767630f085b5bbcd3c04333a3a5f36c3dd3/ijson-3.4.0.post0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9c0886234d1fae15cf4581a430bdba03d79251c1ab3b07e30aa31b13ef28d01c", size = 134907, upload-time = "2025-10-10T05:28:04.438Z" },
{ url = "https://files.pythonhosted.org/packages/7d/fe/3b6af0025288e769dbfa30485dae1b3bd3f33f00390f3ee532cbb1c33e9b/ijson-3.4.0.post0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b607a500fca26101be47d2baf7cddb457b819ab60a75ce51ed1092a40da8b2f9", size = 87847, upload-time = "2025-10-10T05:28:07.229Z" },
{ url = "https://files.pythonhosted.org/packages/6e/a5/95ee2ca82f3b1a57892452f6e5087607d56c620beb8ce625475194568698/ijson-3.4.0.post0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4827d9874a6a81625412c59f7ca979a84d01f7f6bfb3c6d4dc4c46d0382b14e0", size = 59815, upload-time = "2025-10-10T05:28:08.448Z" },
{ url = "https://files.pythonhosted.org/packages/51/8d/5a704ab3c17c55c21c86423458db8610626ca99cc9086a74dfeb7ee9054c/ijson-3.4.0.post0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d4d4afec780881edb2a0d2dd40b1cdbe246e630022d5192f266172a0307986a7", size = 59648, upload-time = "2025-10-10T05:28:09.307Z" },
{ url = "https://files.pythonhosted.org/packages/25/56/ca5d6ca145d007f30b44e747f3c163bc08710ce004af0deaad4a2301339b/ijson-3.4.0.post0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432fb60ffb952926f9438e0539011e2dfcd108f8426ee826ccc6173308c3ff2c", size = 138279, upload-time = "2025-10-10T05:28:10.489Z" },
{ url = "https://files.pythonhosted.org/packages/c3/d3/22e3cc806fcdda7ad4c8482ed74db7a017d4a1d49b4300c7bc07052fb561/ijson-3.4.0.post0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54a0e3e05d9a0c95ecba73d9579f146cf6d5c5874116c849dba2d39a5f30380e", size = 149110, upload-time = "2025-10-10T05:28:12.263Z" },
{ url = "https://files.pythonhosted.org/packages/3e/04/efb30f413648b9267f5a33920ac124d7ebef3bc4063af8f6ffc8ca11ddcb/ijson-3.4.0.post0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05807edc0bcbd222dc6ea32a2b897f0c81dc7f12c8580148bc82f6d7f5e7ec7b", size = 149026, upload-time = "2025-10-10T05:28:13.557Z" },
{ url = "https://files.pythonhosted.org/packages/2d/cf/481165f7046ade32488719300a3994a437020bc41cfbb54334356348f513/ijson-3.4.0.post0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a5269af16f715855d9864937f9dd5c348ca1ac49cee6a2c7a1b7091c159e874f", size = 150012, upload-time = "2025-10-10T05:28:14.859Z" },
{ url = "https://files.pythonhosted.org/packages/0f/24/642e3289917ecf860386e26dfde775f9962d26ab7f6c2e364ed3ca3c25d8/ijson-3.4.0.post0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b200df83c901f5bfa416d069ac71077aa1608f854a4c50df1b84ced560e9c9ec", size = 142193, upload-time = "2025-10-10T05:28:16.131Z" },
{ url = "https://files.pythonhosted.org/packages/0f/f5/fd2f038abe95e553e1c3ee207cda19db9196eb416e63c7c89699a8cf0db7/ijson-3.4.0.post0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6458bd8e679cdff459a0a5e555b107c3bbacb1f382da3fe0f40e392871eb518d", size = 150904, upload-time = "2025-10-10T05:28:17.401Z" },
{ url = "https://files.pythonhosted.org/packages/1b/20/aaec6977f9d538bbadd760c7fa0f6a0937742abdcc920ec6478a8576e55f/ijson-3.4.0.post0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:114ed248166ac06377e87a245a158d6b98019d2bdd3bb93995718e0bd996154f", size = 87863, upload-time = "2025-10-10T05:28:20.786Z" },
{ url = "https://files.pythonhosted.org/packages/5b/29/06bf56a866e2fe21453a1ad8f3a5d7bca3c723f73d96329656dfee969783/ijson-3.4.0.post0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ffb21203736b08fe27cb30df6a4f802fafb9ef7646c5ff7ef79569b63ea76c57", size = 59806, upload-time = "2025-10-10T05:28:21.596Z" },
{ url = "https://files.pythonhosted.org/packages/ba/ae/e1d0fda91ba7a444b75f0d60cb845fdb1f55d3111351529dcbf4b1c276fe/ijson-3.4.0.post0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:07f20ecd748602ac7f18c617637e53bd73ded7f3b22260bba3abe401a7fc284e", size = 59643, upload-time = "2025-10-10T05:28:22.45Z" },
{ url = "https://files.pythonhosted.org/packages/4d/24/5a24533be2726396cc1724dc237bada09b19715b5bfb0e7b9400db0901ad/ijson-3.4.0.post0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:27aa193d47ffc6bc4e45453896ad98fb089a367e8283b973f1fe5c0198b60b4e", size = 138082, upload-time = "2025-10-10T05:28:23.319Z" },
{ url = "https://files.pythonhosted.org/packages/05/60/026c3efcec23c329657e878cbc0a9a25b42e7eb3971e8c2377cb3284e2b7/ijson-3.4.0.post0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ccddb2894eb7af162ba43b9475ac5825d15d568832f82eb8783036e5d2aebd42", size = 149145, upload-time = "2025-10-10T05:28:24.279Z" },
{ url = "https://files.pythonhosted.org/packages/ed/c2/036499909b7a1bc0bcd85305e4348ad171aeb9df57581287533bdb3497e9/ijson-3.4.0.post0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61ab0b8c5bf707201dc67e02c116f4b6545c4afd7feb2264b989d242d9c4348a", size = 149046, upload-time = "2025-10-10T05:28:25.186Z" },
{ url = "https://files.pythonhosted.org/packages/ba/75/e7736073ad96867c129f9e799e3e65086badd89dbf3911f76d9b3bf8a115/ijson-3.4.0.post0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:254cfb8c124af68327a0e7a49b50bbdacafd87c4690a3d62c96eb01020a685ef", size = 150356, upload-time = "2025-10-10T05:28:26.135Z" },
{ url = "https://files.pythonhosted.org/packages/9d/1b/1c1575d2cda136985561fcf774fe6c54412cd0fa08005342015af0403193/ijson-3.4.0.post0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:04ac9ca54db20f82aeda6379b5f4f6112fdb150d09ebce04affeab98a17b4ed3", size = 142322, upload-time = "2025-10-10T05:28:27.125Z" },
{ url = "https://files.pythonhosted.org/packages/28/4d/aba9871feb624df8494435d1a9ddc7b6a4f782c6044bfc0d770a4b59f145/ijson-3.4.0.post0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a603d7474bf35e7b3a8e49c8dabfc4751841931301adff3f3318171c4e407f32", size = 151386, upload-time = "2025-10-10T05:28:28.274Z" },
{ url = "https://files.pythonhosted.org/packages/c7/89/4344e176f2c5f5ef3251c9bfa4ddd5b4cf3f9601fd6ec3f677a3ba0b9c71/ijson-3.4.0.post0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:45a0b1c833ed2620eaf8da958f06ac8351c59e5e470e078400d23814670ed708", size = 92342, upload-time = "2025-10-10T05:28:31.389Z" },
{ url = "https://files.pythonhosted.org/packages/d4/b1/85012c586a6645f9fb8bfa3ef62ed2f303c8d73fc7c2f705111582925980/ijson-3.4.0.post0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7809ec8c8f40228edaaa089f33e811dff4c5b8509702652870d3f286c9682e27", size = 62028, upload-time = "2025-10-10T05:28:32.849Z" },
{ url = "https://files.pythonhosted.org/packages/65/ea/7b7e2815c101d78b33e74d64ddb70cccc377afccd5dda76e566ed3fcb56f/ijson-3.4.0.post0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cf4a34c2cfe852aee75c89c05b0a4531c49dc0be27eeed221afd6fbf9c3e149c", size = 61773, upload-time = "2025-10-10T05:28:34.016Z" },
{ url = "https://files.pythonhosted.org/packages/59/7d/2175e599cb77a64f528629bad3ce95dfdf2aa6171d313c1fc00bbfaf0d22/ijson-3.4.0.post0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:a39d5d36067604b26b78de70b8951c90e9272450642661fe531a8f7a6936a7fa", size = 198562, upload-time = "2025-10-10T05:28:34.878Z" },
{ url = "https://files.pythonhosted.org/packages/13/97/82247c501c92405bb2fc44ab5efb497335bcb9cf0f5d3a0b04a800737bd8/ijson-3.4.0.post0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83fc738d81c9ea686b452996110b8a6678296c481e0546857db24785bff8da92", size = 216212, upload-time = "2025-10-10T05:28:36.208Z" },
{ url = "https://files.pythonhosted.org/packages/95/ca/b956f507bb02e05ce109fd11ab6a2c054f8b686cc5affe41afe50630984d/ijson-3.4.0.post0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b2a81aee91633868f5b40280e2523f7c5392e920a5082f47c5e991e516b483f6", size = 206618, upload-time = "2025-10-10T05:28:37.243Z" },
{ url = "https://files.pythonhosted.org/packages/3e/12/e827840ab81d86a9882e499097934df53294f05155f1acfcb9a211ac1142/ijson-3.4.0.post0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:56169e298c5a2e7196aaa55da78ddc2415876a74fe6304f81b1eb0d3273346f7", size = 210689, upload-time = "2025-10-10T05:28:38.252Z" },
{ url = "https://files.pythonhosted.org/packages/1b/3b/59238d9422c31a4aefa22ebeb8e599e706158a0ab03669ef623be77a499a/ijson-3.4.0.post0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:eeb9540f0b1a575cbb5968166706946458f98c16e7accc6f2fe71efa29864241", size = 199927, upload-time = "2025-10-10T05:28:39.233Z" },
{ url = "https://files.pythonhosted.org/packages/b6/0f/ec01c36c128c37edb8a5ae8f3de3256009f886338d459210dfe121ee4ba9/ijson-3.4.0.post0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ba3478ff0bb49d7ba88783f491a99b6e3fa929c930ab062d2bb7837e6a38fe88", size = 204455, upload-time = "2025-10-10T05:28:40.644Z" },
{ url = "https://files.pythonhosted.org/packages/af/0b/a4ce8524fd850302bbf5d9f38d07c0fa981fdbe44951d2fcd036935b67dd/ijson-3.4.0.post0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:da6a21b88cbf5ecbc53371283988d22c9643aa71ae2873bbeaefd2dea3b6160b", size = 88361, upload-time = "2025-10-10T05:28:43.73Z" },
{ url = "https://files.pythonhosted.org/packages/be/90/a5e5f33e46f28174a9c8142d12dcb3d26ce358d9a2230b9b15f5c987b3a5/ijson-3.4.0.post0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cf24a48a1c3ca9d44a04feb59ccefeb9aa52bb49b9cb70ad30518c25cce74bb7", size = 59960, upload-time = "2025-10-10T05:28:44.585Z" },
{ url = "https://files.pythonhosted.org/packages/83/e2/551dd7037dda759aa0ce53f0d3d7be03b03c6b05c0b0a5d5ab7a47e6b4b1/ijson-3.4.0.post0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d14427d366f95f21adcb97d0ed1f6d30f6fdc04d0aa1e4de839152c50c2b8d65", size = 59957, upload-time = "2025-10-10T05:28:45.748Z" },
{ url = "https://files.pythonhosted.org/packages/ac/b9/3006384f85cc26cf83dbbd542d362cc336f1e1ddd491e32147cfa46ea8ae/ijson-3.4.0.post0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:339d49f6c5d24051c85d9226be96d2d56e633cb8b7d09dd8099de8d8b51a97e2", size = 139967, upload-time = "2025-10-10T05:28:47.229Z" },
{ url = "https://files.pythonhosted.org/packages/77/3b/b5234add8115cbfe8635b6c152fb527327f45e4c0f0bf2e93844b36b5217/ijson-3.4.0.post0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7206afcb396aaef66c2b066997b4e9d9042c4b7d777f4d994e9cec6d322c2fe6", size = 149196, upload-time = "2025-10-10T05:28:48.226Z" },
{ url = "https://files.pythonhosted.org/packages/a2/d2/c4ae543e37d7a9fba09740c221976a63705dbad23a9cda9022fc9fa0f3de/ijson-3.4.0.post0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c8dd327da225887194fe8b93f2b3c9c256353e14a6b9eefc940ed17fde38f5b8", size = 148516, upload-time = "2025-10-10T05:28:49.237Z" },
{ url = "https://files.pythonhosted.org/packages/0d/a1/914b5fb1c26af2474cd04841626e0e95576499a4ca940661fb105ee12dd2/ijson-3.4.0.post0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:4810546e66128af51fd4a0c9a640e84e8508e9c15c4f247d8a3e3253b20e1465", size = 149770, upload-time = "2025-10-10T05:28:50.501Z" },
{ url = "https://files.pythonhosted.org/packages/7a/c1/51c3584102d0d85d4aa10cc88dbbe431ecb9fe98160a9e2fad62a4456aed/ijson-3.4.0.post0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:103a0838061297d063bca81d724b0958b616f372bd893bbc278320152252c652", size = 143688, upload-time = "2025-10-10T05:28:51.823Z" },
{ url = "https://files.pythonhosted.org/packages/47/3d/a54f13d766332620bded8ee76bcdd274509ecc53cf99573450f95b3ad910/ijson-3.4.0.post0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:40007c977e230e04118b27322f25a72ae342a3d61464b2057fcd9b21eeb7427a", size = 150688, upload-time = "2025-10-10T05:28:52.757Z" },
{ url = "https://files.pythonhosted.org/packages/69/1c/8a199fded709e762aced89bb7086973c837e432dd714bbad78a6ac789c23/ijson-3.4.0.post0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:226447e40ca9340a39ed07d68ea02ee14b52cb4fe649425b256c1f0073531c83", size = 92345, upload-time = "2025-10-10T05:28:55.657Z" },
{ url = "https://files.pythonhosted.org/packages/be/60/04e97f6a403203bd2eb8849570bdce5719d696b5fb96aa2a62566fe7a1d9/ijson-3.4.0.post0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2c88f0669d45d4b1aa017c9b68d378e7cd15d188dfb6f0209adc78b7f45590a7", size = 62029, upload-time = "2025-10-10T05:28:56.561Z" },
{ url = "https://files.pythonhosted.org/packages/2a/97/e88295f9456ba939d90d4603af28fcabda3b443ef55e709e9381df3daa58/ijson-3.4.0.post0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:56b3089dc28c12492d92cc4896d2be585a89ecae34e25d08c1df88f21815cb50", size = 61776, upload-time = "2025-10-10T05:28:57.401Z" },
{ url = "https://files.pythonhosted.org/packages/1b/9f/0e9c236e720c2de887ab0d7cad8a15d2aa55fb449f792437fc99899957a9/ijson-3.4.0.post0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:c117321cfa7b749cc1213f9b4c80dc958f0a206df98ec038ae4bcbbdb8463a15", size = 199808, upload-time = "2025-10-10T05:28:58.62Z" },
{ url = "https://files.pythonhosted.org/packages/0e/70/c21de30e7013e074924cd82057acfc5760e7b2cc41180f80770621b0ad36/ijson-3.4.0.post0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8311f48db6a33116db5c81682f08b6e2405501a4b4e460193ae69fec3cd1f87a", size = 217152, upload-time = "2025-10-10T05:28:59.656Z" },
{ url = "https://files.pythonhosted.org/packages/64/78/63a0bcc0707037df4e22bb836451279d850592258c859685a402c27f5d6d/ijson-3.4.0.post0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:91c61a3e63e04da648737e6b4abd537df1b46fb8cdf3219b072e790bb3c1a46b", size = 207663, upload-time = "2025-10-10T05:29:00.73Z" },
{ url = "https://files.pythonhosted.org/packages/7d/85/834e9838d69893cb7567e1210be044444213c78f7414aaf1cd241df16078/ijson-3.4.0.post0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1709171023ce82651b2f132575c2e6282e47f64ad67bd3260da476418d0e7895", size = 211157, upload-time = "2025-10-10T05:29:01.87Z" },
{ url = "https://files.pythonhosted.org/packages/2e/9b/9fda503799ebc30397710552e5dedc1d98d9ea6a694e5717415892623a94/ijson-3.4.0.post0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:5f0a72b1e3c0f78551670c12b2fdc1bf05f2796254d9c2055ba319bec2216020", size = 200231, upload-time = "2025-10-10T05:29:02.883Z" },
{ url = "https://files.pythonhosted.org/packages/15/f3/6419d1d5795a16591233d3aa3747b084e82c0c1d7184bdad9be638174560/ijson-3.4.0.post0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b982a3597b0439ce9c8f4cfc929d86c6ed43907908be1e8463a34dc35fe5b258", size = 204825, upload-time = "2025-10-10T05:29:04.242Z" },
{ url = "https://files.pythonhosted.org/packages/43/66/27cfcea16e85b95e33814eae2052dab187206b8820cdd90aa39d32ffb441/ijson-3.4.0.post0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:add9242f886eae844a7410b84aee2bbb8bdc83c624f227cb1fdb2d0476a96cb1", size = 57029, upload-time = "2025-10-10T05:29:19.733Z" },
{ url = "https://files.pythonhosted.org/packages/b8/1b/df3f1561c6629241fb2f8bd7ea1da14e3c2dd16fe9d7cbc97120870ed09c/ijson-3.4.0.post0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:69718ed41710dfcaa7564b0af42abc05875d4f7aaa24627c808867ef32634bc7", size = 56523, upload-time = "2025-10-10T05:29:20.641Z" },
{ url = "https://files.pythonhosted.org/packages/39/0a/6c6a3221ddecf62b696fde0e864415237e05b9a36ab6685a606b8fb3b5a2/ijson-3.4.0.post0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:636b6eca96c6c43c04629c6b37fad0181662eaacf9877c71c698485637f752f9", size = 70546, upload-time = "2025-10-10T05:29:21.526Z" },
{ url = "https://files.pythonhosted.org/packages/42/cb/edf69755e86a3a9f8b418efd60239cb308af46c7c8e12f869423f51c9851/ijson-3.4.0.post0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb5e73028f6e63d27b3d286069fe350ed80a4ccc493b022b590fea4bb086710d", size = 70532, upload-time = "2025-10-10T05:29:22.718Z" },
{ url = "https://files.pythonhosted.org/packages/96/7e/c8730ea39b8712622cd5a1bdff676098208400e37bb92052ba52f93e2aa1/ijson-3.4.0.post0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:461acf4320219459dabe5ed90a45cb86c9ba8cc6d6db9dad0d9427d42f57794c", size = 67927, upload-time = "2025-10-10T05:29:23.596Z" },
]
[[package]]
name = "imagehash"
version = "4.3.2"
@@ -3191,6 +3267,7 @@ dependencies = [
{ name = "flower", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "flower", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "gotenberg-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "gotenberg-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "httpx-oauth", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "httpx-oauth", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "ijson", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "imap-tools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "imap-tools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "langdetect", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "langdetect", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3214,6 +3291,7 @@ dependencies = [
{ name = "rapidfuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "rapidfuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "sentence-transformers", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "sentence-transformers", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3340,6 +3418,8 @@ requires-dist = [
{ name = "gotenberg-client", specifier = "~=0.13.1" }, { name = "gotenberg-client", specifier = "~=0.13.1" },
{ name = "granian", extras = ["uvloop"], marker = "extra == 'webserver'", specifier = "~=2.6.0" }, { name = "granian", extras = ["uvloop"], marker = "extra == 'webserver'", specifier = "~=2.6.0" },
{ name = "httpx-oauth", specifier = "~=0.16" }, { name = "httpx-oauth", specifier = "~=0.16" },
{ name = "ijson" },
{ name = "ijson", specifier = "~=3.3" },
{ name = "imap-tools", specifier = "~=1.11.0" }, { name = "imap-tools", specifier = "~=1.11.0" },
{ name = "jinja2", specifier = "~=3.1.5" }, { name = "jinja2", specifier = "~=3.1.5" },
{ name = "langdetect", specifier = "~=1.0.9" }, { name = "langdetect", specifier = "~=1.0.9" },
@@ -3369,6 +3449,7 @@ requires-dist = [
{ name = "rapidfuzz", specifier = "~=3.14.0" }, { name = "rapidfuzz", specifier = "~=3.14.0" },
{ name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" }, { name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" },
{ name = "regex", specifier = ">=2025.9.18" }, { name = "regex", specifier = ">=2025.9.18" },
{ name = "rich", specifier = "~=14.1.0" },
{ name = "scikit-learn", specifier = "~=1.7.0" }, { name = "scikit-learn", specifier = "~=1.7.0" },
{ name = "sentence-transformers", specifier = ">=4.1" }, { name = "sentence-transformers", specifier = ">=4.1" },
{ name = "setproctitle", specifier = "~=1.3.4" }, { name = "setproctitle", specifier = "~=1.3.4" },
@@ -4662,15 +4743,15 @@ wheels = [
[[package]]
name = "rich"
version = "14.3.1"
version = "14.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "markdown-it-py", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pygments", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a1/84/4831f881aa6ff3c976f6d6809b58cdfa350593ffc0dc3c58f5f6586780fb/rich-14.3.1.tar.gz", hash = "sha256:b8c5f568a3a749f9290ec6bddedf835cec33696bfc1e48bcfecb276c7386e4b8", size = 230125, upload-time = "2026-01-24T21:40:44.847Z" }
sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/87/2a/a1810c8627b9ec8c57ec5ec325d306701ae7be50235e8fd81266e002a3cc/rich-14.3.1-py3-none-any.whl", hash = "sha256:da750b1aebbff0b372557426fb3f35ba56de8ef954b3190315eb64076d6fb54e", size = 309952, upload-time = "2026-01-24T21:40:42.969Z" },
{ url = "https://files.pythonhosted.org/packages/e3/30/3c4d035596d3cf444529e0b2953ad0466f6049528a879d27534700580395/rich-14.1.0-py3-none-any.whl", hash = "sha256:536f5f1785986d6dbdea3c75205c473f970777b4a0d6c6dd1b696aa05a3fa04f", size = 243368, upload-time = "2025-07-25T07:32:56.73Z" },
]

[[package]]