# mirror of https://github.com/paperless-ngx/paperless-ngx.git
# synced 2026-01-14 21:54:22 -06:00
# 564 lines · 19 KiB · Python
"""
|
|
Document consumer management command.
|
|
|
|
Watches a consumption directory for new documents and queues them for processing.
|
|
Uses watchfiles for efficient file system monitoring with support for both
|
|
native OS notifications and polling fallback.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from threading import Event
|
|
from time import monotonic
|
|
from typing import TYPE_CHECKING
|
|
from typing import Final
|
|
|
|
from django import db
|
|
from django.conf import settings
|
|
from django.core.management.base import BaseCommand
|
|
from django.core.management.base import CommandError
|
|
from watchfiles import Change
|
|
from watchfiles import DefaultFilter
|
|
from watchfiles import watch
|
|
|
|
from documents.data_models import ConsumableDocument
|
|
from documents.data_models import DocumentMetadataOverrides
|
|
from documents.data_models import DocumentSource
|
|
from documents.models import Tag
|
|
from documents.parsers import get_supported_file_extensions
|
|
from documents.tasks import consume_file
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Iterator
|
|
|
|
|
|
logger = logging.getLogger("paperless.management.consumer")
|
|
|
|
|
|
@dataclass
class TrackedFile:
    """Represents a file being tracked for stability."""

    # Absolute path of the file under observation.
    path: Path
    # monotonic() timestamp of the most recent watch event for this file.
    last_event_time: float
    # Stats captured by the last successful update_stats() call; None until
    # the file has been stat()ed at least once.
    last_mtime: float | None = None
    last_size: int | None = None

    def update_stats(self) -> bool:
        """
        Update file stats. Returns True if file exists and stats were updated.
        """
        try:
            stat = self.path.stat()
        except OSError:
            # FileNotFoundError and PermissionError are OSError subclasses,
            # so a single OSError clause covers every stat() failure mode.
            return False
        self.last_mtime = stat.st_mtime
        self.last_size = stat.st_size
        return True

    def is_unchanged(self) -> bool:
        """
        Check if file stats match the previously recorded values.

        Returns False if file doesn't exist or stats changed.
        """
        try:
            stat = self.path.stat()
        except OSError:
            return False
        return stat.st_mtime == self.last_mtime and stat.st_size == self.last_size
|
|
|
|
|
|
class FileStabilityTracker:
    """
    Tracks file events and decides when a file has settled enough to consume.

    A file counts as stable once all of the following hold:
    1. No new events arrived for it within the stability delay
    2. Its size and modification time are unchanged since last observed
    3. It still exists as a regular file

    This copes with awkward producers such as chunked network copies,
    scanners that repeatedly open/close the target, temporary files that
    get renamed into place, and files deleted before ever settling.
    """

    def __init__(self, stability_delay: float = 1.0) -> None:
        """
        Initialize the tracker.

        Args:
            stability_delay: Time in seconds a file must remain unchanged
                before being considered stable.
        """
        self.stability_delay = stability_delay
        self._tracked: dict[Path, TrackedFile] = {}

    def track(self, path: Path, change: Change) -> None:
        """
        Register a file event.

        Args:
            path: The file path that changed.
            change: The type of change (added, modified, deleted).
        """
        path = path.resolve()

        if change == Change.deleted:
            # A deletion ends tracking immediately.
            self._tracked.pop(path, None)
            logger.debug(f"Stopped tracking deleted file: {path}")
            return

        if change not in (Change.added, Change.modified):
            return

        now = monotonic()
        entry = self._tracked.get(path)
        if entry is not None:
            # Known file: restart its stability window and refresh stats.
            entry.last_event_time = now
            entry.update_stats()
            logger.debug(f"Updated tracking for: {path}")
            return

        # New file: only track it if we can actually stat it.
        entry = TrackedFile(path=path, last_event_time=now)
        if entry.update_stats():
            self._tracked[path] = entry
            logger.debug(f"Started tracking: {path}")
        else:
            logger.debug(f"Could not stat file, not tracking: {path}")

    def get_stable_files(self) -> Iterator[Path]:
        """
        Yield files that have been stable for the configured delay.

        Files are removed from tracking once yielded or determined to be invalid.
        """
        now = monotonic()
        stable: list[Path] = []
        drop: list[Path] = []

        for path, entry in list(self._tracked.items()):
            # Still inside the quiet period - check again later.
            if now - entry.last_event_time < self.stability_delay:
                continue

            # Quiet period elapsed; confirm the stats really are unchanged.
            if not entry.is_unchanged():
                if entry.update_stats():
                    # Content moved under us - restart the stability window.
                    entry.last_event_time = now
                    logger.debug(f"File changed during stability check: {path}")
                else:
                    # Vanished entirely - forget it.
                    drop.append(path)
                    logger.debug(f"File disappeared during stability check: {path}")
                continue

            # Stable - but only regular files are eligible for consumption.
            try:
                is_regular = path.is_file()
            except (PermissionError, OSError) as e:
                logger.warning(f"Cannot access {path}: {e}")
                drop.append(path)
                continue

            if is_regular:
                stable.append(path)
                logger.info(f"File is stable: {path}")
            else:
                # Directory, symlink, fifo, etc.
                drop.append(path)
                logger.debug(f"Path is not a regular file: {path}")

        # Discard invalid entries first, then hand out the stable ones.
        for path in drop:
            self._tracked.pop(path, None)

        for path in stable:
            self._tracked.pop(path, None)
            yield path

    def has_pending_files(self) -> bool:
        """Check if there are files waiting for stability check."""
        return bool(self._tracked)

    @property
    def pending_count(self) -> int:
        """Number of files being tracked."""
        return len(self._tracked)
|
|
|
|
|
|
class ConsumerFilter(DefaultFilter):
    """
    watchfiles filter accepting only supported document types while
    ignoring well-known system files and directories.

    Builds on DefaultFilter's built-in behavior:
    - `ignore_dirs`: directory names to skip (with all their contents)
    - `ignore_entity_patterns`: regexes matched against the entity name only

    The one thing the library cannot do for us - restricting files to
    supported document extensions - is layered on top in __call__.
    """

    # Regex patterns for files to always ignore (matched against filename only)
    # These are passed to DefaultFilter.ignore_entity_patterns
    DEFAULT_IGNORE_PATTERNS: Final[tuple[str, ...]] = (
        r"^\.DS_Store$",
        r"^\.DS_STORE$",
        r"^\._.*",
        r"^desktop\.ini$",
        r"^Thumbs\.db$",
    )

    # Directories to always ignore (passed to DefaultFilter.ignore_dirs)
    # These are matched by directory name, not full path
    DEFAULT_IGNORE_DIRS: Final[tuple[str, ...]] = (
        ".stfolder",  # Syncthing
        ".stversions",  # Syncthing
        ".localized",  # macOS
        "@eaDir",  # Synology NAS
        ".Spotlight-V100",  # macOS
        ".Trashes",  # macOS
        "__MACOSX",  # macOS archive artifacts
    )

    def __init__(
        self,
        *,
        supported_extensions: frozenset[str] | None = None,
        ignore_patterns: list[str] | None = None,
        ignore_dirs: list[str] | None = None,
    ) -> None:
        """
        Initialize the consumer filter.

        Args:
            supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
                If None, uses get_supported_file_extensions().
            ignore_patterns: Additional regex patterns to ignore (matched against filename).
            ignore_dirs: Additional directory names to ignore (merged with defaults).
        """
        # Fall back to the application's supported extension set.
        self._supported_extensions = (
            supported_extensions
            if supported_extensions is not None
            else frozenset(get_supported_file_extensions())
        )

        # Merge caller-supplied patterns/dirs onto the defaults.
        merged_patterns = (*self.DEFAULT_IGNORE_PATTERNS, *(ignore_patterns or ()))
        merged_dirs = (*self.DEFAULT_IGNORE_DIRS, *(ignore_dirs or ()))

        # DefaultFilter does all of the pattern and directory filtering.
        super().__init__(
            ignore_dirs=merged_dirs,
            ignore_entity_patterns=merged_patterns,
            ignore_paths=(),
        )

    def __call__(self, change: Change, path: str) -> bool:
        """
        Filter function for watchfiles.

        Returns True if the path should be watched, False to ignore.

        The parent DefaultFilter handles:
        - Hidden files/directories (starting with .)
        - Directories in ignore_dirs
        - Files/directories matching ignore_entity_patterns

        We additionally filter files by extension.
        """
        # Reject anything the base filter already rules out.
        if not super().__call__(change, path):
            return False

        candidate = Path(path)

        # Directories were fully vetted by the parent filter.
        if candidate.is_dir():
            return True

        # Files must additionally carry a supported extension.
        return self._has_supported_extension(candidate)

    def _has_supported_extension(self, path: Path) -> bool:
        """Check if the file has a supported extension."""
        return path.suffix.lower() in self._supported_extensions
|
|
|
|
|
|
def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
    """
    Walk up the directory tree from filepath to consumption_dir
    and get or create Tag IDs for every directory.

    Returns list of Tag primary keys.
    """
    # Stale DB connections can linger in long-running watchers.
    db.close_old_connections()

    found: set[int] = set()
    for dirname in filepath.relative_to(consumption_dir).parent.parts:
        # Case-insensitive lookup; create with the original casing if absent.
        tag, _created = Tag.objects.get_or_create(
            name__iexact=dirname,
            defaults={"name": dirname},
        )
        found.add(tag.pk)

    return list(found)
|
|
|
|
|
|
def _consume_file(
    filepath: Path,
    consumption_dir: Path,
    *,
    subdirs_as_tags: bool,
) -> None:
    """
    Queue a file for consumption.

    Args:
        filepath: Path to the file to consume.
        consumption_dir: Base consumption directory.
        subdirs_as_tags: Whether to create tags from subdirectory names.
    """
    # The file may have been moved/removed since it was deemed stable.
    try:
        still_a_file = filepath.is_file()
    except (PermissionError, OSError) as e:
        logger.warning(f"Not consuming {filepath}: {e}")
        return
    if not still_a_file:
        logger.debug(f"Not consuming {filepath}: not a file or doesn't exist")
        return

    # Derive tags from the subdirectory structure when configured; a tag
    # failure should not prevent the document itself from being queued.
    tag_ids: list[int] | None = None
    if subdirs_as_tags:
        try:
            tag_ids = _tags_from_path(filepath, consumption_dir)
        except Exception:
            logger.exception(f"Error creating tags from path for {filepath}")

    # Hand the document off to the task queue.
    try:
        logger.info(f"Adding {filepath} to the task queue")
        consume_file.delay(
            ConsumableDocument(
                source=DocumentSource.ConsumeFolder,
                original_file=filepath,
            ),
            DocumentMetadataOverrides(tag_ids=tag_ids),
        )
    except Exception:
        logger.exception(f"Error while queuing document {filepath}")
|
|
|
|
|
|
class Command(BaseCommand):
    """
    Watch a consumption directory and queue new documents for processing.

    Uses watchfiles for efficient file system monitoring. Supports both
    native OS notifications (inotify on Linux, FSEvents on macOS) and
    polling for network filesystems.
    """

    help = "Watch the consumption directory for new documents"

    # For testing - allows tests to stop the consumer via stop_flag.set().
    # Class-level so tests can reach it without an instance reference.
    stop_flag: Event = Event()

    # Watch-loop timeout (seconds) used in testing mode so the stop flag
    # is polled frequently instead of blocking indefinitely.
    testing_timeout_s: Final[float] = 0.5

    def add_arguments(self, parser) -> None:
        parser.add_argument(
            "directory",
            default=None,
            nargs="?",
            help="The consumption directory (defaults to CONSUMPTION_DIR setting)",
        )
        parser.add_argument(
            "--oneshot",
            action="store_true",
            help="Process existing files and exit without watching",
        )
        parser.add_argument(
            "--testing",
            action="store_true",
            help="Enable testing mode with shorter timeouts",
            default=False,
        )

    def handle(self, *args, **options) -> None:
        """
        Validate configuration, process pre-existing files, then watch the
        directory until stopped (unless --oneshot was given).

        Raises:
            CommandError: If no consumption directory is configured, or the
                configured path does not exist or is not a directory.
        """
        # Resolve consumption directory: CLI argument wins over settings.
        directory = options.get("directory")
        if not directory:
            directory = getattr(settings, "CONSUMPTION_DIR", None)
            if not directory:
                raise CommandError("CONSUMPTION_DIR is not configured")

        directory = Path(directory).resolve()

        if not directory.exists():
            raise CommandError(f"Consumption directory does not exist: {directory}")

        if not directory.is_dir():
            raise CommandError(f"Consumption path is not a directory: {directory}")

        # Ensure scratch directory exists
        settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)

        # Snapshot all behavior-affecting settings once up front.
        recursive: bool = settings.CONSUMER_RECURSIVE
        subdirs_as_tags: bool = settings.CONSUMER_SUBDIRS_AS_TAGS
        polling_interval: float = settings.CONSUMER_POLLING_INTERVAL
        stability_delay: float = settings.CONSUMER_STABILITY_DELAY
        ignore_patterns: list[str] = settings.CONSUMER_IGNORE_PATTERNS
        ignore_dirs: list[str] = settings.CONSUMER_IGNORE_DIRS
        is_testing: bool = options.get("testing", False)
        is_oneshot: bool = options.get("oneshot", False)

        # Create filter
        consumer_filter = ConsumerFilter(
            ignore_patterns=ignore_patterns,
            ignore_dirs=ignore_dirs,
        )

        # Queue anything already sitting in the directory before watching,
        # since the watcher only reports changes from this point on.
        self._process_existing_files(
            directory=directory,
            recursive=recursive,
            subdirs_as_tags=subdirs_as_tags,
            consumer_filter=consumer_filter,
        )

        if is_oneshot:
            logger.info("Oneshot mode: processed existing files, exiting")
            return

        # Start watching
        self._watch_directory(
            directory=directory,
            recursive=recursive,
            subdirs_as_tags=subdirs_as_tags,
            consumer_filter=consumer_filter,
            polling_interval=polling_interval,
            stability_delay=stability_delay,
            is_testing=is_testing,
        )

        logger.debug("Consumer exiting")

    def _process_existing_files(
        self,
        *,
        directory: Path,
        recursive: bool,
        subdirs_as_tags: bool,
        consumer_filter: ConsumerFilter,
    ) -> None:
        """Process any existing files in the consumption directory."""
        logger.info(f"Processing existing files in {directory}")

        glob_pattern = "**/*" if recursive else "*"

        for filepath in directory.glob(glob_pattern):
            if not filepath.is_file():
                continue

            # Apply the same acceptance rules the watcher uses.
            if not consumer_filter(Change.added, str(filepath)):
                continue

            _consume_file(
                filepath=filepath,
                consumption_dir=directory,
                subdirs_as_tags=subdirs_as_tags,
            )

    def _watch_directory(
        self,
        *,
        directory: Path,
        recursive: bool,
        subdirs_as_tags: bool,
        consumer_filter: ConsumerFilter,
        polling_interval: float,
        stability_delay: float,
        is_testing: bool,
    ) -> None:
        """
        Watch directory for changes and process stable files.

        The watch loop restarts after each batch of events so the timeout
        can be reconfigured: a short timeout while files are pending
        stability checks, an indefinite wait otherwise.
        """
        use_polling = polling_interval > 0
        poll_delay_ms = int(polling_interval * 1000) if use_polling else 0

        if use_polling:
            logger.info(
                f"Watching {directory} using polling (interval: {polling_interval}s)",
            )
        else:
            logger.info(f"Watching {directory} using native file system events")

        # Create stability tracker
        tracker = FileStabilityTracker(stability_delay=stability_delay)

        # Calculate timeouts (rust_timeout=0 means wait indefinitely)
        stability_timeout_ms = int(stability_delay * 1000)
        testing_timeout_ms = int(self.testing_timeout_s * 1000)

        # Start with no timeout (wait indefinitely for first event)
        # unless in testing mode
        timeout_ms = testing_timeout_ms if is_testing else 0

        self.stop_flag.clear()

        while not self.stop_flag.is_set():
            try:
                for changes in watch(
                    directory,
                    watch_filter=consumer_filter,
                    rust_timeout=timeout_ms,
                    yield_on_timeout=True,
                    force_polling=use_polling,
                    poll_delay_ms=poll_delay_ms,
                    recursive=recursive,
                    stop_event=self.stop_flag,
                ):
                    # Process each change
                    for change_type, path in changes:
                        path = Path(path).resolve()
                        # Deleted paths fail is_file(), but the tracker must
                        # still see deletion events so it can drop the file
                        # immediately instead of waiting for a stability
                        # check to notice it is gone.
                        if change_type != Change.deleted and not path.is_file():
                            continue
                        logger.debug(f"Event: {change_type.name} for {path}")
                        tracker.track(path, change_type)

                    # Check for stable files
                    for stable_path in tracker.get_stable_files():
                        _consume_file(
                            filepath=stable_path,
                            consumption_dir=directory,
                            subdirs_as_tags=subdirs_as_tags,
                        )

                    # Exit watch loop to reconfigure timeout
                    break

                # Determine next timeout
                if tracker.has_pending_files():
                    # Check pending files at stability interval
                    timeout_ms = stability_timeout_ms
                elif is_testing:
                    # In testing, use short timeout to check stop flag
                    timeout_ms = testing_timeout_ms
                else:
                    # No pending files, wait indefinitely
                    timeout_ms = 0

            except KeyboardInterrupt:
                logger.info("Received interrupt, stopping consumer")
                self.stop_flag.set()
|