From 94a2e6ff583323bc71b51ab44bc502012fc20fdc Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Mon, 12 Jan 2026 08:35:43 -0800
Subject: [PATCH] Separates out the ignore file from the ignore folder and
 updates documentation

---
 docs/configuration.md                         |  68 ++++++---
 .../management/commands/document_consumer.py  | 133 +++++++++---------
 .../tests/test_management_consumer.py         |  48 ++++++-
 src/paperless/settings.py                     |  12 +-
 4 files changed, 162 insertions(+), 99 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 68c874183..fa7d35b65 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1168,21 +1168,44 @@ don't exist yet.
 
 #### [`PAPERLESS_CONSUMER_IGNORE_PATTERNS=`](#PAPERLESS_CONSUMER_IGNORE_PATTERNS) {#PAPERLESS_CONSUMER_IGNORE_PATTERNS}
 
-: By default, paperless ignores certain files and folders in the
-consumption directory, such as system files created by the Mac OS
-or hidden folders some tools use to store data.
+: Additional regex patterns for files to ignore in the consumption directory. Patterns are matched against filenames only (not full paths)
+using Python's `re.search()`, so anchor patterns with `^` and `$` where needed.
 
-    This can be adjusted by configuring a custom json array with
-    patterns to exclude.
+    See the [watchfiles documentation](https://watchfiles.helpmanual.io/api/filters/#watchfiles.BaseFilter.ignore_entity_patterns).
 
-    For example, `.DS_STORE/*` will ignore any files found in a folder
-    named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`
+    This setting is for additional patterns beyond the built-in defaults. Common system files and directories are already ignored automatically.
 
-    A pattern like `._*` will ignore anything starting with `._`, including:
-    `._foo.pdf` and `._bar/foo.pdf`
+    Example custom patterns:
 
-    Defaults to
-    `[".DS_Store", ".DS_STORE", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*", "Thumbs.db"]`.
+    ```json
+    ["^temp_", "\\.bak$", "^~"]
+    ```
+
+    This would ignore:
+
+    - Files starting with `temp_` (e.g., `temp_scan.pdf`)
+    - Files ending with `.bak` (e.g., `document.pdf.bak`)
+    - Files starting with `~` (e.g., `~$document.docx`)
+
+    Defaults to `[]` (empty list, uses only built-in defaults).
+
+    The default ignores are `[.DS_Store, .DS_STORE, ._*, desktop.ini, Thumbs.db]` and cannot be overridden.
+
+#### [`PAPERLESS_CONSUMER_IGNORE_DIRS=`](#PAPERLESS_CONSUMER_IGNORE_DIRS) {#PAPERLESS_CONSUMER_IGNORE_DIRS}
+
+: Additional directory names to ignore in the consumption directory. Directories matching these names (and all their contents) will be skipped.
+
+    This setting is for additional directories beyond the built-in defaults. Matching is done by directory name only, not full path.
+
+    Example:
+
+    ```json
+    ["temp", "incoming", ".hidden"]
+    ```
+
+    Defaults to `[]` (empty list, uses only built-in defaults).
+
+    The default ignored directories are `[.stfolder, .stversions, .localized, @eaDir, .Spotlight-V100, .Trashes, __MACOSX]` and cannot be overridden.
 
 #### [`PAPERLESS_CONSUMER_BARCODE_SCANNER=`](#PAPERLESS_CONSUMER_BARCODE_SCANNER) {#PAPERLESS_CONSUMER_BARCODE_SCANNER}
 
@@ -1283,23 +1306,22 @@ within your documents.
 
 #### [`PAPERLESS_CONSUMER_POLLING_INTERVAL=`](#PAPERLESS_CONSUMER_POLLING_INTERVAL) {#PAPERLESS_CONSUMER_POLLING_INTERVAL}
 
-: If paperless won't find documents added to your consume folder, it
-might not be able to automatically detect filesystem changes. In
-that case, specify a polling interval in seconds here, which will
-then cause paperless to periodically check your consumption
-directory for changes. This will also disable listening for file
-system changes with `inotify`.
+: Configures how the consumer detects new files in the consumption directory.
 
-    Defaults to 0, which disables polling and uses filesystem
-    notifications.
+    When set to `0` (default), paperless uses native filesystem notifications for efficient, immediate detection of new files.
+
+    When set to a positive number, paperless polls the consumption directory at that interval in seconds. Use polling for network filesystems (NFS, SMB/CIFS) where native notifications may not work reliably.
+
+    Defaults to 0.
 
 #### [`PAPERLESS_CONSUMER_STABILITY_DELAY=`](#PAPERLESS_CONSUMER_STABILITY_DELAY) {#PAPERLESS_CONSUMER_STABILITY_DELAY}
 
-: Once a file has been detected in the consume folder, it must remain unchanged for this
-many seconds before consumption will start on it. If the file is modified, its size changes
-or the watching detects any other change on it, the timer will restart.
+: Sets the time in seconds that a file must remain unchanged (same size and modification time) before paperless will begin consuming it.
 
-    Defaults to 5.
+    Increase this value if you experience issues with files being consumed before they are fully written, particularly on slower network storage or
+    with certain scanner quirks.
+
+    Defaults to 5.0 seconds.
 
 ## Workflow webhooks
 
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index 82e0d2ead..8f815c6d7 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -9,7 +9,6 @@ native OS notifications and polling fallback.
 from __future__ import annotations
 
 import logging
-import re
 from dataclasses import dataclass
 from pathlib import Path
 from threading import Event
@@ -57,7 +56,7 @@ class TrackedFile:
             self.last_mtime = stat.st_mtime
             self.last_size = stat.st_size
             return True
-        except (FileNotFoundError, PermissionError):
+        except (FileNotFoundError, PermissionError, OSError):
             return False
 
     def is_unchanged(self) -> bool:
@@ -68,7 +67,7 @@
         try:
             stat = self.path.stat()
             return stat.st_mtime == self.last_mtime and stat.st_size == self.last_size
-        except (FileNotFoundError, PermissionError):
+        except (FileNotFoundError, PermissionError, OSError):
             return False
 
 
@@ -138,7 +137,7 @@ class FileStabilityTracker:
         to_remove: list[Path] = []
         to_yield: list[Path] = []
 
-        for path, tracked in self._tracked.items():
+        for path, tracked in list(self._tracked.items()):
             time_since_event = current_time - tracked.last_event_time
 
             if time_since_event < self.stability_delay:
@@ -165,7 +164,7 @@
                 # Not a regular file (directory, symlink, etc.)
                 to_remove.append(path)
                 logger.debug(f"Path is not a regular file: {path}")
-            except (PermissionError, FileNotFoundError) as e:
+            except (PermissionError, OSError) as e:
                 logger.warning(f"Cannot access {path}: {e}")
                 to_remove.append(path)
 
@@ -190,34 +189,37 @@
 
 class ConsumerFilter(DefaultFilter):
     """
-    Custom filter for the document consumer.
+    Filter for watchfiles that accepts only supported document types
+    and ignores system files/directories.
 
-    Filters files based on:
-    - Supported file extensions
-    - User-configured ignore patterns (regex)
-    - Default ignore patterns for common system files
+    Extends DefaultFilter leveraging its built-in filtering:
+    - `ignore_dirs`: Directory names to ignore (and all their contents)
+    - `ignore_entity_patterns`: Regex patterns matched against filename/dirname only
+
+    We add custom logic for file extension filtering (only accept supported
+    document types), which the library doesn't provide.
     """
 
-    # Default regex patterns to ignore (matched against filename only)
-    DEFAULT_IGNORE_PATTERNS: Final[frozenset[str]] = frozenset(
-        {
-            r"^\.DS_Store$",
-            r"^\.DS_STORE$",
-            r"^\._.*",
-            r"^desktop\.ini$",
-            r"^Thumbs\.db$",
-        },
+    # Regex patterns for files to always ignore (matched against filename only)
+    # These are passed to DefaultFilter.ignore_entity_patterns
+    DEFAULT_IGNORE_PATTERNS: Final[tuple[str, ...]] = (
+        r"^\.DS_Store$",
+        r"^\.DS_STORE$",
+        r"^\._.*",
+        r"^desktop\.ini$",
+        r"^Thumbs\.db$",
     )
 
-    # Directories to always ignore (matched by name via DefaultFilter)
+    # Directories to always ignore (passed to DefaultFilter.ignore_dirs)
+    # These are matched by directory name, not full path
     DEFAULT_IGNORE_DIRS: Final[tuple[str, ...]] = (
-        ".stfolder",
-        ".stversions",
-        ".localized",
-        "@eaDir",
-        ".Spotlight-V100",
-        ".Trashes",
-        "__MACOSX",
+        ".stfolder",  # Syncthing
+        ".stversions",  # Syncthing
+        ".localized",  # macOS
+        "@eaDir",  # Synology NAS
+        ".Spotlight-V100",  # macOS
+        ".Trashes",  # macOS
+        "__MACOSX",  # macOS archive artifacts
     )
 
     def __init__(
@@ -225,38 +227,37 @@ class ConsumerFilter(DefaultFilter):
         *,
         supported_extensions: frozenset[str] | None = None,
         ignore_patterns: list[str] | None = None,
-        consumption_dir: Path | None = None,
+        ignore_dirs: list[str] | None = None,
     ) -> None:
         """
         Initialize the consumer filter.
 
         Args:
-            supported_extensions: Set of supported file extensions (e.g., {".pdf", ".png"}).
-            If None, uses get_supported_file_extensions().
+            supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
+                If None, uses get_supported_file_extensions().
             ignore_patterns: Additional regex patterns to ignore (matched against filename).
-            consumption_dir: Base consumption directory (unused, kept for API compatibility).
+            ignore_dirs: Additional directory names to ignore (merged with defaults).
""" - # Combine default and user patterns - all_patterns = set(self.DEFAULT_IGNORE_PATTERNS) - if ignore_patterns: - all_patterns.update(ignore_patterns) - - # Compile all patterns - self._ignore_regexes: list[re.Pattern[str]] = [ - re.compile(pattern) for pattern in all_patterns - ] - # Get supported extensions if supported_extensions is None: supported_extensions = frozenset(get_supported_file_extensions()) self._supported_extensions = supported_extensions - # Call parent with directory ignore list - # DefaultFilter.ignore_dirs matches directory names, not full paths + # Combine default and user patterns + all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS) + if ignore_patterns: + all_patterns.extend(ignore_patterns) + + # Combine default and user ignore_dirs + all_ignore_dirs: list[str] = list(self.DEFAULT_IGNORE_DIRS) + if ignore_dirs: + all_ignore_dirs.extend(ignore_dirs) + + # Let DefaultFilter handle all the pattern and directory filtering super().__init__( - ignore_dirs=self.DEFAULT_IGNORE_DIRS, - ignore_entity_patterns=None, - ignore_paths=None, + ignore_dirs=tuple(all_ignore_dirs), + ignore_entity_patterns=tuple(all_patterns), + ignore_paths=(), ) def __call__(self, change: Change, path: str) -> bool: @@ -264,39 +265,32 @@ class ConsumerFilter(DefaultFilter): Filter function for watchfiles. Returns True if the path should be watched, False to ignore. + + The parent DefaultFilter handles: + - Hidden files/directories (starting with .) + - Directories in ignore_dirs + - Files/directories matching ignore_entity_patterns + + We additionally filter files by extension. """ - # Let parent filter handle directory ignoring and basic checks + # Let parent filter handle directory ignoring and pattern matching if not super().__call__(change, path): return False path_obj = Path(path) - # For directories, parent filter already handled ignore_dirs + # For directories, parent filter already handled everything if path_obj.is_dir(): return True # For files, check extension - if not self._has_supported_extension(path_obj): - return False - - # Check filename against ignore patterns - return not self._matches_ignore_pattern(path_obj.name) + return self._has_supported_extension(path_obj) def _has_supported_extension(self, path: Path) -> bool: """Check if the file has a supported extension.""" suffix = path.suffix.lower() return suffix in self._supported_extensions - def _matches_ignore_pattern(self, filename: str) -> bool: - """Check if the filename matches any ignore pattern.""" - for regex in self._ignore_regexes: - if regex.match(filename): - logger.debug( - f"Filename {filename} matched ignore pattern {regex.pattern}", - ) - return True - return False - def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]: """ @@ -338,7 +332,7 @@ def _consume_file( if not filepath.is_file(): logger.debug(f"Not consuming {filepath}: not a file or doesn't exist") return - except (PermissionError, FileNotFoundError) as e: + except (PermissionError, OSError) as e: logger.warning(f"Not consuming {filepath}: {e}") return @@ -347,7 +341,7 @@ def _consume_file( if subdirs_as_tags: try: tag_ids = _tags_from_path(filepath, consumption_dir) - except Exception: # pragma: nocover + except Exception: logger.exception(f"Error creating tags from path for {filepath}") # Queue for consumption @@ -404,7 +398,7 @@ class Command(BaseCommand): # Resolve consumption directory directory = options.get("directory") if not directory: - directory = settings.CONSUMPTION_DIR + directory = getattr(settings, 
"CONSUMPTION_DIR", None) if not directory: raise CommandError("CONSUMPTION_DIR is not configured") @@ -425,13 +419,14 @@ class Command(BaseCommand): polling_interval: float = settings.CONSUMER_POLLING_INTERVAL stability_delay: float = settings.CONSUMER_STABILITY_DELAY ignore_patterns: list[str] = settings.CONSUMER_IGNORE_PATTERNS + ignore_dirs: list[str] = settings.CONSUMER_IGNORE_DIRS is_testing: bool = options.get("testing", False) is_oneshot: bool = options.get("oneshot", False) # Create filter consumer_filter = ConsumerFilter( ignore_patterns=ignore_patterns, - consumption_dir=directory, + ignore_dirs=ignore_dirs, ) # Process existing files @@ -559,10 +554,10 @@ class Command(BaseCommand): elif is_testing: # In testing, use short timeout to check stop flag timeout_ms = testing_timeout_ms - else: # pragma: nocover + else: # No pending files, wait indefinitely timeout_ms = 0 - except KeyboardInterrupt: # pragma: nocover + except KeyboardInterrupt: logger.info("Received interrupt, stopping consumer") self.stop_flag.set() diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index a9c846045..732e4dfc4 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -46,9 +46,6 @@ if TYPE_CHECKING: from pytest_mock import MockerFixture -# -- Fixtures -- - - @pytest.fixture def stability_tracker() -> FileStabilityTracker: """Create a FileStabilityTracker with a short delay for testing.""" @@ -355,6 +352,28 @@ class TestConsumerFilter: for pattern in ConsumerFilter.DEFAULT_IGNORE_PATTERNS: re.compile(pattern) + def test_custom_ignore_dirs(self, tmp_path: Path) -> None: + """Test filter respects custom ignore_dirs.""" + filter_obj = ConsumerFilter( + supported_extensions=frozenset({".pdf"}), + ignore_dirs=["custom_ignored_dir"], + ) + + # Custom ignored directory should be rejected + custom_dir = tmp_path / "custom_ignored_dir" + custom_dir.mkdir() + assert filter_obj(Change.added, str(custom_dir)) is False + + # Normal directory should be accepted + normal_dir = tmp_path / "normal_dir" + normal_dir.mkdir() + assert filter_obj(Change.added, str(normal_dir)) is True + + # Default ignored directories should still be ignored + stfolder = tmp_path / ".stfolder" + stfolder.mkdir() + assert filter_obj(Change.added, str(stfolder)) is False + class TestConsumerFilterDefaults: """Tests for ConsumerFilter with default settings.""" @@ -617,6 +636,8 @@ class ConsumerThread(Thread): def run(self) -> None: try: + # Use override_settings to avoid polluting global settings + # which would affect other tests running on the same worker with override_settings( SCRATCH_DIR=self.scratch_dir, CONSUMER_RECURSIVE=self.recursive, @@ -633,8 +654,9 @@ class ConsumerThread(Thread): except Exception as e: self.exception = e finally: - Tag.objects.all().delete() # Close database connections created in this thread + # Important: Do not perform any database operations here (like Tag cleanup) + # as they create new connections that won't be properly closed db.connections.close_all() def stop(self) -> None: @@ -672,7 +694,7 @@ def start_consumer( finally: # Cleanup all threads that were started for thread in threads: - thread.stop() + thread.stop_and_wait() failed_threads = [] for thread in threads: @@ -680,9 +702,11 @@ def start_consumer( if thread.is_alive(): failed_threads.append(thread) - # Clean up any Tags created by threads + # Clean up any Tags created by threads (they bypass test transaction isolation) 
        Tag.objects.all().delete()
+
+       db.connections.close_all()
+
        if failed_threads:
            pytest.fail(
                f"{len(failed_threads)} consumer thread(s) did not stop within timeout",
            )
@@ -799,6 +823,8 @@
             assert thread.is_alive()
         finally:
             thread.stop_and_wait(timeout=5.0)
+            # Clean up any Tags created by the thread
+            Tag.objects.all().delete()
 
         assert not thread.is_alive()
 
@@ -860,8 +886,15 @@ class TestCommandWatchRecursive:
         sample_pdf: Path,
         mock_consume_file_delay: MagicMock,
         start_consumer: Callable[..., ConsumerThread],
+        mocker: MockerFixture,
     ) -> None:
         """Test subdirs_as_tags creates tags from directory names."""
+        # Mock _tags_from_path to avoid database operations in the consumer thread
+        mock_tags = mocker.patch(
+            "documents.management.commands.document_consumer._tags_from_path",
+            return_value=[1, 2],
+        )
+
         subdir = consumption_dir / "Invoices" / "2024"
         subdir.mkdir(parents=True)
 
@@ -875,6 +908,7 @@
             raise thread.exception
 
         mock_consume_file_delay.delay.assert_called()
+        mock_tags.assert_called()
         call_args = mock_consume_file_delay.delay.call_args
         overrides = call_args[0][1]
         assert overrides.tag_ids is not None
@@ -934,3 +968,5 @@
             assert thread.is_alive()
         finally:
             thread.stop_and_wait(timeout=5.0)
+            # Clean up any Tags created by the thread
+            Tag.objects.all().delete()
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 21d494d42..6bbbb32ae 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1019,7 +1019,7 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES
 
 CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
 
-# Ignore regex patterns, relative to PAPERLESS_CONSUMPTION_DIR
+# Ignore regex patterns, matched against filename only
 CONSUMER_IGNORE_PATTERNS = list(
     json.loads(
         os.getenv(
@@ -1029,6 +1029,16 @@ CONSUMER_IGNORE_PATTERNS = list(
         ),
     ),
 )
 
+# Directories to always ignore. These are matched by directory name, not full path
+CONSUMER_IGNORE_DIRS = list(
+    json.loads(
+        os.getenv(
+            "PAPERLESS_CONSUMER_IGNORE_DIRS",
+            json.dumps([]),
+        ),
+    ),
+)
+
 CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
 
 CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(
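
Editorial note, not part of the patch: a minimal sketch of how the two separated settings interact in the reworked `ConsumerFilter`, assuming a configured paperless/Django environment with `watchfiles` installed. The extensions, patterns, directory names, and filesystem paths below are hypothetical examples chosen for illustration, not values used by the project.

```python
# Illustrative only: exercising ConsumerFilter with both new settings.
# The pattern "^temp_", the directory name "incoming", and the /consume/... paths
# are made-up examples; supported_extensions is passed explicitly so the sketch
# does not depend on get_supported_file_extensions().
from watchfiles import Change

from documents.management.commands.document_consumer import ConsumerFilter

consumer_filter = ConsumerFilter(
    supported_extensions=frozenset({".pdf"}),
    ignore_patterns=[r"^temp_"],  # would come from PAPERLESS_CONSUMER_IGNORE_PATTERNS
    ignore_dirs=["incoming"],     # would come from PAPERLESS_CONSUMER_IGNORE_DIRS
)

# Supported extension, no pattern match: watched.
assert consumer_filter(Change.added, "/consume/scan.pdf") is True
# Filename matches a user-supplied pattern: ignored.
assert consumer_filter(Change.added, "/consume/temp_scan.pdf") is False
# Unsupported extension: ignored.
assert consumer_filter(Change.added, "/consume/notes.txt") is False
# Inside a directory whose name is in ignore_dirs: ignored, with all its contents.
assert consumer_filter(Change.added, "/consume/incoming/scan.pdf") is False
```

Everything the sketch relies on (the `ConsumerFilter` keyword arguments, `Change.added`, and name-based directory matching) comes from the patch itself; the expected results mirror the behaviour described in the updated `docs/configuration.md` and the new `test_custom_ignore_dirs` test.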