Separates out the ignore file from the ignore folder and updates documentation

2026-01-12 21:44:21 -06:00 · 2026-01-12 08:35:43 -08:00
parent d45826eaa2
commit 94a2e6ff58
4 changed files with 162 additions and 99 deletions
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1168,21 +1168,44 @@ don't exist yet.
 #### [`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`](#PAPERLESS_CONSUMER_IGNORE_PATTERNS) {#PAPERLESS_CONSUMER_IGNORE_PATTERNS}
-: By default, paperless ignores certain files and folders in the
+: Additional regex patterns for files to ignore in the consumption directory. Patterns are matched against filenames only (not full paths)
-consumption directory, such as system files created by the Mac OS
+using regular-expression search, so a pattern can match anywhere in the filename unless it is anchored with `^` or `$`.
 or hidden folders some tools use to store data.
-    This can be adjusted by configuring a custom json array with
+    See the [watchfiles documentation](https://watchfiles.helpmanual.io/api/filters/#watchfiles.BaseFilter.ignore_entity_patterns)
    patterns to exclude.
-    For example, `.DS_STORE/*` will ignore any files found in a folder
+    This setting is for additional patterns beyond the built-in defaults. Common system files and directories are already ignored automatically.
    named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`
-    A pattern like `._*` will ignore anything starting with `._`, including:
+    Example custom patterns:
    `._foo.pdf` and `._bar/foo.pdf`
-    Defaults to
+    ```json
-    `[".DS_Store", ".DS_STORE", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*", "Thumbs.db"]`.
+    ["^temp_", "\\.bak$", "^~"]
    ```
    This would ignore:
    - Files starting with `temp_` (e.g., `temp_scan.pdf`)
    - Files ending with `.bak` (e.g., `document.pdf.bak`)
    - Files starting with `~` (e.g., `~$document.docx`)
    Defaults to `[]` (empty list, uses only built-in defaults).
    The default ignores are `[.DS_Store, .DS_STORE, ._*, desktop.ini, Thumbs.db]` and cannot be overridden.
 #### [`PAPERLESS_CONSUMER_IGNORE_DIRS=<json>`](#PAPERLESS_CONSUMER_IGNORE_DIRS) {#PAPERLESS_CONSUMER_IGNORE_DIRS}
 : Additional directory names to ignore in the consumption directory. Directories matching these names (and all their contents) will be skipped.
    This setting is for additional directories beyond the built-in defaults. Matching is done by directory name only, not full path.
    Example:
    ```json
    ["temp", "incoming", ".hidden"]
    ```
    Defaults to `[]` (empty list, uses only built-in defaults).
    The default ignored directories are `[.stfolder, .stversions, .localized, @eaDir, .Spotlight-V100, .Trashes, __MACOSX]` and cannot be overridden.
 #### [`PAPERLESS_CONSUMER_BARCODE_SCANNER=<string>`](#PAPERLESS_CONSUMER_BARCODE_SCANNER) {#PAPERLESS_CONSUMER_BARCODE_SCANNER}
@@ -1283,23 +1306,22 @@ within your documents.
 #### [`PAPERLESS_CONSUMER_POLLING_INTERVAL=<num>`](#PAPERLESS_CONSUMER_POLLING_INTERVAL) {#PAPERLESS_CONSUMER_POLLING_INTERVAL}
-: If paperless won't find documents added to your consume folder, it
+: Configures how the consumer detects new files in the consumption directory.
 might not be able to automatically detect filesystem changes. In
 that case, specify a polling interval in seconds here, which will
 then cause paperless to periodically check your consumption
 directory for changes. This will also disable listening for file
 system changes with `inotify`.
-    Defaults to 0, which disables polling and uses filesystem
+    When set to `0` (default), paperless uses native filesystem notifications for efficient, immediate detection of new files.
-    notifications.
+
    When set to a positive number, paperless polls the consumption directory at that interval in seconds. Use polling for network filesystems (NFS, SMB/CIFS) where native notifications may not work reliably.
    Defaults to 0.
 #### [`PAPERLESS_CONSUMER_STABILITY_DELAY=<num>`](#PAPERLESS_CONSUMER_STABILITY_DELAY) {#PAPERLESS_CONSUMER_STABILITY_DELAY}
-: Once a file has been detected in the consume folder, it must remain unchanged for this
+: Sets the time in seconds that a file must remain unchanged (same size and modification time) before paperless will begin consuming it.
 many seconds before consumption will start on it. If the file is modified, its size changes
 or the watching detects any other change on it, the timer will restart.
-    Defaults to 5.
+    Increase this value if you experience issues with files being consumed before they are fully written, particularly on slower network storage or
    with certain scanner quirks.
    Defaults to 5.0 seconds.
 ## Workflow webhooks
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -9,7 +9,6 @@ native OS notifications and polling fallback.
 from __future__ import annotations
 import logging
 import re
 from dataclasses import dataclass
 from pathlib import Path
 from threading import Event
@@ -57,7 +56,7 @@ class TrackedFile:
            self.last_mtime = stat.st_mtime
            self.last_size = stat.st_size
            return True
-        except (FileNotFoundError, PermissionError):
+        except (FileNotFoundError, PermissionError, OSError):
            return False
    def is_unchanged(self) -> bool:
@@ -68,7 +67,7 @@ class TrackedFile:
        try:
            stat = self.path.stat()
            return stat.st_mtime == self.last_mtime and stat.st_size == self.last_size
-        except (FileNotFoundError, PermissionError):
+        except (FileNotFoundError, PermissionError, OSError):
            return False
@@ -138,7 +137,7 @@ class FileStabilityTracker:
        to_remove: list[Path] = []
        to_yield: list[Path] = []
-        for path, tracked in self._tracked.items():
+        for path, tracked in list(self._tracked.items()):
            time_since_event = current_time - tracked.last_event_time
            if time_since_event < self.stability_delay:
@@ -165,7 +164,7 @@ class FileStabilityTracker:
                    # Not a regular file (directory, symlink, etc.)
                    to_remove.append(path)
                    logger.debug(f"Path is not a regular file: {path}")
-            except (PermissionError, FileNotFoundError) as e:
+            except (PermissionError, OSError) as e:
                logger.warning(f"Cannot access {path}: {e}")
                to_remove.append(path)
@@ -190,34 +189,37 @@ class FileStabilityTracker:
 class ConsumerFilter(DefaultFilter):
    """
-    Custom filter for the document consumer.
+    Filter for watchfiles that accepts only supported document types
    and ignores system files/directories.
-    Filters files based on:
+    Extends DefaultFilter leveraging its built-in filtering:
-    - Supported file extensions
+    - `ignore_dirs`: Directory names to ignore (and all their contents)
-    - User-configured ignore patterns (regex)
+    - `ignore_entity_patterns`: Regex patterns matched against filename/dirname only
-    - Default ignore patterns for common system files
+
    We add custom logic for file extension filtering (only accept supported
    document types), which the library doesn't provide.
    """
-    # Default regex patterns to ignore (matched against filename only)
+    # Regex patterns for files to always ignore (matched against filename only)
-    DEFAULT_IGNORE_PATTERNS: Final[frozenset[str]] = frozenset(
+    # These are passed to DefaultFilter.ignore_entity_patterns
-        {
+    DEFAULT_IGNORE_PATTERNS: Final[tuple[str, ...]] = (
-            r"^\.DS_Store$",
+        r"^\.DS_Store$",
-            r"^\.DS_STORE$",
+        r"^\.DS_STORE$",
-            r"^\._.*",
+        r"^\._.*",
-            r"^desktop\.ini$",
+        r"^desktop\.ini$",
-            r"^Thumbs\.db$",
+        r"^Thumbs\.db$",
        },
    )
-    # Directories to always ignore (matched by name via DefaultFilter)
+    # Directories to always ignore (passed to DefaultFilter.ignore_dirs)
    # These are matched by directory name, not full path
    DEFAULT_IGNORE_DIRS: Final[tuple[str, ...]] = (
-        ".stfolder",
+        ".stfolder",  # Syncthing
-        ".stversions",
+        ".stversions",  # Syncthing
-        ".localized",
+        ".localized",  # macOS
-        "@eaDir",
+        "@eaDir",  # Synology NAS
-        ".Spotlight-V100",
+        ".Spotlight-V100",  # macOS
-        ".Trashes",
+        ".Trashes",  # macOS
-        "__MACOSX",
+        "__MACOSX",  # macOS archive artifacts
    )
    def __init__(
@@ -225,38 +227,37 @@ class ConsumerFilter(DefaultFilter):
        *,
        supported_extensions: frozenset[str] | None = None,
        ignore_patterns: list[str] | None = None,
-        consumption_dir: Path | None = None,
+        ignore_dirs: list[str] | None = None,
    ) -> None:
        """
        Initialize the consumer filter.
        Args:
-            supported_extensions: Set of supported file extensions (e.g., {".pdf", ".png"}).
+            supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
-                                If None, uses get_supported_file_extensions().
+                If None, uses get_supported_file_extensions().
            ignore_patterns: Additional regex patterns to ignore (matched against filename).
-            consumption_dir: Base consumption directory (unused, kept for API compatibility).
+            ignore_dirs: Additional directory names to ignore (merged with defaults).
        """
        # Combine default and user patterns
        all_patterns = set(self.DEFAULT_IGNORE_PATTERNS)
        if ignore_patterns:
            all_patterns.update(ignore_patterns)
        # Compile all patterns
        self._ignore_regexes: list[re.Pattern[str]] = [
            re.compile(pattern) for pattern in all_patterns
        ]
        # Get supported extensions
        if supported_extensions is None:
            supported_extensions = frozenset(get_supported_file_extensions())
        self._supported_extensions = supported_extensions
-        # Call parent with directory ignore list
+        # Combine default and user patterns
-        # DefaultFilter.ignore_dirs matches directory names, not full paths
+        all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS)
        if ignore_patterns:
            all_patterns.extend(ignore_patterns)
        # Combine default and user ignore_dirs
        all_ignore_dirs: list[str] = list(self.DEFAULT_IGNORE_DIRS)
        if ignore_dirs:
            all_ignore_dirs.extend(ignore_dirs)
        # Let DefaultFilter handle all the pattern and directory filtering
        super().__init__(
-            ignore_dirs=self.DEFAULT_IGNORE_DIRS,
+            ignore_dirs=tuple(all_ignore_dirs),
-            ignore_entity_patterns=None,
+            ignore_entity_patterns=tuple(all_patterns),
-            ignore_paths=None,
+            ignore_paths=(),
        )
    def __call__(self, change: Change, path: str) -> bool:
@@ -264,39 +265,32 @@ class ConsumerFilter(DefaultFilter):
        Filter function for watchfiles.
        Returns True if the path should be watched, False to ignore.
        The parent DefaultFilter handles:
        - Hidden files/directories (starting with .)
        - Directories in ignore_dirs
        - Files/directories matching ignore_entity_patterns
        We additionally filter files by extension.
        """
-        # Let parent filter handle directory ignoring and basic checks
+        # Let parent filter handle directory ignoring and pattern matching
        if not super().__call__(change, path):
            return False
        path_obj = Path(path)
-        # For directories, parent filter already handled ignore_dirs
+        # For directories, parent filter already handled everything
        if path_obj.is_dir():
            return True
        # For files, check extension
-        if not self._has_supported_extension(path_obj):
+        return self._has_supported_extension(path_obj)
            return False
        # Check filename against ignore patterns
        return not self._matches_ignore_pattern(path_obj.name)
    def _has_supported_extension(self, path: Path) -> bool:
        """Check if the file has a supported extension."""
        suffix = path.suffix.lower()
        return suffix in self._supported_extensions
    def _matches_ignore_pattern(self, filename: str) -> bool:
        """Check if the filename matches any ignore pattern."""
        for regex in self._ignore_regexes:
            if regex.match(filename):
                logger.debug(
                    f"Filename {filename} matched ignore pattern {regex.pattern}",
                )
                return True
        return False
 def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
    """
@@ -338,7 +332,7 @@ def _consume_file(
        if not filepath.is_file():
            logger.debug(f"Not consuming {filepath}: not a file or doesn't exist")
            return
-    except (PermissionError, FileNotFoundError) as e:
+    except (PermissionError, OSError) as e:
        logger.warning(f"Not consuming {filepath}: {e}")
        return
@@ -347,7 +341,7 @@ def _consume_file(
    if subdirs_as_tags:
        try:
            tag_ids = _tags_from_path(filepath, consumption_dir)
-        except Exception:  # pragma: nocover
+        except Exception:
            logger.exception(f"Error creating tags from path for {filepath}")
    # Queue for consumption
@@ -404,7 +398,7 @@ class Command(BaseCommand):
        # Resolve consumption directory
        directory = options.get("directory")
        if not directory:
-            directory = settings.CONSUMPTION_DIR
+            directory = getattr(settings, "CONSUMPTION_DIR", None)
        if not directory:
            raise CommandError("CONSUMPTION_DIR is not configured")
@@ -425,13 +419,14 @@ class Command(BaseCommand):
        polling_interval: float = settings.CONSUMER_POLLING_INTERVAL
        stability_delay: float = settings.CONSUMER_STABILITY_DELAY
        ignore_patterns: list[str] = settings.CONSUMER_IGNORE_PATTERNS
        ignore_dirs: list[str] = settings.CONSUMER_IGNORE_DIRS
        is_testing: bool = options.get("testing", False)
        is_oneshot: bool = options.get("oneshot", False)
        # Create filter
        consumer_filter = ConsumerFilter(
            ignore_patterns=ignore_patterns,
-            consumption_dir=directory,
+            ignore_dirs=ignore_dirs,
        )
        # Process existing files
@@ -559,10 +554,10 @@ class Command(BaseCommand):
                elif is_testing:
                    # In testing, use short timeout to check stop flag
                    timeout_ms = testing_timeout_ms
-                else:  # pragma: nocover
+                else:
                    # No pending files, wait indefinitely
                    timeout_ms = 0
-            except KeyboardInterrupt:  # pragma: nocover
+            except KeyboardInterrupt:
                logger.info("Received interrupt, stopping consumer")
                self.stop_flag.set()
--- a/src/documents/tests/test_management_consumer.py
+++ b/src/documents/tests/test_management_consumer.py
@@ -46,9 +46,6 @@ if TYPE_CHECKING:
    from pytest_mock import MockerFixture
 # -- Fixtures --
@pytest.fixture
 def stability_tracker() -> FileStabilityTracker:
    """Create a FileStabilityTracker with a short delay for testing."""
@@ -355,6 +352,28 @@ class TestConsumerFilter:
        for pattern in ConsumerFilter.DEFAULT_IGNORE_PATTERNS:
            re.compile(pattern)
    def test_custom_ignore_dirs(self, tmp_path: Path) -> None:
        """Test filter respects custom ignore_dirs."""
        filter_obj = ConsumerFilter(
            supported_extensions=frozenset({".pdf"}),
            ignore_dirs=["custom_ignored_dir"],
        )
        # Custom ignored directory should be rejected
        custom_dir = tmp_path / "custom_ignored_dir"
        custom_dir.mkdir()
        assert filter_obj(Change.added, str(custom_dir)) is False
        # Normal directory should be accepted
        normal_dir = tmp_path / "normal_dir"
        normal_dir.mkdir()
        assert filter_obj(Change.added, str(normal_dir)) is True
        # Default ignored directories should still be ignored
        stfolder = tmp_path / ".stfolder"
        stfolder.mkdir()
        assert filter_obj(Change.added, str(stfolder)) is False
 class TestConsumerFilterDefaults:
    """Tests for ConsumerFilter with default settings."""
@@ -617,6 +636,8 @@ class ConsumerThread(Thread):
    def run(self) -> None:
        try:
            # Use override_settings to avoid polluting global settings
            # which would affect other tests running on the same worker
            with override_settings(
                SCRATCH_DIR=self.scratch_dir,
                CONSUMER_RECURSIVE=self.recursive,
@@ -633,8 +654,9 @@ class ConsumerThread(Thread):
        except Exception as e:
            self.exception = e
        finally:
            Tag.objects.all().delete()
            # Close database connections created in this thread
            # Important: Do not perform any database operations here (like Tag cleanup)
            # as they create new connections that won't be properly closed
            db.connections.close_all()
    def stop(self) -> None:
@@ -672,7 +694,7 @@ def start_consumer(
    finally:
        # Cleanup all threads that were started
        for thread in threads:
-            thread.stop()
+            thread.stop_and_wait()
        failed_threads = []
        for thread in threads:
@@ -680,9 +702,11 @@ def start_consumer(
            if thread.is_alive():
                failed_threads.append(thread)
-        # Clean up any Tags created by threads
+        # Clean up any Tags created by threads (they bypass test transaction isolation)
        Tag.objects.all().delete()
        db.connections.close_all()
        if failed_threads:
            pytest.fail(
                f"{len(failed_threads)} consumer thread(s) did not stop within timeout",
@@ -799,6 +823,8 @@ class TestCommandWatch:
            assert thread.is_alive()
        finally:
            thread.stop_and_wait(timeout=5.0)
            # Clean up any Tags created by the thread
            Tag.objects.all().delete()
        assert not thread.is_alive()
@@ -860,8 +886,15 @@ class TestCommandWatchRecursive:
        sample_pdf: Path,
        mock_consume_file_delay: MagicMock,
        start_consumer: Callable[..., ConsumerThread],
        mocker: MockerFixture,
    ) -> None:
        """Test subdirs_as_tags creates tags from directory names."""
        # Mock _tags_from_path to avoid database operations in the consumer thread
        mock_tags = mocker.patch(
            "documents.management.commands.document_consumer._tags_from_path",
            return_value=[1, 2],
        )
        subdir = consumption_dir / "Invoices" / "2024"
        subdir.mkdir(parents=True)
@@ -875,6 +908,7 @@ class TestCommandWatchRecursive:
            raise thread.exception
        mock_consume_file_delay.delay.assert_called()
        mock_tags.assert_called()
        call_args = mock_consume_file_delay.delay.call_args
        overrides = call_args[0][1]
        assert overrides.tag_ids is not None
@@ -934,3 +968,5 @@ class TestCommandWatchEdgeCases:
            assert thread.is_alive()
        finally:
            thread.stop_and_wait(timeout=5.0)
            # Clean up any Tags created by the thread
            Tag.objects.all().delete()
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1019,7 +1019,7 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES
 CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
-# Ignore regex patterns, relative to PAPERLESS_CONSUMPTION_DIR
+# Ignore regex patterns, matched against filename only
 CONSUMER_IGNORE_PATTERNS = list(
    json.loads(
        os.getenv(
@@ -1029,6 +1029,16 @@ CONSUMER_IGNORE_PATTERNS = list(
    ),
 )
 # Additional directory names to ignore, merged with the built-in defaults.  These are matched by directory name, not full path
 CONSUMER_IGNORE_DIRS = list(
    json.loads(
        os.getenv(
            "PAPERLESS_CONSUMER_IGNORE_DIRS",
            json.dumps([]),
        ),
    ),
 )
 CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
 CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(