From 94a2e6ff583323bc71b51ab44bc502012fc20fdc Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Mon, 12 Jan 2026 08:35:43 -0800
Subject: [PATCH] Separates out the ignore file from the ignore folder and
 updates documentation

---
 docs/configuration.md                         |  68 ++++++---
 .../management/commands/document_consumer.py  | 133 +++++++++---------
 .../tests/test_management_consumer.py         |  48 ++++++-
 src/paperless/settings.py                     |  12 +-
 4 files changed, 162 insertions(+), 99 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 68c874183..fa7d35b65 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -1168,21 +1168,44 @@ don't exist yet.
 
 #### [`PAPERLESS_CONSUMER_IGNORE_PATTERNS=`](#PAPERLESS_CONSUMER_IGNORE_PATTERNS) {#PAPERLESS_CONSUMER_IGNORE_PATTERNS}
 
-: By default, paperless ignores certain files and folders in the
-consumption directory, such as system files created by the Mac OS
-or hidden folders some tools use to store data.
+: Additional regex patterns for files to ignore in the consumption directory. Patterns are matched against filenames only (not full paths)
+using Python's `re.search()`, so anchor patterns with `^` and `$` where needed.
 
-    This can be adjusted by configuring a custom json array with
-    patterns to exclude.
+    See the [watchfiles documentation](https://watchfiles.helpmanual.io/api/filters/#watchfiles.BaseFilter.ignore_entity_patterns).
 
-    For example, `.DS_STORE/*` will ignore any files found in a folder
-    named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`
+    This setting is for additional patterns beyond the built-in defaults. Common system files and directories are already ignored automatically.
 
-    A pattern like `._*` will ignore anything starting with `._`, including:
-    `._foo.pdf` and `._bar/foo.pdf`
+    Example custom patterns:
 
-    Defaults to
-    `[".DS_Store", ".DS_STORE", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*", "Thumbs.db"]`.
+    ```json
+    ["^temp_", "\\.bak$", "^~"]
+    ```
+
+    This would ignore:
+
+    - Files starting with `temp_` (e.g., `temp_scan.pdf`)
+    - Files ending with `.bak` (e.g., `document.pdf.bak`)
+    - Files starting with `~` (e.g., `~$document.docx`)
+
+    Defaults to `[]` (empty list, uses only built-in defaults).
+
+    The default ignores are `[.DS_Store, .DS_STORE, ._*, desktop.ini, Thumbs.db]` and cannot be overridden.
+
+#### [`PAPERLESS_CONSUMER_IGNORE_DIRS=`](#PAPERLESS_CONSUMER_IGNORE_DIRS) {#PAPERLESS_CONSUMER_IGNORE_DIRS}
+
+: Additional directory names to ignore in the consumption directory. Directories matching these names (and all their contents) will be skipped.
+
+    This setting is for additional directories beyond the built-in defaults. Matching is done by directory name only, not full path.
+
+    Example:
+
+    ```json
+    ["temp", "incoming", ".hidden"]
+    ```
+
+    Defaults to `[]` (empty list, uses only built-in defaults).
+
+    The default ignored directories are `[.stfolder, .stversions, .localized, @eaDir, .Spotlight-V100, .Trashes, __MACOSX]` and cannot be overridden.
 
 #### [`PAPERLESS_CONSUMER_BARCODE_SCANNER=`](#PAPERLESS_CONSUMER_BARCODE_SCANNER) {#PAPERLESS_CONSUMER_BARCODE_SCANNER}
 
@@ -1283,23 +1306,22 @@ within your documents.
 
 #### [`PAPERLESS_CONSUMER_POLLING_INTERVAL=`](#PAPERLESS_CONSUMER_POLLING_INTERVAL) {#PAPERLESS_CONSUMER_POLLING_INTERVAL}
 
-: If paperless won't find documents added to your consume folder, it
-might not be able to automatically detect filesystem changes. In
-that case, specify a polling interval in seconds here, which will
-then cause paperless to periodically check your consumption
-directory for changes. This will also disable listening for file
-system changes with `inotify`.
+: Configures how the consumer detects new files in the consumption directory.
 
-    Defaults to 0, which disables polling and uses filesystem
-    notifications.
+    When set to `0` (default), paperless uses native filesystem notifications for efficient, immediate detection of new files.
+
+    When set to a positive number, paperless polls the consumption directory at that interval in seconds. Use polling for network filesystems (NFS, SMB/CIFS) where native notifications may not work reliably.
+
+    Defaults to 0.
 
 #### [`PAPERLESS_CONSUMER_STABILITY_DELAY=`](#PAPERLESS_CONSUMER_STABILITY_DELAY) {#PAPERLESS_CONSUMER_STABILITY_DELAY}
 
-: Once a file has been detected in the consume folder, it must remain unchanged for this
-many seconds before consumption will start on it. If the file is modified, its size changes
-or the watching detects any other change on it, the timer will restart.
+: Sets the time in seconds that a file must remain unchanged (same size and modification time) before paperless will begin consuming it.
 
-    Defaults to 5.
+    Increase this value if you experience issues with files being consumed before they are fully written, particularly on slower network storage or
+    with certain scanner quirks.
+
+    Defaults to 5.0 seconds.
 
 ## Workflow webhooks
 
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index 82e0d2ead..8f815c6d7 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -9,7 +9,6 @@ native OS notifications and polling fallback.
 from __future__ import annotations
 
 import logging
-import re
 from dataclasses import dataclass
 from pathlib import Path
 from threading import Event
@@ -57,7 +56,7 @@ class TrackedFile:
             self.last_mtime = stat.st_mtime
             self.last_size = stat.st_size
             return True
-        except (FileNotFoundError, PermissionError):
+        except (FileNotFoundError, PermissionError, OSError):
             return False
 
     def is_unchanged(self) -> bool:
@@ -68,7 +67,7 @@
         try:
             stat = self.path.stat()
             return stat.st_mtime == self.last_mtime and stat.st_size == self.last_size
-        except (FileNotFoundError, PermissionError):
+        except (FileNotFoundError, PermissionError, OSError):
             return False
 
 
@@ -138,7 +137,7 @@ class FileStabilityTracker:
         to_remove: list[Path] = []
         to_yield: list[Path] = []
 
-        for path, tracked in self._tracked.items():
+        for path, tracked in list(self._tracked.items()):
             time_since_event = current_time - tracked.last_event_time
 
             if time_since_event < self.stability_delay:
@@ -165,7 +164,7 @@
                 # Not a regular file (directory, symlink, etc.)
                 to_remove.append(path)
                 logger.debug(f"Path is not a regular file: {path}")
-            except (PermissionError, FileNotFoundError) as e:
+            except (PermissionError, OSError) as e:
                 logger.warning(f"Cannot access {path}: {e}")
                 to_remove.append(path)
 
@@ -190,34 +189,37 @@
 
 class ConsumerFilter(DefaultFilter):
     """
-    Custom filter for the document consumer.
+    Filter for watchfiles that accepts only supported document types
+    and ignores system files/directories.
 
-    Filters files based on:
-    - Supported file extensions
-    - User-configured ignore patterns (regex)
-    - Default ignore patterns for common system files
+    Extends DefaultFilter leveraging its built-in filtering:
+    - `ignore_dirs`: Directory names to ignore (and all their contents)
+    - `ignore_entity_patterns`: Regex patterns matched against filename/dirname only
+
+    We add custom logic for file extension filtering (only accept supported
+    document types), which the library doesn't provide.
     """
 
-    # Default regex patterns to ignore (matched against filename only)
-    DEFAULT_IGNORE_PATTERNS: Final[frozenset[str]] = frozenset(
-        {
-            r"^\.DS_Store$",
-            r"^\.DS_STORE$",
-            r"^\._.*",
-            r"^desktop\.ini$",
-            r"^Thumbs\.db$",
-        },
+    # Regex patterns for files to always ignore (matched against filename only)
+    # These are passed to DefaultFilter.ignore_entity_patterns
+    DEFAULT_IGNORE_PATTERNS: Final[tuple[str, ...]] = (
+        r"^\.DS_Store$",
+        r"^\.DS_STORE$",
+        r"^\._.*",
+        r"^desktop\.ini$",
+        r"^Thumbs\.db$",
     )
 
-    # Directories to always ignore (matched by name via DefaultFilter)
+    # Directories to always ignore (passed to DefaultFilter.ignore_dirs)
+    # These are matched by directory name, not full path
     DEFAULT_IGNORE_DIRS: Final[tuple[str, ...]] = (
-        ".stfolder",
-        ".stversions",
-        ".localized",
-        "@eaDir",
-        ".Spotlight-V100",
-        ".Trashes",
-        "__MACOSX",
+        ".stfolder",  # Syncthing
+        ".stversions",  # Syncthing
+        ".localized",  # macOS
+        "@eaDir",  # Synology NAS
+        ".Spotlight-V100",  # macOS
+        ".Trashes",  # macOS
+        "__MACOSX",  # macOS archive artifacts
     )
 
     def __init__(
@@ -225,38 +227,37 @@ class ConsumerFilter(DefaultFilter):
         *,
         supported_extensions: frozenset[str] | None = None,
         ignore_patterns: list[str] | None = None,
-        consumption_dir: Path | None = None,
+        ignore_dirs: list[str] | None = None,
     ) -> None:
         """
         Initialize the consumer filter.
 
         Args:
-            supported_extensions: Set of supported file extensions (e.g., {".pdf", ".png"}).
-            If None, uses get_supported_file_extensions().
+            supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
+                If None, uses get_supported_file_extensions().
             ignore_patterns: Additional regex patterns to ignore (matched against filename).
-            consumption_dir: Base consumption directory (unused, kept for API compatibility).
+            ignore_dirs: Additional directory names to ignore (merged with defaults).
""" - # Combine default and user patterns - all_patterns = set(self.DEFAULT_IGNORE_PATTERNS) - if ignore_patterns: - all_patterns.update(ignore_patterns) - - # Compile all patterns - self._ignore_regexes: list[re.Pattern[str]] = [ - re.compile(pattern) for pattern in all_patterns - ] - # Get supported extensions if supported_extensions is None: supported_extensions = frozenset(get_supported_file_extensions()) self._supported_extensions = supported_extensions - # Call parent with directory ignore list - # DefaultFilter.ignore_dirs matches directory names, not full paths + # Combine default and user patterns + all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS) + if ignore_patterns: + all_patterns.extend(ignore_patterns) + + # Combine default and user ignore_dirs + all_ignore_dirs: list[str] = list(self.DEFAULT_IGNORE_DIRS) + if ignore_dirs: + all_ignore_dirs.extend(ignore_dirs) + + # Let DefaultFilter handle all the pattern and directory filtering super().__init__( - ignore_dirs=self.DEFAULT_IGNORE_DIRS, - ignore_entity_patterns=None, - ignore_paths=None, + ignore_dirs=tuple(all_ignore_dirs), + ignore_entity_patterns=tuple(all_patterns), + ignore_paths=(), ) def __call__(self, change: Change, path: str) -> bool: @@ -264,39 +265,32 @@ class ConsumerFilter(DefaultFilter): Filter function for watchfiles. Returns True if the path should be watched, False to ignore. + + The parent DefaultFilter handles: + - Hidden files/directories (starting with .) + - Directories in ignore_dirs + - Files/directories matching ignore_entity_patterns + + We additionally filter files by extension. """ - # Let parent filter handle directory ignoring and basic checks + # Let parent filter handle directory ignoring and pattern matching if not super().__call__(change, path): return False path_obj = Path(path) - # For directories, parent filter already handled ignore_dirs + # For directories, parent filter already handled everything if path_obj.is_dir(): return True # For files, check extension - if not self._has_supported_extension(path_obj): - return False - - # Check filename against ignore patterns - return not self._matches_ignore_pattern(path_obj.name) + return self._has_supported_extension(path_obj) def _has_supported_extension(self, path: Path) -> bool: """Check if the file has a supported extension.""" suffix = path.suffix.lower() return suffix in self._supported_extensions - def _matches_ignore_pattern(self, filename: str) -> bool: - """Check if the filename matches any ignore pattern.""" - for regex in self._ignore_regexes: - if regex.match(filename): - logger.debug( - f"Filename {filename} matched ignore pattern {regex.pattern}", - ) - return True - return False - def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]: """ @@ -338,7 +332,7 @@ def _consume_file( if not filepath.is_file(): logger.debug(f"Not consuming {filepath}: not a file or doesn't exist") return - except (PermissionError, FileNotFoundError) as e: + except (PermissionError, OSError) as e: logger.warning(f"Not consuming {filepath}: {e}") return @@ -347,7 +341,7 @@ def _consume_file( if subdirs_as_tags: try: tag_ids = _tags_from_path(filepath, consumption_dir) - except Exception: # pragma: nocover + except Exception: logger.exception(f"Error creating tags from path for {filepath}") # Queue for consumption @@ -404,7 +398,7 @@ class Command(BaseCommand): # Resolve consumption directory directory = options.get("directory") if not directory: - directory = settings.CONSUMPTION_DIR + directory = getattr(settings, 
"CONSUMPTION_DIR", None) if not directory: raise CommandError("CONSUMPTION_DIR is not configured") @@ -425,13 +419,14 @@ class Command(BaseCommand): polling_interval: float = settings.CONSUMER_POLLING_INTERVAL stability_delay: float = settings.CONSUMER_STABILITY_DELAY ignore_patterns: list[str] = settings.CONSUMER_IGNORE_PATTERNS + ignore_dirs: list[str] = settings.CONSUMER_IGNORE_DIRS is_testing: bool = options.get("testing", False) is_oneshot: bool = options.get("oneshot", False) # Create filter consumer_filter = ConsumerFilter( ignore_patterns=ignore_patterns, - consumption_dir=directory, + ignore_dirs=ignore_dirs, ) # Process existing files @@ -559,10 +554,10 @@ class Command(BaseCommand): elif is_testing: # In testing, use short timeout to check stop flag timeout_ms = testing_timeout_ms - else: # pragma: nocover + else: # No pending files, wait indefinitely timeout_ms = 0 - except KeyboardInterrupt: # pragma: nocover + except KeyboardInterrupt: logger.info("Received interrupt, stopping consumer") self.stop_flag.set() diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index a9c846045..732e4dfc4 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -46,9 +46,6 @@ if TYPE_CHECKING: from pytest_mock import MockerFixture -# -- Fixtures -- - - @pytest.fixture def stability_tracker() -> FileStabilityTracker: """Create a FileStabilityTracker with a short delay for testing.""" @@ -355,6 +352,28 @@ class TestConsumerFilter: for pattern in ConsumerFilter.DEFAULT_IGNORE_PATTERNS: re.compile(pattern) + def test_custom_ignore_dirs(self, tmp_path: Path) -> None: + """Test filter respects custom ignore_dirs.""" + filter_obj = ConsumerFilter( + supported_extensions=frozenset({".pdf"}), + ignore_dirs=["custom_ignored_dir"], + ) + + # Custom ignored directory should be rejected + custom_dir = tmp_path / "custom_ignored_dir" + custom_dir.mkdir() + assert filter_obj(Change.added, str(custom_dir)) is False + + # Normal directory should be accepted + normal_dir = tmp_path / "normal_dir" + normal_dir.mkdir() + assert filter_obj(Change.added, str(normal_dir)) is True + + # Default ignored directories should still be ignored + stfolder = tmp_path / ".stfolder" + stfolder.mkdir() + assert filter_obj(Change.added, str(stfolder)) is False + class TestConsumerFilterDefaults: """Tests for ConsumerFilter with default settings.""" @@ -617,6 +636,8 @@ class ConsumerThread(Thread): def run(self) -> None: try: + # Use override_settings to avoid polluting global settings + # which would affect other tests running on the same worker with override_settings( SCRATCH_DIR=self.scratch_dir, CONSUMER_RECURSIVE=self.recursive, @@ -633,8 +654,9 @@ class ConsumerThread(Thread): except Exception as e: self.exception = e finally: - Tag.objects.all().delete() # Close database connections created in this thread + # Important: Do not perform any database operations here (like Tag cleanup) + # as they create new connections that won't be properly closed db.connections.close_all() def stop(self) -> None: @@ -672,7 +694,7 @@ def start_consumer( finally: # Cleanup all threads that were started for thread in threads: - thread.stop() + thread.stop_and_wait() failed_threads = [] for thread in threads: @@ -680,9 +702,11 @@ def start_consumer( if thread.is_alive(): failed_threads.append(thread) - # Clean up any Tags created by threads + # Clean up any Tags created by threads (they bypass test transaction isolation) 
        Tag.objects.all().delete()
+
+       db.connections.close_all()
+
        if failed_threads:
            pytest.fail(
                f"{len(failed_threads)} consumer thread(s) did not stop within timeout",
            )
@@ -799,6 +823,8 @@
             assert thread.is_alive()
         finally:
             thread.stop_and_wait(timeout=5.0)
+            # Clean up any Tags created by the thread
+            Tag.objects.all().delete()
 
         assert not thread.is_alive()
 
@@ -860,8 +886,15 @@ class TestCommandWatchRecursive:
         sample_pdf: Path,
         mock_consume_file_delay: MagicMock,
         start_consumer: Callable[..., ConsumerThread],
+        mocker: MockerFixture,
     ) -> None:
         """Test subdirs_as_tags creates tags from directory names."""
+        # Mock _tags_from_path to avoid database operations in the consumer thread
+        mock_tags = mocker.patch(
+            "documents.management.commands.document_consumer._tags_from_path",
+            return_value=[1, 2],
+        )
+
         subdir = consumption_dir / "Invoices" / "2024"
         subdir.mkdir(parents=True)
 
@@ -875,6 +908,7 @@
             raise thread.exception
 
         mock_consume_file_delay.delay.assert_called()
+        mock_tags.assert_called()
         call_args = mock_consume_file_delay.delay.call_args
         overrides = call_args[0][1]
         assert overrides.tag_ids is not None
@@ -934,3 +968,5 @@
             assert thread.is_alive()
         finally:
             thread.stop_and_wait(timeout=5.0)
+            # Clean up any Tags created by the thread
+            Tag.objects.all().delete()
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 21d494d42..6bbbb32ae 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1019,7 +1019,7 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES
 
 CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
 
-# Ignore regex patterns, relative to PAPERLESS_CONSUMPTION_DIR
+# Ignore regex patterns, matched against filename only
 CONSUMER_IGNORE_PATTERNS = list(
     json.loads(
         os.getenv(
@@ -1029,6 +1029,16 @@ CONSUMER_IGNORE_PATTERNS = list(
         ),
     ),
 )
 
+# Directories to always ignore. These are matched by directory name, not full path
+CONSUMER_IGNORE_DIRS = list(
+    json.loads(
+        os.getenv(
+            "PAPERLESS_CONSUMER_IGNORE_DIRS",
+            json.dumps([]),
+        ),
+    ),
+)
+
 CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
 
 CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(
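
Editorial note, not part of the patch: a minimal sketch of how the two separated settings interact in the reworked `ConsumerFilter`, assuming a configured paperless/Django environment with `watchfiles` installed. The extensions, patterns, directory names, and filesystem paths below are hypothetical examples chosen for illustration, not values used by the project.

```python
# Illustrative only: exercising ConsumerFilter with both new settings.
# The pattern "^temp_", the directory name "incoming", and the /consume/... paths
# are made-up examples; supported_extensions is passed explicitly so the sketch
# does not depend on get_supported_file_extensions().
from watchfiles import Change

from documents.management.commands.document_consumer import ConsumerFilter

consumer_filter = ConsumerFilter(
    supported_extensions=frozenset({".pdf"}),
    ignore_patterns=[r"^temp_"],  # would come from PAPERLESS_CONSUMER_IGNORE_PATTERNS
    ignore_dirs=["incoming"],     # would come from PAPERLESS_CONSUMER_IGNORE_DIRS
)

# Supported extension, no pattern match: watched.
assert consumer_filter(Change.added, "/consume/scan.pdf") is True
# Filename matches a user-supplied pattern: ignored.
assert consumer_filter(Change.added, "/consume/temp_scan.pdf") is False
# Unsupported extension: ignored.
assert consumer_filter(Change.added, "/consume/notes.txt") is False
# Inside a directory whose name is in ignore_dirs: ignored, with all its contents.
assert consumer_filter(Change.added, "/consume/incoming/scan.pdf") is False
```

Everything the sketch relies on (the `ConsumerFilter` keyword arguments, `Change.added`, and name-based directory matching) comes from the patch itself; the expected results mirror the behaviour described in the updated `docs/configuration.md` and the new `test_custom_ignore_dirs` test.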