Separates out the ignore file from the ignore folder and updates documentation

This commit is contained in:
Trenton H
2026-01-12 08:35:43 -08:00
parent d45826eaa2
commit 94a2e6ff58
4 changed files with 162 additions and 99 deletions

View File

@@ -1168,21 +1168,44 @@ don't exist yet.
#### [`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`](#PAPERLESS_CONSUMER_IGNORE_PATTERNS) {#PAPERLESS_CONSUMER_IGNORE_PATTERNS}
: Additional regex patterns for files to ignore in the consumption directory. Patterns are matched against the file name only (not the full path). A pattern may match anywhere in the name, so anchor it with `^` and/or `$` when it should only match the start or end of the name.
This setting is for additional patterns beyond the built-in defaults. Common system files and directories are already ignored automatically.
Example custom patterns:
```json
["^temp_", "\\.bak$", "^~"]
```
This would ignore:
- Files starting with `temp_` (e.g., `temp_scan.pdf`)
- Files ending with `.bak` (e.g., `document.pdf.bak`)
- Files starting with `~` (e.g., `~$document.docx`)
Defaults to `[]` (empty list, uses only built-in defaults).
The default ignores are `[.DS_Store, .DS_STORE, ._*, desktop.ini, Thumbs.db]` and cannot be overridden.
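To make the matching rule concrete, here is a minimal, self-contained sketch of the behaviour described above (it is not the consumer's actual code path, and the paths are made up):
```python
import re
from pathlib import Path

# The example patterns from above; matching is done against the file name only.
patterns = [re.compile(p) for p in ["^temp_", r"\.bak$", "^~"]]

def is_ignored(path: str) -> bool:
    name = Path(path).name  # file name only, never the full path
    return any(p.search(name) for p in patterns)

assert is_ignored("/consume/temp_scan.pdf")        # starts with temp_
assert is_ignored("/consume/in/document.pdf.bak")  # ends with .bak
assert not is_ignored("/consume/temp/scan.pdf")    # "temp" is in the path, not the file name
```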
#### [`PAPERLESS_CONSUMER_IGNORE_DIRS=<json>`](#PAPERLESS_CONSUMER_IGNORE_DIRS) {#PAPERLESS_CONSUMER_IGNORE_DIRS}
: Additional directory names to ignore in the consumption directory. Directories matching these names (and all their contents) will be skipped.
This setting is for additional directories beyond the built-in defaults. Matching is done by directory name only, not full path.
Example:
```json
["temp", "incoming", ".hidden"]
```
Defaults to `[]` (empty list, uses only built-in defaults).
The default ignored directories are `[.stfolder, .stversions, .localized, @eaDir, .Spotlight-V100, .Trashes, __MACOSX]` and cannot be overridden.
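A rough sketch of name-based directory matching (illustrative only, with made-up paths): an entry is skipped if any component of its path equals one of the configured names.
```python
from pathlib import PurePath

ignore_dirs = {"temp", "incoming", ".hidden"}

def in_ignored_dir(path: str) -> bool:
    # Name-based matching: any path component equal to an ignored name excludes the entry.
    return any(part in ignore_dirs for part in PurePath(path).parts)

assert in_ignored_dir("/consume/incoming/scan.pdf")
assert in_ignored_dir("/consume/a/b/temp/scan.pdf")
assert not in_ignored_dir("/consume/temporary/scan.pdf")  # only exact name matches count
```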
#### [`PAPERLESS_CONSUMER_BARCODE_SCANNER=<string>`](#PAPERLESS_CONSUMER_BARCODE_SCANNER) {#PAPERLESS_CONSUMER_BARCODE_SCANNER}
@@ -1283,23 +1306,22 @@ within your documents.
#### [`PAPERLESS_CONSUMER_POLLING_INTERVAL=<num>`](#PAPERLESS_CONSUMER_POLLING_INTERVAL) {#PAPERLESS_CONSUMER_POLLING_INTERVAL}
: Configures how the consumer detects new files in the consumption directory.
When set to `0` (the default), paperless uses native filesystem notifications for efficient, immediate detection of new files.
When set to a positive number, paperless polls the consumption directory at that interval in seconds. Use polling for network filesystems (NFS, SMB/CIFS), where native notifications may not work reliably.
Defaults to 0.
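The consumer is built on watchfiles, which supports both native notifications and polling. As a rough sketch of how such a setting can be wired up (the exact wiring inside paperless may differ; `force_polling` and `poll_delay_ms` are watchfiles parameters, and the mapping shown is an assumption):
```python
from watchfiles import watch

polling_interval = 10.0  # illustrative value for PAPERLESS_CONSUMER_POLLING_INTERVAL

if polling_interval > 0:
    # Poll the directory every N seconds (watchfiles expects milliseconds).
    kwargs = {"force_polling": True, "poll_delay_ms": int(polling_interval * 1000)}
else:
    # Rely on native filesystem notifications.
    kwargs = {"force_polling": False}

for changes in watch("/path/to/consume", **kwargs):
    ...  # hand each batch of changes to the consumer
```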
#### [`PAPERLESS_CONSUMER_STABILITY_DELAY=<num>`](#PAPERLESS_CONSUMER_STABILITY_DELAY) {#PAPERLESS_CONSUMER_STABILITY_DELAY}
: Sets the time in seconds that a file must remain unchanged (same size and modification time) before paperless will begin consuming it. Any detected change restarts the timer.
Increase this value if you experience issues with files being consumed before they are fully written, particularly on slower network storage or with certain scanner quirks.
Defaults to 5.0 seconds.
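Stability is judged by comparing a file's size and modification time between checks. A simplified, polling-style sketch of that idea (the consumer's own tracker is event-driven and more careful about errors):
```python
import time
from pathlib import Path

def wait_until_stable(path: Path, delay: float = 5.0, check_every: float = 0.5) -> None:
    """Block until `path` keeps the same size and mtime for `delay` seconds."""
    last: tuple[int, float] | None = None
    stable_since = time.monotonic()
    while True:
        st = path.stat()
        current = (st.st_size, st.st_mtime)
        if current != last:
            # Any observed change restarts the timer.
            last = current
            stable_since = time.monotonic()
        elif time.monotonic() - stable_since >= delay:
            return
        time.sleep(check_every)
```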
## Workflow webhooks

View File

@@ -9,7 +9,6 @@ native OS notifications and polling fallback.
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
from pathlib import Path
from threading import Event
@@ -57,7 +56,7 @@ class TrackedFile:
self.last_mtime = stat.st_mtime
self.last_size = stat.st_size
return True
except (FileNotFoundError, PermissionError):
except (FileNotFoundError, PermissionError, OSError):
return False
def is_unchanged(self) -> bool:
@@ -68,7 +67,7 @@ class TrackedFile:
try:
stat = self.path.stat()
return stat.st_mtime == self.last_mtime and stat.st_size == self.last_size
except (FileNotFoundError, PermissionError):
except (FileNotFoundError, PermissionError, OSError):
return False
@@ -138,7 +137,7 @@ class FileStabilityTracker:
to_remove: list[Path] = []
to_yield: list[Path] = []
for path, tracked in self._tracked.items():
for path, tracked in list(self._tracked.items()):
time_since_event = current_time - tracked.last_event_time
if time_since_event < self.stability_delay:
@@ -165,7 +164,7 @@ class FileStabilityTracker:
# Not a regular file (directory, symlink, etc.)
to_remove.append(path)
logger.debug(f"Path is not a regular file: {path}")
except (PermissionError, FileNotFoundError) as e:
except (PermissionError, OSError) as e:
logger.warning(f"Cannot access {path}: {e}")
to_remove.append(path)
@@ -190,34 +189,37 @@ class FileStabilityTracker:
class ConsumerFilter(DefaultFilter):
"""
Custom filter for the document consumer.
Filter for watchfiles that accepts only supported document types
and ignores system files/directories.
Filters files based on:
- Supported file extensions
- User-configured ignore patterns (regex)
- Default ignore patterns for common system files
Extends DefaultFilter leveraging its built-in filtering:
- `ignore_dirs`: Directory names to ignore (and all their contents)
- `ignore_entity_patterns`: Regex patterns matched against filename/dirname only
We add custom logic for file extension filtering (only accept supported
document types), which the library doesn't provide.
"""
# Default regex patterns to ignore (matched against filename only)
DEFAULT_IGNORE_PATTERNS: Final[frozenset[str]] = frozenset(
{
r"^\.DS_Store$",
r"^\.DS_STORE$",
r"^\._.*",
r"^desktop\.ini$",
r"^Thumbs\.db$",
},
# Regex patterns for files to always ignore (matched against filename only)
# These are passed to DefaultFilter.ignore_entity_patterns
DEFAULT_IGNORE_PATTERNS: Final[tuple[str, ...]] = (
r"^\.DS_Store$",
r"^\.DS_STORE$",
r"^\._.*",
r"^desktop\.ini$",
r"^Thumbs\.db$",
)
# Directories to always ignore (matched by name via DefaultFilter)
# Directories to always ignore (passed to DefaultFilter.ignore_dirs)
# These are matched by directory name, not full path
DEFAULT_IGNORE_DIRS: Final[tuple[str, ...]] = (
".stfolder",
".stversions",
".localized",
"@eaDir",
".Spotlight-V100",
".Trashes",
"__MACOSX",
".stfolder", # Syncthing
".stversions", # Syncthing
".localized", # macOS
"@eaDir", # Synology NAS
".Spotlight-V100", # macOS
".Trashes", # macOS
"__MACOSX", # macOS archive artifacts
)
def __init__(
@@ -225,38 +227,37 @@ class ConsumerFilter(DefaultFilter):
*,
supported_extensions: frozenset[str] | None = None,
ignore_patterns: list[str] | None = None,
consumption_dir: Path | None = None,
ignore_dirs: list[str] | None = None,
) -> None:
"""
Initialize the consumer filter.
Args:
supported_extensions: Set of supported file extensions (e.g., {".pdf", ".png"}).
If None, uses get_supported_file_extensions().
supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
If None, uses get_supported_file_extensions().
ignore_patterns: Additional regex patterns to ignore (matched against filename).
consumption_dir: Base consumption directory (unused, kept for API compatibility).
ignore_dirs: Additional directory names to ignore (merged with defaults).
"""
# Combine default and user patterns
all_patterns = set(self.DEFAULT_IGNORE_PATTERNS)
if ignore_patterns:
all_patterns.update(ignore_patterns)
# Compile all patterns
self._ignore_regexes: list[re.Pattern[str]] = [
re.compile(pattern) for pattern in all_patterns
]
# Get supported extensions
if supported_extensions is None:
supported_extensions = frozenset(get_supported_file_extensions())
self._supported_extensions = supported_extensions
# Call parent with directory ignore list
# DefaultFilter.ignore_dirs matches directory names, not full paths
# Combine default and user patterns
all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS)
if ignore_patterns:
all_patterns.extend(ignore_patterns)
# Combine default and user ignore_dirs
all_ignore_dirs: list[str] = list(self.DEFAULT_IGNORE_DIRS)
if ignore_dirs:
all_ignore_dirs.extend(ignore_dirs)
# Let DefaultFilter handle all the pattern and directory filtering
super().__init__(
ignore_dirs=self.DEFAULT_IGNORE_DIRS,
ignore_entity_patterns=None,
ignore_paths=None,
ignore_dirs=tuple(all_ignore_dirs),
ignore_entity_patterns=tuple(all_patterns),
ignore_paths=(),
)
def __call__(self, change: Change, path: str) -> bool:
@@ -264,39 +265,32 @@ class ConsumerFilter(DefaultFilter):
Filter function for watchfiles.
Returns True if the path should be watched, False to ignore.
The parent DefaultFilter handles:
- Hidden files/directories (starting with .)
- Directories in ignore_dirs
- Files/directories matching ignore_entity_patterns
We additionally filter files by extension.
"""
# Let parent filter handle directory ignoring and basic checks
# Let parent filter handle directory ignoring and pattern matching
if not super().__call__(change, path):
return False
path_obj = Path(path)
# For directories, parent filter already handled ignore_dirs
# For directories, parent filter already handled everything
if path_obj.is_dir():
return True
# For files, check extension
if not self._has_supported_extension(path_obj):
return False
# Check filename against ignore patterns
return not self._matches_ignore_pattern(path_obj.name)
return self._has_supported_extension(path_obj)
def _has_supported_extension(self, path: Path) -> bool:
"""Check if the file has a supported extension."""
suffix = path.suffix.lower()
return suffix in self._supported_extensions
def _matches_ignore_pattern(self, filename: str) -> bool:
"""Check if the filename matches any ignore pattern."""
for regex in self._ignore_regexes:
if regex.match(filename):
logger.debug(
f"Filename {filename} matched ignore pattern {regex.pattern}",
)
return True
return False
def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
"""
@@ -338,7 +332,7 @@ def _consume_file(
if not filepath.is_file():
logger.debug(f"Not consuming {filepath}: not a file or doesn't exist")
return
except (PermissionError, FileNotFoundError) as e:
except (PermissionError, OSError) as e:
logger.warning(f"Not consuming {filepath}: {e}")
return
@@ -347,7 +341,7 @@ def _consume_file(
if subdirs_as_tags:
try:
tag_ids = _tags_from_path(filepath, consumption_dir)
except Exception: # pragma: nocover
except Exception:
logger.exception(f"Error creating tags from path for {filepath}")
# Queue for consumption
@@ -404,7 +398,7 @@ class Command(BaseCommand):
# Resolve consumption directory
directory = options.get("directory")
if not directory:
directory = settings.CONSUMPTION_DIR
directory = getattr(settings, "CONSUMPTION_DIR", None)
if not directory:
raise CommandError("CONSUMPTION_DIR is not configured")
@@ -425,13 +419,14 @@ class Command(BaseCommand):
polling_interval: float = settings.CONSUMER_POLLING_INTERVAL
stability_delay: float = settings.CONSUMER_STABILITY_DELAY
ignore_patterns: list[str] = settings.CONSUMER_IGNORE_PATTERNS
ignore_dirs: list[str] = settings.CONSUMER_IGNORE_DIRS
is_testing: bool = options.get("testing", False)
is_oneshot: bool = options.get("oneshot", False)
# Create filter
consumer_filter = ConsumerFilter(
ignore_patterns=ignore_patterns,
consumption_dir=directory,
ignore_dirs=ignore_dirs,
)
# Process existing files
@@ -559,10 +554,10 @@ class Command(BaseCommand):
elif is_testing:
# In testing, use short timeout to check stop flag
timeout_ms = testing_timeout_ms
else: # pragma: nocover
else:
# No pending files, wait indefinitely
timeout_ms = 0
except KeyboardInterrupt: # pragma: nocover
except KeyboardInterrupt:
logger.info("Received interrupt, stopping consumer")
self.stop_flag.set()
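For reference, the filter above can be exercised on its own inside a configured paperless environment; a small usage sketch with made-up paths, using the import path shown in this commit:
```python
from watchfiles import Change

from documents.management.commands.document_consumer import ConsumerFilter

consumer_filter = ConsumerFilter(
    supported_extensions=frozenset({".pdf", ".png"}),
    ignore_patterns=[r"^temp_"],
    ignore_dirs=["incoming"],
)

consumer_filter(Change.added, "/consume/scan.pdf")        # True: supported extension
consumer_filter(Change.added, "/consume/temp_scan.pdf")   # False: matches the ^temp_ pattern
consumer_filter(Change.added, "/consume/incoming/a.pdf")  # False: inside an ignored directory
consumer_filter(Change.added, "/consume/notes.txt")       # False: .txt is not an accepted extension
```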

View File

@@ -46,9 +46,6 @@ if TYPE_CHECKING:
from pytest_mock import MockerFixture
# -- Fixtures --
@pytest.fixture
def stability_tracker() -> FileStabilityTracker:
"""Create a FileStabilityTracker with a short delay for testing."""
@@ -355,6 +352,28 @@ class TestConsumerFilter:
for pattern in ConsumerFilter.DEFAULT_IGNORE_PATTERNS:
re.compile(pattern)
def test_custom_ignore_dirs(self, tmp_path: Path) -> None:
"""Test filter respects custom ignore_dirs."""
filter_obj = ConsumerFilter(
supported_extensions=frozenset({".pdf"}),
ignore_dirs=["custom_ignored_dir"],
)
# Custom ignored directory should be rejected
custom_dir = tmp_path / "custom_ignored_dir"
custom_dir.mkdir()
assert filter_obj(Change.added, str(custom_dir)) is False
# Normal directory should be accepted
normal_dir = tmp_path / "normal_dir"
normal_dir.mkdir()
assert filter_obj(Change.added, str(normal_dir)) is True
# Default ignored directories should still be ignored
stfolder = tmp_path / ".stfolder"
stfolder.mkdir()
assert filter_obj(Change.added, str(stfolder)) is False
class TestConsumerFilterDefaults:
"""Tests for ConsumerFilter with default settings."""
@@ -617,6 +636,8 @@ class ConsumerThread(Thread):
def run(self) -> None:
try:
# Use override_settings to avoid polluting global settings
# which would affect other tests running on the same worker
with override_settings(
SCRATCH_DIR=self.scratch_dir,
CONSUMER_RECURSIVE=self.recursive,
@@ -633,8 +654,9 @@ class ConsumerThread(Thread):
except Exception as e:
self.exception = e
finally:
Tag.objects.all().delete()
# Close database connections created in this thread
# Important: Do not perform any database operations here (like Tag cleanup)
# as they create new connections that won't be properly closed
db.connections.close_all()
def stop(self) -> None:
@@ -672,7 +694,7 @@ def start_consumer(
finally:
# Cleanup all threads that were started
for thread in threads:
thread.stop()
thread.stop_and_wait()
failed_threads = []
for thread in threads:
@@ -680,9 +702,11 @@ def start_consumer(
if thread.is_alive():
failed_threads.append(thread)
# Clean up any Tags created by threads
# Clean up any Tags created by threads (they bypass test transaction isolation)
Tag.objects.all().delete()
db.connections.close_all()
if failed_threads:
pytest.fail(
f"{len(failed_threads)} consumer thread(s) did not stop within timeout",
@@ -799,6 +823,8 @@ class TestCommandWatch:
assert thread.is_alive()
finally:
thread.stop_and_wait(timeout=5.0)
# Clean up any Tags created by the thread
Tag.objects.all().delete()
assert not thread.is_alive()
@@ -860,8 +886,15 @@ class TestCommandWatchRecursive:
sample_pdf: Path,
mock_consume_file_delay: MagicMock,
start_consumer: Callable[..., ConsumerThread],
mocker: MockerFixture,
) -> None:
"""Test subdirs_as_tags creates tags from directory names."""
# Mock _tags_from_path to avoid database operations in the consumer thread
mock_tags = mocker.patch(
"documents.management.commands.document_consumer._tags_from_path",
return_value=[1, 2],
)
subdir = consumption_dir / "Invoices" / "2024"
subdir.mkdir(parents=True)
@@ -875,6 +908,7 @@ class TestCommandWatchRecursive:
raise thread.exception
mock_consume_file_delay.delay.assert_called()
mock_tags.assert_called()
call_args = mock_consume_file_delay.delay.call_args
overrides = call_args[0][1]
assert overrides.tag_ids is not None
@@ -934,3 +968,5 @@ class TestCommandWatchEdgeCases:
assert thread.is_alive()
finally:
thread.stop_and_wait(timeout=5.0)
# Clean up any Tags created by the thread
Tag.objects.all().delete()

View File

@@ -1019,7 +1019,7 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES
CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
# Ignore regex patterns, relative to PAPERLESS_CONSUMPTION_DIR
# Ignore regex patterns, matched against filename only
CONSUMER_IGNORE_PATTERNS = list(
json.loads(
os.getenv(
@@ -1029,6 +1029,16 @@ CONSUMER_IGNORE_PATTERNS = list(
),
)
# Directories to always ignore. These are matched by directory name, not full path
CONSUMER_IGNORE_DIRS = list(
json.loads(
os.getenv(
"PAPERLESS_CONSUMER_IGNORE_DIRS",
json.dumps([]),
),
),
)
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(