Separates out the ignore file from the ignore folder and updates documentation

This commit is contained in:
Trenton H
2026-01-12 08:35:43 -08:00
parent d45826eaa2
commit 94a2e6ff58
4 changed files with 162 additions and 99 deletions

View File

@@ -1168,21 +1168,44 @@ don't exist yet.
#### [`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`](#PAPERLESS_CONSUMER_IGNORE_PATTERNS) {#PAPERLESS_CONSUMER_IGNORE_PATTERNS} #### [`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`](#PAPERLESS_CONSUMER_IGNORE_PATTERNS) {#PAPERLESS_CONSUMER_IGNORE_PATTERNS}
: By default, paperless ignores certain files and folders in the : Additional regex patterns for files to ignore in the consumption directory. Patterns are matched against filenames only (not full paths)
consumption directory, such as system files created by the Mac OS using Python's `re.search()`, so a pattern may match anywhere in the filename unless it is anchored with `^` or `$`.
or hidden folders some tools use to store data.
This can be adjusted by configuring a custom json array with See the [watchfiles documentation](https://watchfiles.helpmanual.io/api/filters/#watchfiles.BaseFilter.ignore_entity_patterns) for details on how these patterns are applied.
patterns to exclude.
For example, `.DS_STORE/*` will ignore any files found in a folder This setting is for additional patterns beyond the built-in defaults. Common system files and directories are already ignored automatically.
named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`
A pattern like `._*` will ignore anything starting with `._`, including: Example custom patterns:
`._foo.pdf` and `._bar/foo.pdf`
Defaults to ```json
`[".DS_Store", ".DS_STORE", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*", "Thumbs.db"]`. ["^temp_", "\\.bak$", "^~"]
```
This would ignore:
- Files starting with `temp_` (e.g., `temp_scan.pdf`)
- Files ending with `.bak` (e.g., `document.pdf.bak`)
- Files starting with `~` (e.g., `~$document.docx`)
Defaults to `[]` (empty list, uses only built-in defaults).
The default ignores are `[.DS_Store, .DS_STORE, ._*, desktop.ini, Thumbs.db]` and cannot be overridden.
#### [`PAPERLESS_CONSUMER_IGNORE_DIRS=<json>`](#PAPERLESS_CONSUMER_IGNORE_DIRS) {#PAPERLESS_CONSUMER_IGNORE_DIRS}
: Additional directory names to ignore in the consumption directory. Directories matching these names (and all their contents) will be skipped.
This setting is for additional directories beyond the built-in defaults. Matching is done by directory name only, not full path.
Example:
```json
["temp", "incoming", ".hidden"]
```
Defaults to `[]` (empty list, uses only built-in defaults).
The default ignored directories are `[.stfolder, .stversions, .localized, @eaDir, .Spotlight-V100, .Trashes, __MACOSX]` and cannot be overridden.
#### [`PAPERLESS_CONSUMER_BARCODE_SCANNER=<string>`](#PAPERLESS_CONSUMER_BARCODE_SCANNER) {#PAPERLESS_CONSUMER_BARCODE_SCANNER} #### [`PAPERLESS_CONSUMER_BARCODE_SCANNER=<string>`](#PAPERLESS_CONSUMER_BARCODE_SCANNER) {#PAPERLESS_CONSUMER_BARCODE_SCANNER}
@@ -1283,23 +1306,22 @@ within your documents.
#### [`PAPERLESS_CONSUMER_POLLING_INTERVAL=<num>`](#PAPERLESS_CONSUMER_POLLING_INTERVAL) {#PAPERLESS_CONSUMER_POLLING_INTERVAL} #### [`PAPERLESS_CONSUMER_POLLING_INTERVAL=<num>`](#PAPERLESS_CONSUMER_POLLING_INTERVAL) {#PAPERLESS_CONSUMER_POLLING_INTERVAL}
: If paperless won't find documents added to your consume folder, it : Configures how the consumer detects new files in the consumption directory.
might not be able to automatically detect filesystem changes. In
that case, specify a polling interval in seconds here, which will
then cause paperless to periodically check your consumption
directory for changes. This will also disable listening for file
system changes with `inotify`.
Defaults to 0, which disables polling and uses filesystem When set to `0` (default), paperless uses native filesystem notifications for efficient, immediate detection of new files.
notifications.
When set to a positive number, paperless polls the consumption directory at that interval in seconds. Use polling for network filesystems (NFS, SMB/CIFS) where native notifications may not work reliably.
Defaults to 0.
#### [`PAPERLESS_CONSUMER_STABILITY_DELAY=<num>`](#PAPERLESS_CONSUMER_STABILITY_DELAY) {#PAPERLESS_CONSUMER_STABILITY_DELAY} #### [`PAPERLESS_CONSUMER_STABILITY_DELAY=<num>`](#PAPERLESS_CONSUMER_STABILITY_DELAY) {#PAPERLESS_CONSUMER_STABILITY_DELAY}
: Once a file has been detected in the consume folder, it must remain unchanged for this : Sets the time in seconds that a file must remain unchanged (same size and modification time) before paperless will begin consuming it.
many seconds before consumption will start on it. If the file is modified, its size changes
or the watching detects any other change on it, the timer will restart.
Defaults to 5. Increase this value if you experience issues with files being consumed before they are fully written, particularly on slower network storage or
with certain scanner quirks.
Defaults to 5.0 seconds.
## Workflow webhooks ## Workflow webhooks

View File

@@ -9,7 +9,6 @@ native OS notifications and polling fallback.
from __future__ import annotations from __future__ import annotations
import logging import logging
import re
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from threading import Event from threading import Event
@@ -57,7 +56,7 @@ class TrackedFile:
self.last_mtime = stat.st_mtime self.last_mtime = stat.st_mtime
self.last_size = stat.st_size self.last_size = stat.st_size
return True return True
except (FileNotFoundError, PermissionError): except (FileNotFoundError, PermissionError, OSError):
return False return False
def is_unchanged(self) -> bool: def is_unchanged(self) -> bool:
@@ -68,7 +67,7 @@ class TrackedFile:
try: try:
stat = self.path.stat() stat = self.path.stat()
return stat.st_mtime == self.last_mtime and stat.st_size == self.last_size return stat.st_mtime == self.last_mtime and stat.st_size == self.last_size
except (FileNotFoundError, PermissionError): except (FileNotFoundError, PermissionError, OSError):
return False return False
@@ -138,7 +137,7 @@ class FileStabilityTracker:
to_remove: list[Path] = [] to_remove: list[Path] = []
to_yield: list[Path] = [] to_yield: list[Path] = []
for path, tracked in self._tracked.items(): for path, tracked in list(self._tracked.items()):
time_since_event = current_time - tracked.last_event_time time_since_event = current_time - tracked.last_event_time
if time_since_event < self.stability_delay: if time_since_event < self.stability_delay:
@@ -165,7 +164,7 @@ class FileStabilityTracker:
# Not a regular file (directory, symlink, etc.) # Not a regular file (directory, symlink, etc.)
to_remove.append(path) to_remove.append(path)
logger.debug(f"Path is not a regular file: {path}") logger.debug(f"Path is not a regular file: {path}")
except (PermissionError, FileNotFoundError) as e: except (PermissionError, OSError) as e:
logger.warning(f"Cannot access {path}: {e}") logger.warning(f"Cannot access {path}: {e}")
to_remove.append(path) to_remove.append(path)
@@ -190,34 +189,37 @@ class FileStabilityTracker:
class ConsumerFilter(DefaultFilter): class ConsumerFilter(DefaultFilter):
""" """
Custom filter for the document consumer. Filter for watchfiles that accepts only supported document types
and ignores system files/directories.
Filters files based on: Extends DefaultFilter leveraging its built-in filtering:
- Supported file extensions - `ignore_dirs`: Directory names to ignore (and all their contents)
- User-configured ignore patterns (regex) - `ignore_entity_patterns`: Regex patterns matched against filename/dirname only
- Default ignore patterns for common system files
We add custom logic for file extension filtering (only accept supported
document types), which the library doesn't provide.
""" """
# Default regex patterns to ignore (matched against filename only) # Regex patterns for files to always ignore (matched against filename only)
DEFAULT_IGNORE_PATTERNS: Final[frozenset[str]] = frozenset( # These are passed to DefaultFilter.ignore_entity_patterns
{ DEFAULT_IGNORE_PATTERNS: Final[tuple[str, ...]] = (
r"^\.DS_Store$", r"^\.DS_Store$",
r"^\.DS_STORE$", r"^\.DS_STORE$",
r"^\._.*", r"^\._.*",
r"^desktop\.ini$", r"^desktop\.ini$",
r"^Thumbs\.db$", r"^Thumbs\.db$",
},
) )
# Directories to always ignore (matched by name via DefaultFilter) # Directories to always ignore (passed to DefaultFilter.ignore_dirs)
# These are matched by directory name, not full path
DEFAULT_IGNORE_DIRS: Final[tuple[str, ...]] = ( DEFAULT_IGNORE_DIRS: Final[tuple[str, ...]] = (
".stfolder", ".stfolder", # Syncthing
".stversions", ".stversions", # Syncthing
".localized", ".localized", # macOS
"@eaDir", "@eaDir", # Synology NAS
".Spotlight-V100", ".Spotlight-V100", # macOS
".Trashes", ".Trashes", # macOS
"__MACOSX", "__MACOSX", # macOS archive artifacts
) )
def __init__( def __init__(
@@ -225,38 +227,37 @@ class ConsumerFilter(DefaultFilter):
*, *,
supported_extensions: frozenset[str] | None = None, supported_extensions: frozenset[str] | None = None,
ignore_patterns: list[str] | None = None, ignore_patterns: list[str] | None = None,
consumption_dir: Path | None = None, ignore_dirs: list[str] | None = None,
) -> None: ) -> None:
""" """
Initialize the consumer filter. Initialize the consumer filter.
Args: Args:
supported_extensions: Set of supported file extensions (e.g., {".pdf", ".png"}). supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
If None, uses get_supported_file_extensions(). If None, uses get_supported_file_extensions().
ignore_patterns: Additional regex patterns to ignore (matched against filename). ignore_patterns: Additional regex patterns to ignore (matched against filename).
consumption_dir: Base consumption directory (unused, kept for API compatibility). ignore_dirs: Additional directory names to ignore (merged with defaults).
""" """
# Combine default and user patterns
all_patterns = set(self.DEFAULT_IGNORE_PATTERNS)
if ignore_patterns:
all_patterns.update(ignore_patterns)
# Compile all patterns
self._ignore_regexes: list[re.Pattern[str]] = [
re.compile(pattern) for pattern in all_patterns
]
# Get supported extensions # Get supported extensions
if supported_extensions is None: if supported_extensions is None:
supported_extensions = frozenset(get_supported_file_extensions()) supported_extensions = frozenset(get_supported_file_extensions())
self._supported_extensions = supported_extensions self._supported_extensions = supported_extensions
# Call parent with directory ignore list # Combine default and user patterns
# DefaultFilter.ignore_dirs matches directory names, not full paths all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS)
if ignore_patterns:
all_patterns.extend(ignore_patterns)
# Combine default and user ignore_dirs
all_ignore_dirs: list[str] = list(self.DEFAULT_IGNORE_DIRS)
if ignore_dirs:
all_ignore_dirs.extend(ignore_dirs)
# Let DefaultFilter handle all the pattern and directory filtering
super().__init__( super().__init__(
ignore_dirs=self.DEFAULT_IGNORE_DIRS, ignore_dirs=tuple(all_ignore_dirs),
ignore_entity_patterns=None, ignore_entity_patterns=tuple(all_patterns),
ignore_paths=None, ignore_paths=(),
) )
def __call__(self, change: Change, path: str) -> bool: def __call__(self, change: Change, path: str) -> bool:
@@ -264,39 +265,32 @@ class ConsumerFilter(DefaultFilter):
Filter function for watchfiles. Filter function for watchfiles.
Returns True if the path should be watched, False to ignore. Returns True if the path should be watched, False to ignore.
The parent DefaultFilter handles:
- Hidden files/directories (starting with .)
- Directories in ignore_dirs
- Files/directories matching ignore_entity_patterns
We additionally filter files by extension.
""" """
# Let parent filter handle directory ignoring and basic checks # Let parent filter handle directory ignoring and pattern matching
if not super().__call__(change, path): if not super().__call__(change, path):
return False return False
path_obj = Path(path) path_obj = Path(path)
# For directories, parent filter already handled ignore_dirs # For directories, parent filter already handled everything
if path_obj.is_dir(): if path_obj.is_dir():
return True return True
# For files, check extension # For files, check extension
if not self._has_supported_extension(path_obj): return self._has_supported_extension(path_obj)
return False
# Check filename against ignore patterns
return not self._matches_ignore_pattern(path_obj.name)
def _has_supported_extension(self, path: Path) -> bool: def _has_supported_extension(self, path: Path) -> bool:
"""Check if the file has a supported extension.""" """Check if the file has a supported extension."""
suffix = path.suffix.lower() suffix = path.suffix.lower()
return suffix in self._supported_extensions return suffix in self._supported_extensions
def _matches_ignore_pattern(self, filename: str) -> bool:
"""Check if the filename matches any ignore pattern."""
for regex in self._ignore_regexes:
if regex.match(filename):
logger.debug(
f"Filename {filename} matched ignore pattern {regex.pattern}",
)
return True
return False
def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]: def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
""" """
@@ -338,7 +332,7 @@ def _consume_file(
if not filepath.is_file(): if not filepath.is_file():
logger.debug(f"Not consuming {filepath}: not a file or doesn't exist") logger.debug(f"Not consuming {filepath}: not a file or doesn't exist")
return return
except (PermissionError, FileNotFoundError) as e: except (PermissionError, OSError) as e:
logger.warning(f"Not consuming {filepath}: {e}") logger.warning(f"Not consuming {filepath}: {e}")
return return
@@ -347,7 +341,7 @@ def _consume_file(
if subdirs_as_tags: if subdirs_as_tags:
try: try:
tag_ids = _tags_from_path(filepath, consumption_dir) tag_ids = _tags_from_path(filepath, consumption_dir)
except Exception: # pragma: nocover except Exception:
logger.exception(f"Error creating tags from path for {filepath}") logger.exception(f"Error creating tags from path for {filepath}")
# Queue for consumption # Queue for consumption
@@ -404,7 +398,7 @@ class Command(BaseCommand):
# Resolve consumption directory # Resolve consumption directory
directory = options.get("directory") directory = options.get("directory")
if not directory: if not directory:
directory = settings.CONSUMPTION_DIR directory = getattr(settings, "CONSUMPTION_DIR", None)
if not directory: if not directory:
raise CommandError("CONSUMPTION_DIR is not configured") raise CommandError("CONSUMPTION_DIR is not configured")
@@ -425,13 +419,14 @@ class Command(BaseCommand):
polling_interval: float = settings.CONSUMER_POLLING_INTERVAL polling_interval: float = settings.CONSUMER_POLLING_INTERVAL
stability_delay: float = settings.CONSUMER_STABILITY_DELAY stability_delay: float = settings.CONSUMER_STABILITY_DELAY
ignore_patterns: list[str] = settings.CONSUMER_IGNORE_PATTERNS ignore_patterns: list[str] = settings.CONSUMER_IGNORE_PATTERNS
ignore_dirs: list[str] = settings.CONSUMER_IGNORE_DIRS
is_testing: bool = options.get("testing", False) is_testing: bool = options.get("testing", False)
is_oneshot: bool = options.get("oneshot", False) is_oneshot: bool = options.get("oneshot", False)
# Create filter # Create filter
consumer_filter = ConsumerFilter( consumer_filter = ConsumerFilter(
ignore_patterns=ignore_patterns, ignore_patterns=ignore_patterns,
consumption_dir=directory, ignore_dirs=ignore_dirs,
) )
# Process existing files # Process existing files
@@ -559,10 +554,10 @@ class Command(BaseCommand):
elif is_testing: elif is_testing:
# In testing, use short timeout to check stop flag # In testing, use short timeout to check stop flag
timeout_ms = testing_timeout_ms timeout_ms = testing_timeout_ms
else: # pragma: nocover else:
# No pending files, wait indefinitely # No pending files, wait indefinitely
timeout_ms = 0 timeout_ms = 0
except KeyboardInterrupt: # pragma: nocover except KeyboardInterrupt:
logger.info("Received interrupt, stopping consumer") logger.info("Received interrupt, stopping consumer")
self.stop_flag.set() self.stop_flag.set()

View File

@@ -46,9 +46,6 @@ if TYPE_CHECKING:
from pytest_mock import MockerFixture from pytest_mock import MockerFixture
# -- Fixtures --
@pytest.fixture @pytest.fixture
def stability_tracker() -> FileStabilityTracker: def stability_tracker() -> FileStabilityTracker:
"""Create a FileStabilityTracker with a short delay for testing.""" """Create a FileStabilityTracker with a short delay for testing."""
@@ -355,6 +352,28 @@ class TestConsumerFilter:
for pattern in ConsumerFilter.DEFAULT_IGNORE_PATTERNS: for pattern in ConsumerFilter.DEFAULT_IGNORE_PATTERNS:
re.compile(pattern) re.compile(pattern)
def test_custom_ignore_dirs(self, tmp_path: Path) -> None:
"""Test filter respects custom ignore_dirs."""
filter_obj = ConsumerFilter(
supported_extensions=frozenset({".pdf"}),
ignore_dirs=["custom_ignored_dir"],
)
# Custom ignored directory should be rejected
custom_dir = tmp_path / "custom_ignored_dir"
custom_dir.mkdir()
assert filter_obj(Change.added, str(custom_dir)) is False
# Normal directory should be accepted
normal_dir = tmp_path / "normal_dir"
normal_dir.mkdir()
assert filter_obj(Change.added, str(normal_dir)) is True
# Default ignored directories should still be ignored
stfolder = tmp_path / ".stfolder"
stfolder.mkdir()
assert filter_obj(Change.added, str(stfolder)) is False
class TestConsumerFilterDefaults: class TestConsumerFilterDefaults:
"""Tests for ConsumerFilter with default settings.""" """Tests for ConsumerFilter with default settings."""
@@ -617,6 +636,8 @@ class ConsumerThread(Thread):
def run(self) -> None: def run(self) -> None:
try: try:
# Use override_settings to avoid polluting global settings
# which would affect other tests running on the same worker
with override_settings( with override_settings(
SCRATCH_DIR=self.scratch_dir, SCRATCH_DIR=self.scratch_dir,
CONSUMER_RECURSIVE=self.recursive, CONSUMER_RECURSIVE=self.recursive,
@@ -633,8 +654,9 @@ class ConsumerThread(Thread):
except Exception as e: except Exception as e:
self.exception = e self.exception = e
finally: finally:
Tag.objects.all().delete()
# Close database connections created in this thread # Close database connections created in this thread
# Important: Do not perform any database operations here (like Tag cleanup)
# as they create new connections that won't be properly closed
db.connections.close_all() db.connections.close_all()
def stop(self) -> None: def stop(self) -> None:
@@ -672,7 +694,7 @@ def start_consumer(
finally: finally:
# Cleanup all threads that were started # Cleanup all threads that were started
for thread in threads: for thread in threads:
thread.stop() thread.stop_and_wait()
failed_threads = [] failed_threads = []
for thread in threads: for thread in threads:
@@ -680,9 +702,11 @@ def start_consumer(
if thread.is_alive(): if thread.is_alive():
failed_threads.append(thread) failed_threads.append(thread)
# Clean up any Tags created by threads # Clean up any Tags created by threads (they bypass test transaction isolation)
Tag.objects.all().delete() Tag.objects.all().delete()
db.connections.close_all()
if failed_threads: if failed_threads:
pytest.fail( pytest.fail(
f"{len(failed_threads)} consumer thread(s) did not stop within timeout", f"{len(failed_threads)} consumer thread(s) did not stop within timeout",
@@ -799,6 +823,8 @@ class TestCommandWatch:
assert thread.is_alive() assert thread.is_alive()
finally: finally:
thread.stop_and_wait(timeout=5.0) thread.stop_and_wait(timeout=5.0)
# Clean up any Tags created by the thread
Tag.objects.all().delete()
assert not thread.is_alive() assert not thread.is_alive()
@@ -860,8 +886,15 @@ class TestCommandWatchRecursive:
sample_pdf: Path, sample_pdf: Path,
mock_consume_file_delay: MagicMock, mock_consume_file_delay: MagicMock,
start_consumer: Callable[..., ConsumerThread], start_consumer: Callable[..., ConsumerThread],
mocker: MockerFixture,
) -> None: ) -> None:
"""Test subdirs_as_tags creates tags from directory names.""" """Test subdirs_as_tags creates tags from directory names."""
# Mock _tags_from_path to avoid database operations in the consumer thread
mock_tags = mocker.patch(
"documents.management.commands.document_consumer._tags_from_path",
return_value=[1, 2],
)
subdir = consumption_dir / "Invoices" / "2024" subdir = consumption_dir / "Invoices" / "2024"
subdir.mkdir(parents=True) subdir.mkdir(parents=True)
@@ -875,6 +908,7 @@ class TestCommandWatchRecursive:
raise thread.exception raise thread.exception
mock_consume_file_delay.delay.assert_called() mock_consume_file_delay.delay.assert_called()
mock_tags.assert_called()
call_args = mock_consume_file_delay.delay.call_args call_args = mock_consume_file_delay.delay.call_args
overrides = call_args[0][1] overrides = call_args[0][1]
assert overrides.tag_ids is not None assert overrides.tag_ids is not None
@@ -934,3 +968,5 @@ class TestCommandWatchEdgeCases:
assert thread.is_alive() assert thread.is_alive()
finally: finally:
thread.stop_and_wait(timeout=5.0) thread.stop_and_wait(timeout=5.0)
# Clean up any Tags created by the thread
Tag.objects.all().delete()

View File

@@ -1019,7 +1019,7 @@ CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES
CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE") CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
# Ignore regex patterns, relative to PAPERLESS_CONSUMPTION_DIR # Ignore regex patterns, matched against filename only
CONSUMER_IGNORE_PATTERNS = list( CONSUMER_IGNORE_PATTERNS = list(
json.loads( json.loads(
os.getenv( os.getenv(
@@ -1029,6 +1029,16 @@ CONSUMER_IGNORE_PATTERNS = list(
), ),
) )
# Additional directory names to ignore, merged with the built-in defaults. These are matched by directory name, not full path
CONSUMER_IGNORE_DIRS = list(
json.loads(
os.getenv(
"PAPERLESS_CONSUMER_IGNORE_DIRS",
json.dumps([]),
),
),
)
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS") CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean( CONSUMER_ENABLE_BARCODES: Final[bool] = __get_boolean(