diff --git a/docs/configuration.md b/docs/configuration.md index 27b3f1b28..aeea1c7a4 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -999,13 +999,20 @@ within your documents. `PAPERLESS_CONSUMER_IGNORE_PATTERNS=` : By default, paperless ignores certain files and folders in the -consumption directory, such as system files created by the Mac OS. +consumption directory, such as system files created by the Mac OS +or hidden folders some tools use to store data. This can be adjusted by configuring a custom json array with patterns to exclude. + For example, `.DS_STORE/*` will ignore any files found in a folder + named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf` + + A pattern like `._*` will ignore anything starting with `._`, including: + `._foo.pdf` and `._bar/foo.pdf` + Defaults to - `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]`. + `[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]`. ## Binaries diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 9107d574a..c3f6bbed4 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,5 +1,6 @@ import logging import os +from fnmatch import filter from pathlib import Path from pathlib import PurePath from threading import Event @@ -7,6 +8,7 @@ from threading import Thread from time import monotonic from time import sleep from typing import Final +from typing import Set from django.conf import settings from django.core.management.base import BaseCommand @@ -25,15 +27,15 @@ except ImportError: # pragma: nocover logger = logging.getLogger("paperless.management.consumer") -def _tags_from_path(filepath): - """Walk up the directory tree from filepath to CONSUMPTION_DIR - and get or create Tag IDs for every directory. 
def _is_ignored(filepath: str) -> bool:
    """
    Checks if the given file should be ignored, based on configured
    patterns.

    Every component of the path relative to the consumption directory is
    tested against each pattern in settings.CONSUMER_IGNORE_PATTERNS, so a
    pattern may match the file itself or any folder the file is nested in.

    Returns True if the file is ignored, False otherwise

    PurePath.relative_to raises ValueError if the file is not located
    below settings.CONSUMPTION_DIR
    """
    filepath = os.path.abspath(
        os.path.normpath(filepath),
    )

    # Trim out the consume directory, leaving only the filename and its
    # path relative to the consume directory
    filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)

    # Break the path into its components, directories first, filename last
    # foo/bar/baz/file.pdf -> (foo/, bar/, baz/, file.pdf)
    #
    # Only the final component is the file; everything before it is a
    # directory.  This is decided by position rather than by comparing
    # against the filename, so a folder which shares its name with the
    # file (.DS_STORE/.DS_STORE) is still treated as a folder.
    #
    # Directories need the trailing slash or fnmatch doesn't match
    #   fnmatch("dir", "dir/*") == False
    #   fnmatch("dir/", "dir/*") == True
    relative_parts = filepath_relative.parts
    parts = [f"{directory}/" for directory in relative_parts[:-1]]
    # Slicing (instead of indexing) keeps an empty relative path harmless
    parts.extend(relative_parts[-1:])

    # fnmatch.filter returns the list of matching components, which is
    # empty (falsy) when the pattern matches nothing
    return any(
        filter(parts, pattern) for pattern in settings.CONSUMER_IGNORE_PATTERNS
    )
def test_is_ignored(self):
    """
    Runs _is_ignored over a table of paths located below the consumption
    directory and compares each result with the expected outcome.
    """
    # (path components below the consumption directory, expected result)
    cases = (
        (("foo.pdf",), False),
        (("foo", "bar.pdf"), False),
        ((".DS_STORE", "foo.pdf"), True),
        (("foo", ".DS_STORE", "bar.pdf"), True),
        ((".DS_STORE", "foo", "bar.pdf"), True),
        ((".stfolder", "foo.pdf"), True),
        ((".stfolder.pdf",), False),
        ((".stversions", "foo.pdf"), True),
        ((".stversions.pdf",), False),
        (("._foo.pdf",), True),
        (("my_foo.pdf",), False),
        (("._foo", "bar.pdf"), True),
        (("@eaDir", "SYNO@.fileindexdb", "_1jk.fnm"), True),
    )
    consumption_dir = self.dirs.consumption_dir

    # Build all absolute paths up front, then check them in table order
    test_paths = [
        {"path": os.path.join(consumption_dir, *components), "ignore": ignored}
        for components, ignored in cases
    ]

    for entry in test_paths:
        candidate = entry["path"]
        should_ignore = entry["ignore"]
        self.assertEqual(
            should_ignore,
            document_consumer._is_ignored(candidate),
            f'_is_ignored("{candidate}") != {should_ignore}',
        )