Updates ignore path filtering so files in a folder in an ignored folder will be ignored correctly

This commit is contained in:
Trenton H 2023-02-14 12:54:03 -08:00
parent 0d1a8d6d2f
commit c08f0054da
4 changed files with 135 additions and 30 deletions

View File

@ -999,13 +999,20 @@ within your documents.
`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`
: By default, paperless ignores certain files and folders in the
consumption directory, such as system files created by the Mac OS.
consumption directory, such as system files created by the Mac OS
or hidden folders some tools use to store data.
This can be adjusted by configuring a custom json array with
patterns to exclude.
For example, `.DS_STORE/*` will ignore any files found in a folder
named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`
A pattern like `._*` will ignore anything starting with `._`, including:
`._foo.pdf` and `._bar/foo.pdf`
Defaults to
`[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]`.
`[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]`.
## Binaries

View File

@ -1,5 +1,6 @@
import logging
import os
from fnmatch import filter
from pathlib import Path
from pathlib import PurePath
from threading import Event
@ -7,6 +8,7 @@ from threading import Thread
from time import monotonic
from time import sleep
from typing import Final
from typing import Set
from django.conf import settings
from django.core.management.base import BaseCommand
@ -25,15 +27,15 @@ except ImportError: # pragma: nocover
logger = logging.getLogger("paperless.management.consumer")
def _tags_from_path(filepath):
"""Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
def _tags_from_path(filepath) -> Set[Tag]:
"""
Walk up the directory tree from filepath to CONSUMPTION_DIR
and get or create Tag IDs for every directory.
Returns set of Tag models
"""
normalized_consumption_dir = os.path.abspath(
os.path.normpath(settings.CONSUMPTION_DIR),
)
tag_ids = set()
path_parts = Path(filepath).relative_to(normalized_consumption_dir).parent.parts
path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
for part in path_parts:
tag_ids.add(
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
@ -43,14 +45,41 @@ def _tags_from_path(filepath):
def _is_ignored(filepath: str) -> bool:
normalized_consumption_dir = os.path.abspath(
os.path.normpath(settings.CONSUMPTION_DIR),
"""
Checks if the given file should be ignored, based on configured
patterns.
Returns True if the file is ignored, False otherwise
"""
filepath = os.path.abspath(
os.path.normpath(filepath),
)
filepath_relative = PurePath(filepath).relative_to(normalized_consumption_dir)
return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
# Trim out the consume directory, leaving only filename and it's
# path relative to the consume directory
filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
# March through the components of the path, including directories and the filename
# looking for anything matching
# foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
parts = []
for part in filepath_relative.parts:
# If the part is not the name (ie, it's a dir)
# Need to append the trailing slash or fnmatch doesn't match
# fnmatch("dir", "dir/*") == False
# fnmatch("dir/", "dir/*") == True
if part != filepath_relative.name:
part = part + "/"
parts.append(part)
for pattern in settings.CONSUMER_IGNORE_PATTERNS:
if len(filter(parts, pattern)):
return True
return False
def _consume(filepath):
def _consume(filepath: str) -> None:
if os.path.isdir(filepath) or _is_ignored(filepath):
return
@ -103,7 +132,13 @@ def _consume(filepath):
logger.exception("Error while consuming document")
def _consume_wait_unmodified(file):
def _consume_wait_unmodified(file: str) -> None:
"""
Waits for the given file to appear unmodified based on file size
and modification time. Will wait a configured number of seconds
and retry a configured number of times before either consuming or
giving up
"""
if _is_ignored(file):
return

View File

@ -247,22 +247,85 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
def test_is_ignored(self):
test_paths = [
(os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
(os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
(os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
(
os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
True,
),
(os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
(os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
{
"path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"),
"ignore": True,
},
{
"path": os.path.join(
self.dirs.consumption_dir,
"foo",
".DS_STORE",
"bar.pdf",
),
"ignore": True,
},
{
"path": os.path.join(
self.dirs.consumption_dir,
".DS_STORE",
"foo",
"bar.pdf",
),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
"ignore": False,
},
{
"path": os.path.join(
self.dirs.consumption_dir,
".stversions",
"foo.pdf",
),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
"ignore": True,
},
{
"path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
"ignore": False,
},
{
"path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
"ignore": True,
},
{
"path": os.path.join(
self.dirs.consumption_dir,
"@eaDir",
"SYNO@.fileindexdb",
"_1jk.fnm",
),
"ignore": True,
},
]
for file_path, expected_ignored in test_paths:
for test_setup in test_paths:
filepath = test_setup["path"]
expected_ignored_result = test_setup["ignore"]
self.assertEqual(
expected_ignored,
document_consumer._is_ignored(file_path),
f'_is_ignored("{file_path}") != {expected_ignored}',
expected_ignored_result,
document_consumer._is_ignored(filepath),
f'_is_ignored("{filepath}") != {expected_ignored_result}',
)
@mock.patch("documents.management.commands.document_consumer.open")

View File

@ -673,7 +673,7 @@ CONSUMER_IGNORE_PATTERNS = list(
json.loads(
os.getenv(
"PAPERLESS_CONSUMER_IGNORE_PATTERNS",
'[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]', # noqa: E501
'[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]', # noqa: E501
),
),
)