mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Updates ignore path filtering so files in a folder in an ignored folder will be ignored correctly
This commit is contained in:
parent
0d1a8d6d2f
commit
c08f0054da
@ -999,13 +999,20 @@ within your documents.
|
|||||||
`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`
|
`PAPERLESS_CONSUMER_IGNORE_PATTERNS=<json>`
|
||||||
|
|
||||||
: By default, paperless ignores certain files and folders in the
|
: By default, paperless ignores certain files and folders in the
|
||||||
consumption directory, such as system files created by the Mac OS.
|
consumption directory, such as system files created by macOS
|
||||||
|
or hidden folders some tools use to store data.
|
||||||
|
|
||||||
This can be adjusted by configuring a custom json array with
|
This can be adjusted by configuring a custom json array with
|
||||||
patterns to exclude.
|
patterns to exclude.
|
||||||
|
|
||||||
|
For example, `.DS_STORE/*` will ignore any files found in a folder
|
||||||
|
named `.DS_STORE`, including `.DS_STORE/bar.pdf` and `foo/.DS_STORE/bar.pdf`.
|
||||||
|
|
||||||
|
A pattern like `._*` will ignore anything starting with `._`, including:
|
||||||
|
`._foo.pdf` and `._bar/foo.pdf`.
|
||||||
|
|
||||||
Defaults to
|
Defaults to
|
||||||
`[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]`.
|
`[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]`.
|
||||||
|
|
||||||
## Binaries
|
## Binaries
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from fnmatch import filter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from pathlib import PurePath
|
from pathlib import PurePath
|
||||||
from threading import Event
|
from threading import Event
|
||||||
@ -7,6 +8,7 @@ from threading import Thread
|
|||||||
from time import monotonic
|
from time import monotonic
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import Final
|
from typing import Final
|
||||||
|
from typing import Set
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
@ -25,15 +27,15 @@ except ImportError: # pragma: nocover
|
|||||||
logger = logging.getLogger("paperless.management.consumer")
|
logger = logging.getLogger("paperless.management.consumer")
|
||||||
|
|
||||||
|
|
||||||
def _tags_from_path(filepath):
|
def _tags_from_path(filepath) -> Set[Tag]:
|
||||||
"""Walk up the directory tree from filepath to CONSUMPTION_DIR
|
"""
|
||||||
and get or create Tag IDs for every directory.
|
Walk up the directory tree from filepath to CONSUMPTION_DIR
|
||||||
|
and get or create Tag IDs for every directory.
|
||||||
|
|
||||||
|
Returns set of Tag models
|
||||||
"""
|
"""
|
||||||
normalized_consumption_dir = os.path.abspath(
|
|
||||||
os.path.normpath(settings.CONSUMPTION_DIR),
|
|
||||||
)
|
|
||||||
tag_ids = set()
|
tag_ids = set()
|
||||||
path_parts = Path(filepath).relative_to(normalized_consumption_dir).parent.parts
|
path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
|
||||||
for part in path_parts:
|
for part in path_parts:
|
||||||
tag_ids.add(
|
tag_ids.add(
|
||||||
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
|
Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
|
||||||
@ -43,14 +45,41 @@ def _tags_from_path(filepath):
|
|||||||
|
|
||||||
|
|
||||||
def _is_ignored(filepath: str) -> bool:
|
def _is_ignored(filepath: str) -> bool:
|
||||||
normalized_consumption_dir = os.path.abspath(
|
"""
|
||||||
os.path.normpath(settings.CONSUMPTION_DIR),
|
Checks if the given file should be ignored, based on configured
|
||||||
|
patterns.
|
||||||
|
|
||||||
|
Returns True if the file is ignored, False otherwise
|
||||||
|
"""
|
||||||
|
filepath = os.path.abspath(
|
||||||
|
os.path.normpath(filepath),
|
||||||
)
|
)
|
||||||
filepath_relative = PurePath(filepath).relative_to(normalized_consumption_dir)
|
|
||||||
return any(filepath_relative.match(p) for p in settings.CONSUMER_IGNORE_PATTERNS)
|
# Trim out the consume directory, leaving only the filename and its
|
||||||
|
# path relative to the consume directory
|
||||||
|
filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
|
||||||
|
|
||||||
|
# March through the components of the path, including directories and the filename
|
||||||
|
# looking for anything matching
|
||||||
|
# foo/bar/baz/file.pdf -> (foo, bar, baz, file.pdf)
|
||||||
|
parts = []
|
||||||
|
for part in filepath_relative.parts:
|
||||||
|
# If the part is not the name (i.e., it's a dir)
|
||||||
|
# Need to append the trailing slash or fnmatch doesn't match
|
||||||
|
# fnmatch("dir", "dir/*") == False
|
||||||
|
# fnmatch("dir/", "dir/*") == True
|
||||||
|
if part != filepath_relative.name:
|
||||||
|
part = part + "/"
|
||||||
|
parts.append(part)
|
||||||
|
|
||||||
|
for pattern in settings.CONSUMER_IGNORE_PATTERNS:
|
||||||
|
if len(filter(parts, pattern)):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _consume(filepath):
|
def _consume(filepath: str) -> None:
|
||||||
if os.path.isdir(filepath) or _is_ignored(filepath):
|
if os.path.isdir(filepath) or _is_ignored(filepath):
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -103,7 +132,13 @@ def _consume(filepath):
|
|||||||
logger.exception("Error while consuming document")
|
logger.exception("Error while consuming document")
|
||||||
|
|
||||||
|
|
||||||
def _consume_wait_unmodified(file):
|
def _consume_wait_unmodified(file: str) -> None:
|
||||||
|
"""
|
||||||
|
Waits for the given file to appear unmodified based on file size
|
||||||
|
and modification time. Will wait a configured number of seconds
|
||||||
|
and retry a configured number of times before either consuming or
|
||||||
|
giving up
|
||||||
|
"""
|
||||||
if _is_ignored(file):
|
if _is_ignored(file):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -247,22 +247,85 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
|
|||||||
|
|
||||||
def test_is_ignored(self):
|
def test_is_ignored(self):
|
||||||
test_paths = [
|
test_paths = [
|
||||||
(os.path.join(self.dirs.consumption_dir, "foo.pdf"), False),
|
{
|
||||||
(os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"), False),
|
"path": os.path.join(self.dirs.consumption_dir, "foo.pdf"),
|
||||||
(os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"), True),
|
"ignore": False,
|
||||||
(
|
},
|
||||||
os.path.join(self.dirs.consumption_dir, "foo", ".DS_STORE", "bar.pdf"),
|
{
|
||||||
True,
|
"path": os.path.join(self.dirs.consumption_dir, "foo", "bar.pdf"),
|
||||||
),
|
"ignore": False,
|
||||||
(os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"), True),
|
},
|
||||||
(os.path.join(self.dirs.consumption_dir, "._foo.pdf"), True),
|
{
|
||||||
(os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"), False),
|
"path": os.path.join(self.dirs.consumption_dir, ".DS_STORE", "foo.pdf"),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(
|
||||||
|
self.dirs.consumption_dir,
|
||||||
|
"foo",
|
||||||
|
".DS_STORE",
|
||||||
|
"bar.pdf",
|
||||||
|
),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(
|
||||||
|
self.dirs.consumption_dir,
|
||||||
|
".DS_STORE",
|
||||||
|
"foo",
|
||||||
|
"bar.pdf",
|
||||||
|
),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(self.dirs.consumption_dir, ".stfolder", "foo.pdf"),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(self.dirs.consumption_dir, ".stfolder.pdf"),
|
||||||
|
"ignore": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(
|
||||||
|
self.dirs.consumption_dir,
|
||||||
|
".stversions",
|
||||||
|
"foo.pdf",
|
||||||
|
),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(self.dirs.consumption_dir, ".stversions.pdf"),
|
||||||
|
"ignore": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(self.dirs.consumption_dir, "._foo.pdf"),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(self.dirs.consumption_dir, "my_foo.pdf"),
|
||||||
|
"ignore": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(self.dirs.consumption_dir, "._foo", "bar.pdf"),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": os.path.join(
|
||||||
|
self.dirs.consumption_dir,
|
||||||
|
"@eaDir",
|
||||||
|
"SYNO@.fileindexdb",
|
||||||
|
"_1jk.fnm",
|
||||||
|
),
|
||||||
|
"ignore": True,
|
||||||
|
},
|
||||||
]
|
]
|
||||||
for file_path, expected_ignored in test_paths:
|
for test_setup in test_paths:
|
||||||
|
filepath = test_setup["path"]
|
||||||
|
expected_ignored_result = test_setup["ignore"]
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
expected_ignored,
|
expected_ignored_result,
|
||||||
document_consumer._is_ignored(file_path),
|
document_consumer._is_ignored(filepath),
|
||||||
f'_is_ignored("{file_path}") != {expected_ignored}',
|
f'_is_ignored("{filepath}") != {expected_ignored_result}',
|
||||||
)
|
)
|
||||||
|
|
||||||
@mock.patch("documents.management.commands.document_consumer.open")
|
@mock.patch("documents.management.commands.document_consumer.open")
|
||||||
|
@ -673,7 +673,7 @@ CONSUMER_IGNORE_PATTERNS = list(
|
|||||||
json.loads(
|
json.loads(
|
||||||
os.getenv(
|
os.getenv(
|
||||||
"PAPERLESS_CONSUMER_IGNORE_PATTERNS",
|
"PAPERLESS_CONSUMER_IGNORE_PATTERNS",
|
||||||
'[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]', # noqa: E501
|
'[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini", "@eaDir/*"]', # noqa: E501
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user