mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-02-01 23:19:00 -06:00
Just curious
This commit is contained in:
@@ -28,7 +28,6 @@ from documents.data_models import ConsumableDocument
|
|||||||
from documents.data_models import DocumentMetadataOverrides
|
from documents.data_models import DocumentMetadataOverrides
|
||||||
from documents.data_models import DocumentSource
|
from documents.data_models import DocumentSource
|
||||||
from documents.models import Tag
|
from documents.models import Tag
|
||||||
from documents.parsers import get_supported_file_extensions
|
|
||||||
from documents.tasks import consume_file
|
from documents.tasks import consume_file
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@@ -216,7 +215,6 @@ class ConsumerFilter(DefaultFilter):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
supported_extensions: frozenset[str] | None = None,
|
|
||||||
ignore_patterns: list[str] | None = None,
|
ignore_patterns: list[str] | None = None,
|
||||||
ignore_dirs: list[str] | None = None,
|
ignore_dirs: list[str] | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -224,16 +222,9 @@ class ConsumerFilter(DefaultFilter):
|
|||||||
Initialize the consumer filter.
|
Initialize the consumer filter.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
|
|
||||||
If None, uses get_supported_file_extensions().
|
|
||||||
ignore_patterns: Additional regex patterns to ignore (matched against filename).
|
ignore_patterns: Additional regex patterns to ignore (matched against filename).
|
||||||
ignore_dirs: Additional directory names to ignore (merged with defaults).
|
ignore_dirs: Additional directory names to ignore (merged with defaults).
|
||||||
"""
|
"""
|
||||||
# Get supported extensions
|
|
||||||
if supported_extensions is None:
|
|
||||||
supported_extensions = frozenset(get_supported_file_extensions())
|
|
||||||
self._supported_extensions = supported_extensions
|
|
||||||
|
|
||||||
# Combine default and user patterns
|
# Combine default and user patterns
|
||||||
all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS)
|
all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS)
|
||||||
if ignore_patterns:
|
if ignore_patterns:
|
||||||
@@ -261,8 +252,6 @@ class ConsumerFilter(DefaultFilter):
|
|||||||
- Hidden files/directories (starting with .)
|
- Hidden files/directories (starting with .)
|
||||||
- Directories in ignore_dirs
|
- Directories in ignore_dirs
|
||||||
- Files/directories matching ignore_entity_patterns
|
- Files/directories matching ignore_entity_patterns
|
||||||
|
|
||||||
We additionally filter files by extension.
|
|
||||||
"""
|
"""
|
||||||
# Let parent filter handle directory ignoring and pattern matching
|
# Let parent filter handle directory ignoring and pattern matching
|
||||||
if not super().__call__(change, path):
|
if not super().__call__(change, path):
|
||||||
@@ -274,14 +263,6 @@ class ConsumerFilter(DefaultFilter):
|
|||||||
if path_obj.is_dir():
|
if path_obj.is_dir():
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# For files, check extension
|
|
||||||
return self._has_supported_extension(path_obj)
|
|
||||||
|
|
||||||
def _has_supported_extension(self, path: Path) -> bool:
|
|
||||||
"""Check if the file has a supported extension."""
|
|
||||||
suffix = path.suffix.lower()
|
|
||||||
return suffix in self._supported_extensions
|
|
||||||
|
|
||||||
|
|
||||||
def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
|
def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -85,34 +85,6 @@ def get_default_file_extension(mime_type: str) -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=8)
|
|
||||||
def is_file_ext_supported(ext: str) -> bool:
|
|
||||||
"""
|
|
||||||
Returns True if the file extension is supported, False otherwise
|
|
||||||
TODO: Investigate why this really exists, why not use mimetype
|
|
||||||
"""
|
|
||||||
if ext:
|
|
||||||
return ext.lower() in get_supported_file_extensions()
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_supported_file_extensions() -> set[str]:
|
|
||||||
extensions = set()
|
|
||||||
for response in document_consumer_declaration.send(None):
|
|
||||||
parser_declaration = response[1]
|
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
|
||||||
|
|
||||||
for mime_type in supported_mime_types:
|
|
||||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
|
||||||
# Python's stdlib might be behind, so also add what the parser
|
|
||||||
# says is the default extension
|
|
||||||
# This makes image/webp supported on Python < 3.11
|
|
||||||
extensions.add(supported_mime_types[mime_type])
|
|
||||||
|
|
||||||
return extensions
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
||||||
"""
|
"""
|
||||||
Returns the best parser (by weight) for the given mimetype or
|
Returns the best parser (by weight) for the given mimetype or
|
||||||
|
|||||||
@@ -90,7 +90,6 @@ def sample_pdf(tmp_path: Path) -> Path:
|
|||||||
def consumer_filter() -> ConsumerFilter:
|
def consumer_filter() -> ConsumerFilter:
|
||||||
"""Create a ConsumerFilter for testing."""
|
"""Create a ConsumerFilter for testing."""
|
||||||
return ConsumerFilter(
|
return ConsumerFilter(
|
||||||
supported_extensions=frozenset({".pdf", ".png", ".jpg"}),
|
|
||||||
ignore_patterns=[r"^custom_ignore"],
|
ignore_patterns=[r"^custom_ignore"],
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -105,15 +104,6 @@ def mock_consume_file_delay(mocker: MockerFixture) -> MagicMock:
|
|||||||
return mock_task
|
return mock_task
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def mock_supported_extensions(mocker: MockerFixture) -> MagicMock:
|
|
||||||
"""Mock get_supported_file_extensions to return only .pdf."""
|
|
||||||
return mocker.patch(
|
|
||||||
"documents.management.commands.document_consumer.get_supported_file_extensions",
|
|
||||||
return_value={".pdf"},
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def wait_for_mock_call(
|
def wait_for_mock_call(
|
||||||
mock_obj: MagicMock,
|
mock_obj: MagicMock,
|
||||||
timeout_s: float = 5.0,
|
timeout_s: float = 5.0,
|
||||||
@@ -395,7 +385,6 @@ class TestConsumerFilter:
|
|||||||
def test_custom_ignore_dirs(self, tmp_path: Path) -> None:
|
def test_custom_ignore_dirs(self, tmp_path: Path) -> None:
|
||||||
"""Test filter respects custom ignore_dirs."""
|
"""Test filter respects custom ignore_dirs."""
|
||||||
filter_obj = ConsumerFilter(
|
filter_obj = ConsumerFilter(
|
||||||
supported_extensions=frozenset({".pdf"}),
|
|
||||||
ignore_dirs=["custom_ignored_dir"],
|
ignore_dirs=["custom_ignored_dir"],
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -415,25 +404,6 @@ class TestConsumerFilter:
|
|||||||
assert filter_obj(Change.added, str(stfolder)) is False
|
assert filter_obj(Change.added, str(stfolder)) is False
|
||||||
|
|
||||||
|
|
||||||
class TestConsumerFilterDefaults:
|
|
||||||
"""Tests for ConsumerFilter with default settings."""
|
|
||||||
|
|
||||||
def test_filter_with_mocked_extensions(
|
|
||||||
self,
|
|
||||||
tmp_path: Path,
|
|
||||||
mocker: MockerFixture,
|
|
||||||
) -> None:
|
|
||||||
"""Test filter works when using mocked extensions from parser."""
|
|
||||||
mocker.patch(
|
|
||||||
"documents.management.commands.document_consumer.get_supported_file_extensions",
|
|
||||||
return_value={".pdf", ".png"},
|
|
||||||
)
|
|
||||||
filter_obj = ConsumerFilter()
|
|
||||||
test_file = tmp_path / "document.pdf"
|
|
||||||
test_file.touch()
|
|
||||||
assert filter_obj(Change.added, str(test_file)) is True
|
|
||||||
|
|
||||||
|
|
||||||
class TestConsumeFile:
|
class TestConsumeFile:
|
||||||
"""Tests for the _consume_file function."""
|
"""Tests for the _consume_file function."""
|
||||||
|
|
||||||
@@ -605,7 +575,6 @@ class TestCommandValidation:
|
|||||||
cmd.handle(directory=str(sample_pdf), oneshot=True, testing=False)
|
cmd.handle(directory=str(sample_pdf), oneshot=True, testing=False)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("mock_supported_extensions")
|
|
||||||
class TestCommandOneshot:
|
class TestCommandOneshot:
|
||||||
"""Tests for oneshot mode."""
|
"""Tests for oneshot mode."""
|
||||||
|
|
||||||
@@ -652,25 +621,6 @@ class TestCommandOneshot:
|
|||||||
|
|
||||||
mock_consume_file_delay.delay.assert_called_once()
|
mock_consume_file_delay.delay.assert_called_once()
|
||||||
|
|
||||||
def test_ignores_unsupported_extensions(
|
|
||||||
self,
|
|
||||||
consumption_dir: Path,
|
|
||||||
scratch_dir: Path,
|
|
||||||
mock_consume_file_delay: MagicMock,
|
|
||||||
settings: SettingsWrapper,
|
|
||||||
) -> None:
|
|
||||||
"""Test oneshot mode ignores unsupported file extensions."""
|
|
||||||
target = consumption_dir / "document.xyz"
|
|
||||||
target.write_bytes(b"content")
|
|
||||||
|
|
||||||
settings.SCRATCH_DIR = scratch_dir
|
|
||||||
settings.CONSUMER_IGNORE_PATTERNS = []
|
|
||||||
|
|
||||||
cmd = Command()
|
|
||||||
cmd.handle(directory=str(consumption_dir), oneshot=True, testing=False)
|
|
||||||
|
|
||||||
mock_consume_file_delay.delay.assert_not_called()
|
|
||||||
|
|
||||||
|
|
||||||
class ConsumerThread(Thread):
|
class ConsumerThread(Thread):
|
||||||
"""Thread wrapper for running the consumer command with proper cleanup."""
|
"""Thread wrapper for running the consumer command with proper cleanup."""
|
||||||
@@ -739,7 +689,6 @@ class ConsumerThread(Thread):
|
|||||||
def start_consumer(
|
def start_consumer(
|
||||||
consumption_dir: Path,
|
consumption_dir: Path,
|
||||||
scratch_dir: Path,
|
scratch_dir: Path,
|
||||||
mock_supported_extensions: MagicMock,
|
|
||||||
) -> Generator[Callable[..., ConsumerThread], None, None]:
|
) -> Generator[Callable[..., ConsumerThread], None, None]:
|
||||||
"""Start a consumer thread and ensure cleanup."""
|
"""Start a consumer thread and ensure cleanup."""
|
||||||
threads: list[ConsumerThread] = []
|
threads: list[ConsumerThread] = []
|
||||||
@@ -875,7 +824,6 @@ class TestCommandWatch:
|
|||||||
assert call_args.original_file.name == "valid.pdf"
|
assert call_args.original_file.name == "valid.pdf"
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
@pytest.mark.usefixtures("mock_supported_extensions")
|
|
||||||
def test_stop_flag_stops_consumer(
|
def test_stop_flag_stops_consumer(
|
||||||
self,
|
self,
|
||||||
consumption_dir: Path,
|
consumption_dir: Path,
|
||||||
@@ -1017,7 +965,6 @@ class TestCommandWatchEdgeCases:
|
|||||||
|
|
||||||
mock_consume_file_delay.delay.assert_not_called()
|
mock_consume_file_delay.delay.assert_not_called()
|
||||||
|
|
||||||
@pytest.mark.usefixtures("mock_supported_extensions")
|
|
||||||
def test_handles_task_exception(
|
def test_handles_task_exception(
|
||||||
self,
|
self,
|
||||||
consumption_dir: Path,
|
consumption_dir: Path,
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ from django.test import override_settings
|
|||||||
|
|
||||||
from documents.parsers import get_default_file_extension
|
from documents.parsers import get_default_file_extension
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
from documents.parsers import get_supported_file_extensions
|
|
||||||
from documents.parsers import is_file_ext_supported
|
from documents.parsers import is_file_ext_supported
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||||
from paperless_text.parsers import TextDocumentParser
|
from paperless_text.parsers import TextDocumentParser
|
||||||
@@ -145,10 +144,7 @@ class TestParserAvailability(TestCase):
|
|||||||
("image/webp", ".webp"),
|
("image/webp", ".webp"),
|
||||||
]
|
]
|
||||||
|
|
||||||
supported_exts = get_supported_file_extensions()
|
|
||||||
|
|
||||||
for mime_type, ext in supported_mimes_and_exts:
|
for mime_type, ext in supported_mimes_and_exts:
|
||||||
self.assertIn(ext, supported_exts)
|
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||||
@@ -169,10 +165,7 @@ class TestParserAvailability(TestCase):
|
|||||||
("text/csv", ".csv"),
|
("text/csv", ".csv"),
|
||||||
]
|
]
|
||||||
|
|
||||||
supported_exts = get_supported_file_extensions()
|
|
||||||
|
|
||||||
for mime_type, ext in supported_mimes_and_exts:
|
for mime_type, ext in supported_mimes_and_exts:
|
||||||
self.assertIn(ext, supported_exts)
|
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||||
@@ -202,10 +195,8 @@ class TestParserAvailability(TestCase):
|
|||||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||||
app = apps.get_app_config("paperless_tika")
|
app = apps.get_app_config("paperless_tika")
|
||||||
app.ready()
|
app.ready()
|
||||||
supported_exts = get_supported_file_extensions()
|
|
||||||
|
|
||||||
for mime_type, ext in supported_mimes_and_exts:
|
for mime_type, ext in supported_mimes_and_exts:
|
||||||
self.assertIn(ext, supported_exts)
|
|
||||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||||
|
|||||||
Reference in New Issue
Block a user