From aa8991b4ca22b030f250f2ca2fdb913979bd06ba Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 1 Feb 2026 21:05:19 -0800 Subject: [PATCH] Change: remove file extension filtering from document consumer --- .../management/commands/document_consumer.py | 23 +------- src/documents/parsers.py | 28 ---------- .../tests/test_management_consumer.py | 55 ------------------- src/documents/tests/test_parsers.py | 15 ----- 4 files changed, 2 insertions(+), 119 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 5ba8d30cd..bf80e3108 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -28,7 +28,6 @@ from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource from documents.models import Tag -from documents.parsers import get_supported_file_extensions from documents.tasks import consume_file if TYPE_CHECKING: @@ -186,9 +185,6 @@ class ConsumerFilter(DefaultFilter): Extends DefaultFilter leveraging its built-in filtering: - `ignore_dirs`: Directory names to ignore (and all their contents) - `ignore_entity_patterns`: Regex patterns matched against filename/dirname only - - We add custom logic for file extension filtering (only accept supported - document types), which the library doesn't provide. """ # Regex patterns for files to always ignore (matched against filename only) @@ -216,7 +212,6 @@ class ConsumerFilter(DefaultFilter): def __init__( self, *, - supported_extensions: frozenset[str] | None = None, ignore_patterns: list[str] | None = None, ignore_dirs: list[str] | None = None, ) -> None: @@ -224,16 +219,9 @@ class ConsumerFilter(DefaultFilter): Initialize the consumer filter. Args: - supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}). - If None, uses get_supported_file_extensions(). ignore_patterns: Additional regex patterns to ignore (matched against filename). ignore_dirs: Additional directory names to ignore (merged with defaults). """ - # Get supported extensions - if supported_extensions is None: - supported_extensions = frozenset(get_supported_file_extensions()) - self._supported_extensions = supported_extensions - # Combine default and user patterns all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS) if ignore_patterns: @@ -261,8 +249,6 @@ class ConsumerFilter(DefaultFilter): - Hidden files/directories (starting with .) - Directories in ignore_dirs - Files/directories matching ignore_entity_patterns - - We additionally filter files by extension. """ # Let parent filter handle directory ignoring and pattern matching if not super().__call__(change, path): @@ -274,13 +260,8 @@ class ConsumerFilter(DefaultFilter): if path_obj.is_dir(): return True - # For files, check extension - return self._has_supported_extension(path_obj) - - def _has_supported_extension(self, path: Path) -> bool: - """Check if the file has a supported extension.""" - suffix = path.suffix.lower() - return suffix in self._supported_extensions + # No additional filtering beyond the defaults; accept file + return True def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]: diff --git a/src/documents/parsers.py b/src/documents/parsers.py index f6417e285..0ca17d04b 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -85,34 +85,6 @@ def get_default_file_extension(mime_type: str) -> str: return "" -@lru_cache(maxsize=8) -def is_file_ext_supported(ext: str) -> bool: - """ - Returns True if the file extension is supported, False otherwise - TODO: Investigate why this really exists, why not use mimetype - """ - if ext: - return ext.lower() in get_supported_file_extensions() - else: - return False - - -def get_supported_file_extensions() -> set[str]: - extensions = set() - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - for mime_type in supported_mime_types: - extensions.update(mimetypes.guess_all_extensions(mime_type)) - # Python's stdlib might be behind, so also add what the parser - # says is the default extension - # This makes image/webp supported on Python < 3.11 - extensions.add(supported_mime_types[mime_type]) - - return extensions - - def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None: """ Returns the best parser (by weight) for the given mimetype or diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index 810ae63e2..6662800c5 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -90,7 +90,6 @@ def sample_pdf(tmp_path: Path) -> Path: def consumer_filter() -> ConsumerFilter: """Create a ConsumerFilter for testing.""" return ConsumerFilter( - supported_extensions=frozenset({".pdf", ".png", ".jpg"}), ignore_patterns=[r"^custom_ignore"], ) @@ -105,15 +104,6 @@ def mock_consume_file_delay(mocker: MockerFixture) -> MagicMock: return mock_task -@pytest.fixture -def mock_supported_extensions(mocker: MockerFixture) -> MagicMock: - """Mock get_supported_file_extensions to return only .pdf.""" - return mocker.patch( - "documents.management.commands.document_consumer.get_supported_file_extensions", - return_value={".pdf"}, - ) - - def wait_for_mock_call( mock_obj: MagicMock, timeout_s: float = 5.0, @@ -336,8 +326,6 @@ class TestConsumerFilter: pytest.param("image.png", True, id="supported_png"), pytest.param("photo.jpg", True, id="supported_jpg"), pytest.param("document.PDF", True, id="case_insensitive"), - pytest.param("document.xyz", False, id="unsupported_ext"), - pytest.param("document", False, id="no_extension"), pytest.param(".DS_Store", False, id="ds_store"), pytest.param(".DS_STORE", False, id="ds_store_upper"), pytest.param("._document.pdf", False, id="macos_resource_fork"), @@ -395,7 +383,6 @@ class TestConsumerFilter: def test_custom_ignore_dirs(self, tmp_path: Path) -> None: """Test filter respects custom ignore_dirs.""" filter_obj = ConsumerFilter( - supported_extensions=frozenset({".pdf"}), ignore_dirs=["custom_ignored_dir"], ) @@ -415,25 +402,6 @@ class TestConsumerFilter: assert filter_obj(Change.added, str(stfolder)) is False -class TestConsumerFilterDefaults: - """Tests for ConsumerFilter with default settings.""" - - def test_filter_with_mocked_extensions( - self, - tmp_path: Path, - mocker: MockerFixture, - ) -> None: - """Test filter works when using mocked extensions from parser.""" - mocker.patch( - "documents.management.commands.document_consumer.get_supported_file_extensions", - return_value={".pdf", ".png"}, - ) - filter_obj = ConsumerFilter() - test_file = tmp_path / "document.pdf" - test_file.touch() - assert filter_obj(Change.added, str(test_file)) is True - - class TestConsumeFile: """Tests for the _consume_file function.""" @@ -605,7 +573,6 @@ class TestCommandValidation: cmd.handle(directory=str(sample_pdf), oneshot=True, testing=False) -@pytest.mark.usefixtures("mock_supported_extensions") class TestCommandOneshot: """Tests for oneshot mode.""" @@ -652,25 +619,6 @@ class TestCommandOneshot: mock_consume_file_delay.delay.assert_called_once() - def test_ignores_unsupported_extensions( - self, - consumption_dir: Path, - scratch_dir: Path, - mock_consume_file_delay: MagicMock, - settings: SettingsWrapper, - ) -> None: - """Test oneshot mode ignores unsupported file extensions.""" - target = consumption_dir / "document.xyz" - target.write_bytes(b"content") - - settings.SCRATCH_DIR = scratch_dir - settings.CONSUMER_IGNORE_PATTERNS = [] - - cmd = Command() - cmd.handle(directory=str(consumption_dir), oneshot=True, testing=False) - - mock_consume_file_delay.delay.assert_not_called() - class ConsumerThread(Thread): """Thread wrapper for running the consumer command with proper cleanup.""" @@ -739,7 +687,6 @@ class ConsumerThread(Thread): def start_consumer( consumption_dir: Path, scratch_dir: Path, - mock_supported_extensions: MagicMock, ) -> Generator[Callable[..., ConsumerThread], None, None]: """Start a consumer thread and ensure cleanup.""" threads: list[ConsumerThread] = [] @@ -875,7 +822,6 @@ class TestCommandWatch: assert call_args.original_file.name == "valid.pdf" @pytest.mark.django_db - @pytest.mark.usefixtures("mock_supported_extensions") def test_stop_flag_stops_consumer( self, consumption_dir: Path, @@ -1017,7 +963,6 @@ class TestCommandWatchEdgeCases: mock_consume_file_delay.delay.assert_not_called() - @pytest.mark.usefixtures("mock_supported_extensions") def test_handles_task_exception( self, consumption_dir: Path, diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index fee7234e8..199a4b22d 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -7,8 +7,6 @@ from django.test import override_settings from documents.parsers import get_default_file_extension from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import get_supported_file_extensions -from documents.parsers import is_file_ext_supported from paperless_tesseract.parsers import RasterisedDocumentParser from paperless_text.parsers import TextDocumentParser from paperless_tika.parsers import TikaDocumentParser @@ -145,10 +143,7 @@ class TestParserAvailability(TestCase): ("image/webp", ".webp"), ] - supported_exts = get_supported_file_extensions() - for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( get_parser_class_for_mime_type(mime_type)(logging_group=None), @@ -169,10 +164,7 @@ class TestParserAvailability(TestCase): ("text/csv", ".csv"), ] - supported_exts = get_supported_file_extensions() - for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( get_parser_class_for_mime_type(mime_type)(logging_group=None), @@ -202,10 +194,8 @@ class TestParserAvailability(TestCase): with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]): app = apps.get_app_config("paperless_tika") app.ready() - supported_exts = get_supported_file_extensions() for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( get_parser_class_for_mime_type(mime_type)(logging_group=None), @@ -221,8 +211,3 @@ class TestParserAvailability(TestCase): # Test invalid mimetype returns no extension self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "") - - def test_file_extension_support(self): - self.assertTrue(is_file_ext_supported(".pdf")) - self.assertFalse(is_file_ext_supported(".hsdfh")) - self.assertFalse(is_file_ext_supported(""))