From c6d85a8d28c3dcaaed6ad8674db7cd430ba18fac Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Sun, 1 Feb 2026 21:05:19 -0800 Subject: [PATCH] Just curious --- .../management/commands/document_consumer.py | 19 ------- src/documents/parsers.py | 28 ---------- .../tests/test_management_consumer.py | 53 ------------------- src/documents/tests/test_parsers.py | 9 ---- 4 files changed, 109 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 5ba8d30cd..c958b20a3 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -28,7 +28,6 @@ from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource from documents.models import Tag -from documents.parsers import get_supported_file_extensions from documents.tasks import consume_file if TYPE_CHECKING: @@ -216,7 +215,6 @@ class ConsumerFilter(DefaultFilter): def __init__( self, *, - supported_extensions: frozenset[str] | None = None, ignore_patterns: list[str] | None = None, ignore_dirs: list[str] | None = None, ) -> None: @@ -224,16 +222,9 @@ class ConsumerFilter(DefaultFilter): Initialize the consumer filter. Args: - supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}). - If None, uses get_supported_file_extensions(). ignore_patterns: Additional regex patterns to ignore (matched against filename). ignore_dirs: Additional directory names to ignore (merged with defaults). """ - # Get supported extensions - if supported_extensions is None: - supported_extensions = frozenset(get_supported_file_extensions()) - self._supported_extensions = supported_extensions - # Combine default and user patterns all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS) if ignore_patterns: @@ -261,8 +252,6 @@ class ConsumerFilter(DefaultFilter): - Hidden files/directories (starting with .) - Directories in ignore_dirs - Files/directories matching ignore_entity_patterns - - We additionally filter files by extension. """ # Let parent filter handle directory ignoring and pattern matching if not super().__call__(change, path): @@ -274,14 +263,6 @@ class ConsumerFilter(DefaultFilter): if path_obj.is_dir(): return True - # For files, check extension - return self._has_supported_extension(path_obj) - - def _has_supported_extension(self, path: Path) -> bool: - """Check if the file has a supported extension.""" - suffix = path.suffix.lower() - return suffix in self._supported_extensions - def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]: """ diff --git a/src/documents/parsers.py b/src/documents/parsers.py index f6417e285..0ca17d04b 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -85,34 +85,6 @@ def get_default_file_extension(mime_type: str) -> str: return "" -@lru_cache(maxsize=8) -def is_file_ext_supported(ext: str) -> bool: - """ - Returns True if the file extension is supported, False otherwise - TODO: Investigate why this really exists, why not use mimetype - """ - if ext: - return ext.lower() in get_supported_file_extensions() - else: - return False - - -def get_supported_file_extensions() -> set[str]: - extensions = set() - for response in document_consumer_declaration.send(None): - parser_declaration = response[1] - supported_mime_types = parser_declaration["mime_types"] - - for mime_type in supported_mime_types: - extensions.update(mimetypes.guess_all_extensions(mime_type)) - # Python's stdlib might be behind, so also add what the parser - # says is the default extension - # This makes image/webp supported on Python < 3.11 - extensions.add(supported_mime_types[mime_type]) - - return extensions - - def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None: """ Returns the best parser (by weight) for the given mimetype or diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index 810ae63e2..86899ccf2 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -90,7 +90,6 @@ def sample_pdf(tmp_path: Path) -> Path: def consumer_filter() -> ConsumerFilter: """Create a ConsumerFilter for testing.""" return ConsumerFilter( - supported_extensions=frozenset({".pdf", ".png", ".jpg"}), ignore_patterns=[r"^custom_ignore"], ) @@ -105,15 +104,6 @@ def mock_consume_file_delay(mocker: MockerFixture) -> MagicMock: return mock_task -@pytest.fixture -def mock_supported_extensions(mocker: MockerFixture) -> MagicMock: - """Mock get_supported_file_extensions to return only .pdf.""" - return mocker.patch( - "documents.management.commands.document_consumer.get_supported_file_extensions", - return_value={".pdf"}, - ) - - def wait_for_mock_call( mock_obj: MagicMock, timeout_s: float = 5.0, @@ -395,7 +385,6 @@ class TestConsumerFilter: def test_custom_ignore_dirs(self, tmp_path: Path) -> None: """Test filter respects custom ignore_dirs.""" filter_obj = ConsumerFilter( - supported_extensions=frozenset({".pdf"}), ignore_dirs=["custom_ignored_dir"], ) @@ -415,25 +404,6 @@ class TestConsumerFilter: assert filter_obj(Change.added, str(stfolder)) is False -class TestConsumerFilterDefaults: - """Tests for ConsumerFilter with default settings.""" - - def test_filter_with_mocked_extensions( - self, - tmp_path: Path, - mocker: MockerFixture, - ) -> None: - """Test filter works when using mocked extensions from parser.""" - mocker.patch( - "documents.management.commands.document_consumer.get_supported_file_extensions", - return_value={".pdf", ".png"}, - ) - filter_obj = ConsumerFilter() - test_file = tmp_path / "document.pdf" - test_file.touch() - assert filter_obj(Change.added, str(test_file)) is True - - class TestConsumeFile: """Tests for the _consume_file function.""" @@ -605,7 +575,6 @@ class TestCommandValidation: cmd.handle(directory=str(sample_pdf), oneshot=True, testing=False) -@pytest.mark.usefixtures("mock_supported_extensions") class TestCommandOneshot: """Tests for oneshot mode.""" @@ -652,25 +621,6 @@ class TestCommandOneshot: mock_consume_file_delay.delay.assert_called_once() - def test_ignores_unsupported_extensions( - self, - consumption_dir: Path, - scratch_dir: Path, - mock_consume_file_delay: MagicMock, - settings: SettingsWrapper, - ) -> None: - """Test oneshot mode ignores unsupported file extensions.""" - target = consumption_dir / "document.xyz" - target.write_bytes(b"content") - - settings.SCRATCH_DIR = scratch_dir - settings.CONSUMER_IGNORE_PATTERNS = [] - - cmd = Command() - cmd.handle(directory=str(consumption_dir), oneshot=True, testing=False) - - mock_consume_file_delay.delay.assert_not_called() - class ConsumerThread(Thread): """Thread wrapper for running the consumer command with proper cleanup.""" @@ -739,7 +689,6 @@ class ConsumerThread(Thread): def start_consumer( consumption_dir: Path, scratch_dir: Path, - mock_supported_extensions: MagicMock, ) -> Generator[Callable[..., ConsumerThread], None, None]: """Start a consumer thread and ensure cleanup.""" threads: list[ConsumerThread] = [] @@ -875,7 +824,6 @@ class TestCommandWatch: assert call_args.original_file.name == "valid.pdf" @pytest.mark.django_db - @pytest.mark.usefixtures("mock_supported_extensions") def test_stop_flag_stops_consumer( self, consumption_dir: Path, @@ -1017,7 +965,6 @@ class TestCommandWatchEdgeCases: mock_consume_file_delay.delay.assert_not_called() - @pytest.mark.usefixtures("mock_supported_extensions") def test_handles_task_exception( self, consumption_dir: Path, diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index fee7234e8..ccacb15be 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -7,7 +7,6 @@ from django.test import override_settings from documents.parsers import get_default_file_extension from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported from paperless_tesseract.parsers import RasterisedDocumentParser from paperless_text.parsers import TextDocumentParser @@ -145,10 +144,7 @@ class TestParserAvailability(TestCase): ("image/webp", ".webp"), ] - supported_exts = get_supported_file_extensions() - for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( get_parser_class_for_mime_type(mime_type)(logging_group=None), @@ -169,10 +165,7 @@ class TestParserAvailability(TestCase): ("text/csv", ".csv"), ] - supported_exts = get_supported_file_extensions() - for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( get_parser_class_for_mime_type(mime_type)(logging_group=None), @@ -202,10 +195,8 @@ class TestParserAvailability(TestCase): with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]): app = apps.get_app_config("paperless_tika") app.ready() - supported_exts = get_supported_file_extensions() for mime_type, ext in supported_mimes_and_exts: - self.assertIn(ext, supported_exts) self.assertEqual(get_default_file_extension(mime_type), ext) self.assertIsInstance( get_parser_class_for_mime_type(mime_type)(logging_group=None),