Just curious

This commit is contained in:
shamoon
2026-02-01 21:05:19 -08:00
parent a9c0b06e28
commit c6d85a8d28
4 changed files with 0 additions and 109 deletions

View File

@@ -28,7 +28,6 @@ from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Tag
from documents.parsers import get_supported_file_extensions
from documents.tasks import consume_file
if TYPE_CHECKING:
@@ -216,7 +215,6 @@ class ConsumerFilter(DefaultFilter):
def __init__(
self,
*,
supported_extensions: frozenset[str] | None = None,
ignore_patterns: list[str] | None = None,
ignore_dirs: list[str] | None = None,
) -> None:
@@ -224,16 +222,9 @@ class ConsumerFilter(DefaultFilter):
Initialize the consumer filter.
Args:
supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
If None, uses get_supported_file_extensions().
ignore_patterns: Additional regex patterns to ignore (matched against filename).
ignore_dirs: Additional directory names to ignore (merged with defaults).
"""
# Get supported extensions
if supported_extensions is None:
supported_extensions = frozenset(get_supported_file_extensions())
self._supported_extensions = supported_extensions
# Combine default and user patterns
all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS)
if ignore_patterns:
@@ -261,8 +252,6 @@ class ConsumerFilter(DefaultFilter):
- Hidden files/directories (starting with .)
- Directories in ignore_dirs
- Files/directories matching ignore_entity_patterns
We additionally filter files by extension.
"""
# Let parent filter handle directory ignoring and pattern matching
if not super().__call__(change, path):
@@ -274,14 +263,6 @@ class ConsumerFilter(DefaultFilter):
if path_obj.is_dir():
return True
# For files, check extension
return self._has_supported_extension(path_obj)
def _has_supported_extension(self, path: Path) -> bool:
"""Check if the file has a supported extension."""
suffix = path.suffix.lower()
return suffix in self._supported_extensions
def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
"""

View File

@@ -85,34 +85,6 @@ def get_default_file_extension(mime_type: str) -> str:
return ""
@lru_cache(maxsize=8)
def is_file_ext_supported(ext: str) -> bool:
    """
    Tell whether ``ext`` is one of the supported file extensions.

    The value is lower-cased and looked up in the set returned by
    get_supported_file_extensions(). A falsy extension (e.g. "") is never
    supported.

    Results are memoised per argument by lru_cache, so a cached answer is
    returned as-is even if the supported set has changed since.

    TODO: Investigate why this really exists, why not use mimetype
    """
    # Guard first: nothing to look up for an empty extension
    if not ext:
        return False
    return ext.lower() in get_supported_file_extensions()
def get_supported_file_extensions() -> set[str]:
    """
    Collect every file extension the responding parsers can handle.

    Sends document_consumer_declaration and, for each response, reads the
    declaration's "mime_types" mapping (mime type -> default extension).
    For every mime type, both the extensions known to the stdlib
    ``mimetypes`` module and the declared default extension are included.
    """
    found: set[str] = set()
    # Each reply is indexable; the declaration sits at position 1
    declarations = (reply[1] for reply in document_consumer_declaration.send(None))
    for declaration in declarations:
        mime_map = declaration["mime_types"]
        for mime in mime_map:
            found.update(mimetypes.guess_all_extensions(mime))
            # Python's stdlib might be behind, so also add what the parser
            # says is the default extension
            # This makes image/webp supported on Python < 3.11
            found.add(mime_map[mime])
    return found
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
"""
Returns the best parser (by weight) for the given mimetype or

View File

@@ -90,7 +90,6 @@ def sample_pdf(tmp_path: Path) -> Path:
def consumer_filter() -> ConsumerFilter:
"""Create a ConsumerFilter for testing."""
return ConsumerFilter(
supported_extensions=frozenset({".pdf", ".png", ".jpg"}),
ignore_patterns=[r"^custom_ignore"],
)
@@ -105,15 +104,6 @@ def mock_consume_file_delay(mocker: MockerFixture) -> MagicMock:
return mock_task
@pytest.fixture
def mock_supported_extensions(mocker: MockerFixture) -> MagicMock:
    """Patch get_supported_file_extensions in the consumer command to return only .pdf."""
    patch_target = (
        "documents.management.commands.document_consumer.get_supported_file_extensions"
    )
    return mocker.patch(patch_target, return_value={".pdf"})
def wait_for_mock_call(
mock_obj: MagicMock,
timeout_s: float = 5.0,
@@ -395,7 +385,6 @@ class TestConsumerFilter:
def test_custom_ignore_dirs(self, tmp_path: Path) -> None:
"""Test filter respects custom ignore_dirs."""
filter_obj = ConsumerFilter(
supported_extensions=frozenset({".pdf"}),
ignore_dirs=["custom_ignored_dir"],
)
@@ -415,25 +404,6 @@ class TestConsumerFilter:
assert filter_obj(Change.added, str(stfolder)) is False
class TestConsumerFilterDefaults:
    """Checks for a ConsumerFilter constructed without explicit arguments."""

    def test_filter_with_mocked_extensions(
        self,
        tmp_path: Path,
        mocker: MockerFixture,
    ) -> None:
        """A .pdf file is accepted when the patched lookup reports .pdf and .png."""
        patch_target = (
            "documents.management.commands.document_consumer.get_supported_file_extensions"
        )
        mocker.patch(patch_target, return_value={".pdf", ".png"})

        # No arguments: the filter is built with its defaults
        default_filter = ConsumerFilter()

        pdf_path = tmp_path / "document.pdf"
        pdf_path.touch()

        accepted = default_filter(Change.added, str(pdf_path))
        assert accepted is True
class TestConsumeFile:
"""Tests for the _consume_file function."""
@@ -605,7 +575,6 @@ class TestCommandValidation:
cmd.handle(directory=str(sample_pdf), oneshot=True, testing=False)
@pytest.mark.usefixtures("mock_supported_extensions")
class TestCommandOneshot:
"""Tests for oneshot mode."""
@@ -652,25 +621,6 @@ class TestCommandOneshot:
mock_consume_file_delay.delay.assert_called_once()
def test_ignores_unsupported_extensions(
    self,
    consumption_dir: Path,
    scratch_dir: Path,
    mock_consume_file_delay: MagicMock,
    settings: SettingsWrapper,
) -> None:
    """A file with the unsupported extension .xyz is not queued in oneshot mode."""
    # Drop a file with an extension no parser is expected to claim
    unsupported = consumption_dir / "document.xyz"
    unsupported.write_bytes(b"content")

    settings.SCRATCH_DIR = scratch_dir
    settings.CONSUMER_IGNORE_PATTERNS = []

    Command().handle(directory=str(consumption_dir), oneshot=True, testing=False)

    mock_consume_file_delay.delay.assert_not_called()
class ConsumerThread(Thread):
"""Thread wrapper for running the consumer command with proper cleanup."""
@@ -739,7 +689,6 @@ class ConsumerThread(Thread):
def start_consumer(
consumption_dir: Path,
scratch_dir: Path,
mock_supported_extensions: MagicMock,
) -> Generator[Callable[..., ConsumerThread], None, None]:
"""Start a consumer thread and ensure cleanup."""
threads: list[ConsumerThread] = []
@@ -875,7 +824,6 @@ class TestCommandWatch:
assert call_args.original_file.name == "valid.pdf"
@pytest.mark.django_db
@pytest.mark.usefixtures("mock_supported_extensions")
def test_stop_flag_stops_consumer(
self,
consumption_dir: Path,
@@ -1017,7 +965,6 @@ class TestCommandWatchEdgeCases:
mock_consume_file_delay.delay.assert_not_called()
@pytest.mark.usefixtures("mock_supported_extensions")
def test_handles_task_exception(
self,
consumption_dir: Path,

View File

@@ -7,7 +7,6 @@ from django.test import override_settings
from documents.parsers import get_default_file_extension
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
@@ -145,10 +144,7 @@ class TestParserAvailability(TestCase):
("image/webp", ".webp"),
]
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
self.assertIsInstance(
get_parser_class_for_mime_type(mime_type)(logging_group=None),
@@ -169,10 +165,7 @@ class TestParserAvailability(TestCase):
("text/csv", ".csv"),
]
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
self.assertIsInstance(
get_parser_class_for_mime_type(mime_type)(logging_group=None),
@@ -202,10 +195,8 @@ class TestParserAvailability(TestCase):
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
app = apps.get_app_config("paperless_tika")
app.ready()
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
self.assertIsInstance(
get_parser_class_for_mime_type(mime_type)(logging_group=None),