mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-02-01 23:19:00 -06:00
Just curious
This commit is contained in:
@@ -28,7 +28,6 @@ from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.data_models import DocumentSource
|
||||
from documents.models import Tag
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.tasks import consume_file
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -216,7 +215,6 @@ class ConsumerFilter(DefaultFilter):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
supported_extensions: frozenset[str] | None = None,
|
||||
ignore_patterns: list[str] | None = None,
|
||||
ignore_dirs: list[str] | None = None,
|
||||
) -> None:
|
||||
@@ -224,16 +222,9 @@ class ConsumerFilter(DefaultFilter):
|
||||
Initialize the consumer filter.
|
||||
|
||||
Args:
|
||||
supported_extensions: Set of file extensions to accept (e.g., {".pdf", ".png"}).
|
||||
If None, uses get_supported_file_extensions().
|
||||
ignore_patterns: Additional regex patterns to ignore (matched against filename).
|
||||
ignore_dirs: Additional directory names to ignore (merged with defaults).
|
||||
"""
|
||||
# Get supported extensions
|
||||
if supported_extensions is None:
|
||||
supported_extensions = frozenset(get_supported_file_extensions())
|
||||
self._supported_extensions = supported_extensions
|
||||
|
||||
# Combine default and user patterns
|
||||
all_patterns: list[str] = list(self.DEFAULT_IGNORE_PATTERNS)
|
||||
if ignore_patterns:
|
||||
@@ -261,8 +252,6 @@ class ConsumerFilter(DefaultFilter):
|
||||
- Hidden files/directories (starting with .)
|
||||
- Directories in ignore_dirs
|
||||
- Files/directories matching ignore_entity_patterns
|
||||
|
||||
We additionally filter files by extension.
|
||||
"""
|
||||
# Let parent filter handle directory ignoring and pattern matching
|
||||
if not super().__call__(change, path):
|
||||
@@ -274,14 +263,6 @@ class ConsumerFilter(DefaultFilter):
|
||||
if path_obj.is_dir():
|
||||
return True
|
||||
|
||||
# For files, check extension
|
||||
return self._has_supported_extension(path_obj)
|
||||
|
||||
def _has_supported_extension(self, path: Path) -> bool:
|
||||
"""Check if the file has a supported extension."""
|
||||
suffix = path.suffix.lower()
|
||||
return suffix in self._supported_extensions
|
||||
|
||||
|
||||
def _tags_from_path(filepath: Path, consumption_dir: Path) -> list[int]:
|
||||
"""
|
||||
|
||||
@@ -85,34 +85,6 @@ def get_default_file_extension(mime_type: str) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
def is_file_ext_supported(ext: str) -> bool:
    """
    Tell whether files with the extension ``ext`` are supported.

    A falsy extension (e.g. the empty string) is never supported. Otherwise
    the lower-cased extension is looked up in the set returned by
    get_supported_file_extensions().

    TODO: Investigate why this really exists, why not use mimetype
    """
    # NOTE(review): lru_cache memoizes per exact ``ext`` argument, so a result
    # computed once is reused until the cache is cleared — confirm that is
    # acceptable if the supported set can change at runtime.
    if not ext:
        return False
    return ext.lower() in get_supported_file_extensions()
|
||||
|
||||
|
||||
def get_supported_file_extensions() -> set[str]:
    """
    Collect the file extensions declared by all responding parsers.

    Sends ``document_consumer_declaration`` and, for every mime type listed
    in each response's ``"mime_types"`` mapping, gathers the extensions the
    standard library associates with it plus the extension the mapping
    itself gives for that mime type.

    Returns:
        The union of all gathered extensions.
    """
    found: set[str] = set()

    for _receiver, declaration in document_consumer_declaration.send(None):
        for mime, declared_ext in declaration["mime_types"].items():
            found.update(mimetypes.guess_all_extensions(mime))
            # Python's stdlib might be behind, so also add what the parser
            # says is the default extension
            # This makes image/webp supported on Python < 3.11
            found.add(declared_ext)

    return found
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type: str) -> type[DocumentParser] | None:
|
||||
"""
|
||||
Returns the best parser (by weight) for the given mimetype or
|
||||
|
||||
@@ -90,7 +90,6 @@ def sample_pdf(tmp_path: Path) -> Path:
|
||||
def consumer_filter() -> ConsumerFilter:
|
||||
"""Create a ConsumerFilter for testing."""
|
||||
return ConsumerFilter(
|
||||
supported_extensions=frozenset({".pdf", ".png", ".jpg"}),
|
||||
ignore_patterns=[r"^custom_ignore"],
|
||||
)
|
||||
|
||||
@@ -105,15 +104,6 @@ def mock_consume_file_delay(mocker: MockerFixture) -> MagicMock:
|
||||
return mock_task
|
||||
|
||||
|
||||
@pytest.fixture
def mock_supported_extensions(mocker: MockerFixture) -> MagicMock:
    """Patch get_supported_file_extensions in the consumer command to return only .pdf."""
    patch_target = (
        "documents.management.commands.document_consumer.get_supported_file_extensions"
    )
    pdf_only = {".pdf"}
    return mocker.patch(patch_target, return_value=pdf_only)
|
||||
|
||||
|
||||
def wait_for_mock_call(
|
||||
mock_obj: MagicMock,
|
||||
timeout_s: float = 5.0,
|
||||
@@ -395,7 +385,6 @@ class TestConsumerFilter:
|
||||
def test_custom_ignore_dirs(self, tmp_path: Path) -> None:
|
||||
"""Test filter respects custom ignore_dirs."""
|
||||
filter_obj = ConsumerFilter(
|
||||
supported_extensions=frozenset({".pdf"}),
|
||||
ignore_dirs=["custom_ignored_dir"],
|
||||
)
|
||||
|
||||
@@ -415,25 +404,6 @@ class TestConsumerFilter:
|
||||
assert filter_obj(Change.added, str(stfolder)) is False
|
||||
|
||||
|
||||
class TestConsumerFilterDefaults:
    """Exercise ConsumerFilter when it is constructed without arguments."""

    def test_filter_with_mocked_extensions(
        self,
        tmp_path: Path,
        mocker: MockerFixture,
    ) -> None:
        """A .pdf file is accepted when the patched lookup reports .pdf and .png."""
        # Patch the lookup before building the filter, since the filter is
        # created with no explicit supported_extensions.
        mocker.patch(
            "documents.management.commands.document_consumer.get_supported_file_extensions",
            return_value={".pdf", ".png"},
        )
        default_filter = ConsumerFilter()

        pdf_path = tmp_path / "document.pdf"
        pdf_path.touch()

        accepted = default_filter(Change.added, str(pdf_path))
        assert accepted is True
|
||||
|
||||
|
||||
class TestConsumeFile:
|
||||
"""Tests for the _consume_file function."""
|
||||
|
||||
@@ -605,7 +575,6 @@ class TestCommandValidation:
|
||||
cmd.handle(directory=str(sample_pdf), oneshot=True, testing=False)
|
||||
|
||||
|
||||
@pytest.mark.usefixtures("mock_supported_extensions")
|
||||
class TestCommandOneshot:
|
||||
"""Tests for oneshot mode."""
|
||||
|
||||
@@ -652,25 +621,6 @@ class TestCommandOneshot:
|
||||
|
||||
mock_consume_file_delay.delay.assert_called_once()
|
||||
|
||||
def test_ignores_unsupported_extensions(
    self,
    consumption_dir: Path,
    scratch_dir: Path,
    mock_consume_file_delay: MagicMock,
    settings: SettingsWrapper,
) -> None:
    """Oneshot mode must not queue a file whose extension is unsupported."""
    # Place a single file with an extension (.xyz) in the consumption directory.
    unsupported_file = consumption_dir / "document.xyz"
    unsupported_file.write_bytes(b"content")

    # Clear the ignore patterns so they cannot be what skips the file.
    settings.SCRATCH_DIR = scratch_dir
    settings.CONSUMER_IGNORE_PATTERNS = []

    Command().handle(directory=str(consumption_dir), oneshot=True, testing=False)

    mock_consume_file_delay.delay.assert_not_called()
|
||||
|
||||
|
||||
class ConsumerThread(Thread):
|
||||
"""Thread wrapper for running the consumer command with proper cleanup."""
|
||||
@@ -739,7 +689,6 @@ class ConsumerThread(Thread):
|
||||
def start_consumer(
|
||||
consumption_dir: Path,
|
||||
scratch_dir: Path,
|
||||
mock_supported_extensions: MagicMock,
|
||||
) -> Generator[Callable[..., ConsumerThread], None, None]:
|
||||
"""Start a consumer thread and ensure cleanup."""
|
||||
threads: list[ConsumerThread] = []
|
||||
@@ -875,7 +824,6 @@ class TestCommandWatch:
|
||||
assert call_args.original_file.name == "valid.pdf"
|
||||
|
||||
@pytest.mark.django_db
|
||||
@pytest.mark.usefixtures("mock_supported_extensions")
|
||||
def test_stop_flag_stops_consumer(
|
||||
self,
|
||||
consumption_dir: Path,
|
||||
@@ -1017,7 +965,6 @@ class TestCommandWatchEdgeCases:
|
||||
|
||||
mock_consume_file_delay.delay.assert_not_called()
|
||||
|
||||
@pytest.mark.usefixtures("mock_supported_extensions")
|
||||
def test_handles_task_exception(
|
||||
self,
|
||||
consumption_dir: Path,
|
||||
|
||||
@@ -7,7 +7,6 @@ from django.test import override_settings
|
||||
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
@@ -145,10 +144,7 @@ class TestParserAvailability(TestCase):
|
||||
("image/webp", ".webp"),
|
||||
]
|
||||
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
@@ -169,10 +165,7 @@ class TestParserAvailability(TestCase):
|
||||
("text/csv", ".csv"),
|
||||
]
|
||||
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
@@ -202,10 +195,8 @@ class TestParserAvailability(TestCase):
|
||||
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
|
||||
app = apps.get_app_config("paperless_tika")
|
||||
app.ready()
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
self.assertIsInstance(
|
||||
get_parser_class_for_mime_type(mime_type)(logging_group=None),
|
||||
|
||||
Reference in New Issue
Block a user