Cleans up and improves parser discovery testing, and simplifies determining whether file extensions and mime types are supported

2025-12-31 13:58:04 -06:00 · 2023-01-04 10:18:31 -08:00
parent a340b9c8a1
commit d19bf59f47
2 changed files with 109 additions and 49 deletions
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -6,12 +6,12 @@ import re
 import shutil
 import subprocess
 import tempfile
 from functools import cache
 from typing import Iterator
 from typing import Match
 from typing import Optional
 from typing import Set
 import magic
 from django.conf import settings
 from django.utils import timezone
 from documents.loggers import LoggingMixin
@@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger("paperless.parsing")
-def is_mime_type_supported(mime_type) -> bool:
+@cache
 def is_mime_type_supported(mime_type: str) -> bool:
    """
    Returns True if the mime type is supported, False otherwise
    """
    return get_parser_class_for_mime_type(mime_type) is not None
-def get_default_file_extension(mime_type) -> str:
+@cache
 def get_default_file_extension(mime_type: str) -> str:
    """
    Returns the default file extension for a mimetype, or
    an empty string if it could not be determined
    """
    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
        supported_mime_types = parser_declaration["mime_types"]
@@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
        return ""
-def is_file_ext_supported(ext) -> bool:
+@cache
 def is_file_ext_supported(ext: str) -> bool:
    """
    Returns True if the file extension is supported, False otherwise
    TODO: Investigate why this really exists, why not use mimetype
    """
    if ext:
        return ext.lower() in get_supported_file_extensions()
    else:
@@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:
        for mime_type in supported_mime_types:
            extensions.update(mimetypes.guess_all_extensions(mime_type))
            # Python's stdlib might be behind, so also add what the parser
            # says is the default extension
            # This makes image/webp supported on Python < 3.11
            extensions.add(supported_mime_types[mime_type])
    return extensions
-def get_parser_class_for_mime_type(mime_type):
+def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
    """
    Returns the best parser (by weight) for the given mimetype or
    None if no parser exists
    """
    options = []
@@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
    return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
 def get_parser_class(path):
    """
    Determine the appropriate parser class based on the file

    The mime type of the file at path is detected with
    magic.from_file(path, mime=True), and whatever
    get_parser_class_for_mime_type returns for that mime type is
    returned unchanged.
    """
    mime_type = magic.from_file(path, mime=True)
    return get_parser_class_for_mime_type(mime_type)
 def run_convert(
    input_file,
    output_file,
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@@ -1,14 +1,8 @@
 import os
 import shutil
 import tempfile
 from tempfile import TemporaryDirectory
 from unittest import mock
 from django.test import override_settings
 from django.test import TestCase
 from documents.parsers import DocumentParser
 from documents.parsers import get_default_file_extension
 from documents.parsers import get_parser_class
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import get_supported_file_extensions
 from documents.parsers import is_file_ext_supported
@@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser
 from paperless_text.parsers import TextDocumentParser
 def fake_magic_from_file(file, mime=False):
    """
    Test stand-in for magic.from_file which only inspects the file name

    With mime=True, returns "application/pdf" when the extension of file
    is exactly ".pdf" and "unknown" for anything else. With mime=False,
    returns a fixed descriptive string.
    """
    if mime:
        if os.path.splitext(file)[1] == ".pdf":
            return "application/pdf"
        else:
            return "unknown"
    else:
        return "A verbose string that describes the contents of the file"
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
 class TestParserDiscovery(TestCase):
    @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test__get_parser_class_1_parser(self, m, *args):
+    def test_get_parser_class_1_parser(self, m, *args):
        """
        GIVEN:
            - Parser declared for a given mimetype
        WHEN:
            - Attempt to get parser for the mimetype
        THEN:
            - Declared parser class is returned
        """
        class DummyParser:
            pass
@@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase):
            ),
        )
-        self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
+        self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
    @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test__get_parser_class_n_parsers(self, m, *args):
+    def test_get_parser_class_n_parsers(self, m, *args):
        """
        GIVEN:
            - Two parsers declared for a given mimetype
            - Second parser has a higher weight
        WHEN:
            - Attempt to get parser for the mimetype
        THEN:
            - Second parser class is returned
        """
        class DummyParser1:
            pass
@@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase):
            ),
        )
-        self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
+        self.assertEqual(
            get_parser_class_for_mime_type("application/pdf"),
            DummyParser2,
        )
    @mock.patch("documents.parsers.document_consumer_declaration.send")
-    def test__get_parser_class_0_parsers(self, m, *args):
+    def test_get_parser_class_0_parsers(self, m, *args):
        """
        GIVEN:
            - No parsers are declared
        WHEN:
            - Attempt to get parser for the mimetype
        THEN:
            - No parser class is returned
        """
        m.return_value = []
        with TemporaryDirectory() as tmpdir:
-            self.assertIsNone(get_parser_class("doc.pdf"))
+            self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_get_parser_class_no_valid_parser(self, m, *args):
        """
        GIVEN:
            - No parser declared for a given mimetype
            - Parser declared for a different mimetype
        WHEN:
            - Attempt to get parser for the given mimetype
        THEN:
            - No parser class is returned
        """
-def fake_get_thumbnail(self, path, mimetype, file_name):
+        class DummyParser:
-    return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
+            pass
        m.return_value = (
            (
                None,
                {
                    "weight": 0,
                    "parser": DummyParser,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
        )
        self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
 class TestParserAvailability(TestCase):
    def test_file_extensions(self):
-        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
+        supported_mimes_and_exts = [
-            self.assertIn(ext, get_supported_file_extensions())
+            ("application/pdf", ".pdf"),
-        self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
+            ("image/png", ".png"),
-        self.assertEqual(get_default_file_extension("image/png"), ".png")
+            ("image/jpeg", ".jpg"),
-        self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
+            ("image/tiff", ".tif"),
-        self.assertEqual(get_default_file_extension("text/plain"), ".txt")
+            ("image/webp", ".webp"),
-        self.assertEqual(get_default_file_extension("text/csv"), ".csv")
+            ("text/plain", ".txt"),
            ("text/csv", ".csv"),
        ]
        supported_exts = get_supported_file_extensions()
        for mime_type, ext in supported_mimes_and_exts:
            self.assertIn(ext, supported_exts)
            self.assertEqual(get_default_file_extension(mime_type), ext)
        # Test that a mime type with no declared parser still returns an extension
        self.assertEqual(get_default_file_extension("application/zip"), ".zip")
        # Test invalid mimetype returns no extension
        self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
        self.assertIsInstance(
@@ -108,7 +156,7 @@ class TestParserAvailability(TestCase):
            get_parser_class_for_mime_type("text/plain")(logging_group=None),
            TextDocumentParser,
        )
-        self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
+        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
        self.assertTrue(is_file_ext_supported(".pdf"))
        self.assertFalse(is_file_ext_supported(".hsdfh"))