Cleans up and improves parser discovery testing, simplifies the determination of supported or not supported extensions and mime types

2025-12-16 01:31:09 -06:00 · 2023-01-04 10:18:31 -08:00
parent a340b9c8a1
commit d19bf59f47
2 changed files with 109 additions and 49 deletions
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -6,12 +6,12 @@ import re
 import shutil
 import subprocess
 import tempfile
+from functools import cache
 from typing import Iterator
 from typing import Match
 from typing import Optional
 from typing import Set

-import magic
 from django.conf import settings
 from django.utils import timezone
 from documents.loggers import LoggingMixin
@@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
 logger = logging.getLogger("paperless.parsing")


-def is_mime_type_supported(mime_type) -> bool:
+@cache
+def is_mime_type_supported(mime_type: str) -> bool:
+    """
+    Returns True if the mime type is supported, False otherwise
+    """
    return get_parser_class_for_mime_type(mime_type) is not None


-def get_default_file_extension(mime_type) -> str:
+@cache
+def get_default_file_extension(mime_type: str) -> str:
+    """
+    Returns the default file extension for a mimetype, or
+    an empty string if it could not be determined
+    """
    for response in document_consumer_declaration.send(None):
        parser_declaration = response[1]
        supported_mime_types = parser_declaration["mime_types"]
@@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
        return ""


-def is_file_ext_supported(ext) -> bool:
+@cache
+def is_file_ext_supported(ext: str) -> bool:
+    """
+    Returns True if the file extension is supported, False otherwise
+    TODO: Investigate why this really exists, why not use mimetype
+    """
    if ext:
        return ext.lower() in get_supported_file_extensions()
    else:
@@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:

        for mime_type in supported_mime_types:
            extensions.update(mimetypes.guess_all_extensions(mime_type))
+            # Python's stdlib might be behind, so also add what the parser
+            # says is the default extension
+            # This makes image/webp supported on Python < 3.11
+            extensions.add(supported_mime_types[mime_type])

    return extensions


-def get_parser_class_for_mime_type(mime_type):
+def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
+    """
+    Returns the best parser (by weight) for the given mimetype or
+    None if no parser exists
+    """

    options = []

@@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
    return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]


-def get_parser_class(path):
-    """
-    Determine the appropriate parser class based on the file
-    """
-
-    mime_type = magic.from_file(path, mime=True)
-
-    return get_parser_class_for_mime_type(mime_type)
-
-
 def run_convert(
    input_file,
    output_file,