diff --git a/src/documents/parsers.py b/src/documents/parsers.py index e2309b366..240d60e7f 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -6,12 +6,12 @@ import re import shutil import subprocess import tempfile +from functools import cache from typing import Iterator from typing import Match from typing import Optional from typing import Set -import magic from django.conf import settings from django.utils import timezone from documents.loggers import LoggingMixin @@ -45,11 +45,20 @@ DATE_REGEX = re.compile( logger = logging.getLogger("paperless.parsing") -def is_mime_type_supported(mime_type) -> bool: +@cache +def is_mime_type_supported(mime_type: str) -> bool: + """ + Returns True if the mime type is supported, False otherwise + """ return get_parser_class_for_mime_type(mime_type) is not None -def get_default_file_extension(mime_type) -> str: +@cache +def get_default_file_extension(mime_type: str) -> str: + """ + Returns the default file extension for a mimetype, or + an empty string if it could not be determined + """ for response in document_consumer_declaration.send(None): parser_declaration = response[1] supported_mime_types = parser_declaration["mime_types"] @@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str: return "" -def is_file_ext_supported(ext) -> bool: +@cache +def is_file_ext_supported(ext: str) -> bool: + """ + Returns True if the file extension is supported, False otherwise + TODO: Investigate why this really exists, why not use mimetype + """ if ext: return ext.lower() in get_supported_file_extensions() else: @@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]: for mime_type in supported_mime_types: extensions.update(mimetypes.guess_all_extensions(mime_type)) + # Python's stdlib might be behind, so also add what the parser + # says is the default extension + # This makes image/webp supported on Python < 3.11 + extensions.add(supported_mime_types[mime_type]) return extensions -def 
get_parser_class_for_mime_type(mime_type): +def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]: + """ + Returns the best parser (by weight) for the given mimetype or + None if no parser exists + """ options = [] @@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type): return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"] -def get_parser_class(path): - """ - Determine the appropriate parser class based on the file - """ - - mime_type = magic.from_file(path, mime=True) - - return get_parser_class_for_mime_type(mime_type) - - def run_convert( input_file, output_file, diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index 1942fe0dd..8ba2c70ee 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -1,14 +1,8 @@ -import os -import shutil -import tempfile from tempfile import TemporaryDirectory from unittest import mock -from django.test import override_settings from django.test import TestCase -from documents.parsers import DocumentParser from documents.parsers import get_default_file_extension -from documents.parsers import get_parser_class from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_supported_file_extensions from documents.parsers import is_file_ext_supported @@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser from paperless_text.parsers import TextDocumentParser -def fake_magic_from_file(file, mime=False): - - if mime: - if os.path.splitext(file)[1] == ".pdf": - return "application/pdf" - else: - return "unknown" - else: - return "A verbose string that describes the contents of the file" - - -@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file) class TestParserDiscovery(TestCase): @mock.patch("documents.parsers.document_consumer_declaration.send") - def test__get_parser_class_1_parser(self, m, *args): + def 
test_get_parser_class_1_parser(self, m, *args): + """ + GIVEN: + - Parser declared for a given mimetype + WHEN: + - Attempt to get parser for the mimetype + THEN: + - Declared parser class is returned + """ + class DummyParser: pass @@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase): ), ) - self.assertEqual(get_parser_class("doc.pdf"), DummyParser) + self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser) @mock.patch("documents.parsers.document_consumer_declaration.send") - def test__get_parser_class_n_parsers(self, m, *args): + def test_get_parser_class_n_parsers(self, m, *args): + """ + GIVEN: + - Two parsers declared for a given mimetype + - Second parser has a higher weight + WHEN: + - Attempt to get parser for the mimetype + THEN: + - Second parser class is returned + """ + class DummyParser1: pass @@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase): ), ) - self.assertEqual(get_parser_class("doc.pdf"), DummyParser2) + self.assertEqual( + get_parser_class_for_mime_type("application/pdf"), + DummyParser2, + ) @mock.patch("documents.parsers.document_consumer_declaration.send") - def test__get_parser_class_0_parsers(self, m, *args): + def test_get_parser_class_0_parsers(self, m, *args): + """ + GIVEN: + - No parsers are declared + WHEN: + - Attempt to get parser for the mimetype + THEN: + - No parser class is returned + """ m.return_value = [] with TemporaryDirectory() as tmpdir: - self.assertIsNone(get_parser_class("doc.pdf")) + self.assertIsNone(get_parser_class_for_mime_type("application/pdf")) + @mock.patch("documents.parsers.document_consumer_declaration.send") + def test_get_parser_class_no_valid_parser(self, m, *args): + """ + GIVEN: + - No parser declared for a given mimetype + - Parser declared for a different mimetype + WHEN: + - Attempt to get parser for the given mimetype + THEN: + - No parser class is returned + """ -def fake_get_thumbnail(self, path, mimetype, file_name): - return 
os.path.join(os.path.dirname(__file__), "examples", "no-text.png") + class DummyParser: + pass + + m.return_value = ( + ( + None, + { + "weight": 0, + "parser": DummyParser, + "mime_types": {"application/pdf": ".pdf"}, + }, + ), + ) + + self.assertIsNone(get_parser_class_for_mime_type("image/tiff")) class TestParserAvailability(TestCase): def test_file_extensions(self): - for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]: - self.assertIn(ext, get_supported_file_extensions()) - self.assertEqual(get_default_file_extension("application/pdf"), ".pdf") - self.assertEqual(get_default_file_extension("image/png"), ".png") - self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg") - self.assertEqual(get_default_file_extension("text/plain"), ".txt") - self.assertEqual(get_default_file_extension("text/csv"), ".csv") + supported_mimes_and_exts = [ + ("application/pdf", ".pdf"), + ("image/png", ".png"), + ("image/jpeg", ".jpg"), + ("image/tiff", ".tif"), + ("image/webp", ".webp"), + ("text/plain", ".txt"), + ("text/csv", ".csv"), + ] + + supported_exts = get_supported_file_extensions() + + for mime_type, ext in supported_mimes_and_exts: + self.assertIn(ext, supported_exts) + self.assertEqual(get_default_file_extension(mime_type), ext) + + # Test no parser declared still returns an extension self.assertEqual(get_default_file_extension("application/zip"), ".zip") + + # Test invalid mimetype returns no extension self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "") self.assertIsInstance( @@ -108,7 +156,7 @@ class TestParserAvailability(TestCase): get_parser_class_for_mime_type("text/plain")(logging_group=None), TextDocumentParser, ) - self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None) + self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) self.assertTrue(is_file_ext_supported(".pdf")) self.assertFalse(is_file_ext_supported(".hsdfh"))