Cleans up and improves parser discovery testing, and simplifies determining whether file extensions and mime types are supported

This commit is contained in:
Trenton H 2023-01-04 10:18:31 -08:00
parent a340b9c8a1
commit d19bf59f47
2 changed files with 109 additions and 49 deletions

View File

@ -6,12 +6,12 @@ import re
import shutil
import subprocess
import tempfile
from functools import cache
from typing import Iterator
from typing import Match
from typing import Optional
from typing import Set
import magic
from django.conf import settings
from django.utils import timezone
from documents.loggers import LoggingMixin
@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
logger = logging.getLogger("paperless.parsing")
def is_mime_type_supported(mime_type) -> bool:
@cache
def is_mime_type_supported(mime_type: str) -> bool:
    """
    Reports whether a parser is available for the given mime type.

    Returns True when get_parser_class_for_mime_type() yields anything
    other than None for the mime type, False otherwise.
    """
    # NOTE(review): @cache keeps the first answer per mime type for the
    # life of the process. If the set of declared parsers can change
    # afterwards (the tests patch document_consumer_declaration.send, for
    # example), later calls would still see the memoized value - confirm
    # this is intended.
    parser_class = get_parser_class_for_mime_type(mime_type)
    return parser_class is not None
def get_default_file_extension(mime_type) -> str:
@cache
def get_default_file_extension(mime_type: str) -> str:
"""
Returns the default file extension for a mimetype, or
an empty string if it could not be determined
"""
for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"]
@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
return ""
def is_file_ext_supported(ext) -> bool:
@cache
def is_file_ext_supported(ext: str) -> bool:
"""
Returns True if the file extension is supported, False otherwise
TODO: Investigate why this really exists, why not use mimetype
"""
if ext:
return ext.lower() in get_supported_file_extensions()
else:
@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:
for mime_type in supported_mime_types:
extensions.update(mimetypes.guess_all_extensions(mime_type))
# Python's stdlib might be behind, so also add what the parser
# says is the default extension
# This makes image/webp supported on Python < 3.11
extensions.add(supported_mime_types[mime_type])
return extensions
def get_parser_class_for_mime_type(mime_type):
def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
"""
Returns the best parser (by weight) for the given mimetype or
None if no parser exists
"""
options = []
@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def get_parser_class(path):
    """
    Pick the parser class for the file at the given path.

    The file's mime type is detected with magic.from_file() and the
    lookup is then delegated to get_parser_class_for_mime_type().
    """
    return get_parser_class_for_mime_type(magic.from_file(path, mime=True))
def run_convert(
input_file,
output_file,

View File

@ -1,14 +1,8 @@
import os
import shutil
import tempfile
from tempfile import TemporaryDirectory
from unittest import mock
from django.test import override_settings
from django.test import TestCase
from documents.parsers import DocumentParser
from documents.parsers import get_default_file_extension
from documents.parsers import get_parser_class
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
def fake_magic_from_file(file, mime=False):
    """
    Test stand-in with the same call shape as magic.from_file.

    When mime is truthy, the answer is derived from the file name only:
    "application/pdf" for a ".pdf" extension and "unknown" for anything
    else. When mime is falsy, a fixed descriptive string is returned.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    _, suffix = os.path.splitext(file)
    return "application/pdf" if suffix == ".pdf" else "unknown"
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_1_parser(self, m, *args):
def test_get_parser_class_1_parser(self, m, *args):
"""
GIVEN:
- Parser declared for a given mimetype
WHEN:
- Attempt to get parser for the mimetype
THEN:
- Declared parser class is returned
"""
class DummyParser:
pass
@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase):
),
)
self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_n_parsers(self, m, *args):
def test_get_parser_class_n_parsers(self, m, *args):
"""
GIVEN:
- Two parsers declared for a given mimetype
- Second parser has a higher weight
WHEN:
- Attempt to get parser for the mimetype
THEN:
- Second parser class is returned
"""
class DummyParser1:
pass
@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase):
),
)
self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
self.assertEqual(
get_parser_class_for_mime_type("application/pdf"),
DummyParser2,
)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_0_parsers(self, m, *args):
def test_get_parser_class_0_parsers(self, m, *args):
"""
GIVEN:
- No parsers are declared
WHEN:
- Attempt to get parser for the mimetype
THEN:
- No parser class is returned
"""
# m is the patched document_consumer_declaration.send; an empty list
# means no parser declaration is reported at all
m.return_value = []
# NOTE(review): the next two lines call get_parser_class with a file name,
# while the last line performs the same check by mime type - presumably
# they are the pre-change form of the assertion, shown here without a
# removal marker; confirm against the raw diff
with TemporaryDirectory() as tmpdir:
self.assertIsNone(get_parser_class("doc.pdf"))
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_get_parser_class_no_valid_parser(self, m, *args):
"""
GIVEN:
- No parser declared for a given mimetype
- Parser declared for a different mimetype
WHEN:
- Attempt to get parser for the given mimetype
THEN:
- No parser class is returned
"""
# NOTE(review): fake_get_thumbnail is not referenced by any visible line
# of this test - presumably a removed helper shown here without a removal
# marker; confirm against the raw diff
def fake_get_thumbnail(self, path, mimetype, file_name):
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
class DummyParser:
pass
# m is the patched document_consumer_declaration.send; it reports a single
# declaration whose mime_types mapping only contains application/pdf
m.return_value = (
(
None,
{
"weight": 0,
"parser": DummyParser,
"mime_types": {"application/pdf": ".pdf"},
},
),
)
# image/tiff is not among the declared mime types, so None is expected
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
class TestParserAvailability(TestCase):
def test_file_extensions(self):
for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
self.assertIn(ext, get_supported_file_extensions())
self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
self.assertEqual(get_default_file_extension("image/png"), ".png")
self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
self.assertEqual(get_default_file_extension("text/plain"), ".txt")
self.assertEqual(get_default_file_extension("text/csv"), ".csv")
supported_mimes_and_exts = [
("application/pdf", ".pdf"),
("image/png", ".png"),
("image/jpeg", ".jpg"),
("image/tiff", ".tif"),
("image/webp", ".webp"),
("text/plain", ".txt"),
("text/csv", ".csv"),
]
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
# Test no parser declared still returns an extension
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
# Test invalid mimetype returns no extension
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
self.assertIsInstance(
@ -108,7 +156,7 @@ class TestParserAvailability(TestCase):
get_parser_class_for_mime_type("text/plain")(logging_group=None),
TextDocumentParser,
)
self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
self.assertTrue(is_file_ext_supported(".pdf"))
self.assertFalse(is_file_ext_supported(".hsdfh"))