mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Cleans up and improves parser discovery testing, simplifies the determination of supported or not supported extensions and mime types
This commit is contained in:
parent
a340b9c8a1
commit
d19bf59f47
@ -6,12 +6,12 @@ import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from functools import cache
|
||||
from typing import Iterator
|
||||
from typing import Match
|
||||
from typing import Optional
|
||||
from typing import Set
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from documents.loggers import LoggingMixin
|
||||
@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
|
||||
logger = logging.getLogger("paperless.parsing")
|
||||
|
||||
|
||||
def is_mime_type_supported(mime_type) -> bool:
|
||||
@cache
|
||||
def is_mime_type_supported(mime_type: str) -> bool:
|
||||
"""
|
||||
Returns True if the mime type is supported, False otherwise
|
||||
"""
|
||||
return get_parser_class_for_mime_type(mime_type) is not None
|
||||
|
||||
|
||||
def get_default_file_extension(mime_type) -> str:
|
||||
@cache
|
||||
def get_default_file_extension(mime_type: str) -> str:
|
||||
"""
|
||||
Returns the default file extension for a mimetype, or
|
||||
an empty string if it could not be determined
|
||||
"""
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
|
||||
return ""
|
||||
|
||||
|
||||
def is_file_ext_supported(ext) -> bool:
|
||||
@cache
|
||||
def is_file_ext_supported(ext: str) -> bool:
|
||||
"""
|
||||
Returns True if the file extension is supported, False otherwise
|
||||
TODO: Investigate why this really exists, why not use mimetype
|
||||
"""
|
||||
if ext:
|
||||
return ext.lower() in get_supported_file_extensions()
|
||||
else:
|
||||
@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:
|
||||
|
||||
for mime_type in supported_mime_types:
|
||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||
# Python's stdlib might be behind, so also add what the parser
|
||||
# says is the default extension
|
||||
# This makes image/webp supported on Python < 3.11
|
||||
extensions.add(supported_mime_types[mime_type])
|
||||
|
||||
return extensions
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type):
|
||||
def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
|
||||
"""
|
||||
Returns the best parser (by weight) for the given mimetype or
|
||||
None if no parser exists
|
||||
"""
|
||||
|
||||
options = []
|
||||
|
||||
@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
|
||||
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||
|
||||
|
||||
def get_parser_class(path):
|
||||
"""
|
||||
Determine the appropriate parser class based on the file
|
||||
"""
|
||||
|
||||
mime_type = magic.from_file(path, mime=True)
|
||||
|
||||
return get_parser_class_for_mime_type(mime_type)
|
||||
|
||||
|
||||
def run_convert(
|
||||
input_file,
|
||||
output_file,
|
||||
|
@ -1,14 +1,8 @@
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.test import override_settings
|
||||
from django.test import TestCase
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import get_default_file_extension
|
||||
from documents.parsers import get_parser_class
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import get_supported_file_extensions
|
||||
from documents.parsers import is_file_ext_supported
|
||||
@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
|
||||
|
||||
def fake_magic_from_file(file, mime=False):
|
||||
|
||||
if mime:
|
||||
if os.path.splitext(file)[1] == ".pdf":
|
||||
return "application/pdf"
|
||||
else:
|
||||
return "unknown"
|
||||
else:
|
||||
return "A verbose string that describes the contents of the file"
|
||||
|
||||
|
||||
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
|
||||
class TestParserDiscovery(TestCase):
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test__get_parser_class_1_parser(self, m, *args):
|
||||
def test_get_parser_class_1_parser(self, m, *args):
|
||||
"""
|
||||
GIVEN:
|
||||
- Parser declared for a given mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Declared parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase):
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
|
||||
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test__get_parser_class_n_parsers(self, m, *args):
|
||||
def test_get_parser_class_n_parsers(self, m, *args):
|
||||
"""
|
||||
GIVEN:
|
||||
- Two parsers declared for a given mimetype
|
||||
- Second parser has a higher weight
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- Second parser class is returned
|
||||
"""
|
||||
|
||||
class DummyParser1:
|
||||
pass
|
||||
|
||||
@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase):
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
|
||||
self.assertEqual(
|
||||
get_parser_class_for_mime_type("application/pdf"),
|
||||
DummyParser2,
|
||||
)
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test__get_parser_class_0_parsers(self, m, *args):
|
||||
def test_get_parser_class_0_parsers(self, m, *args):
|
||||
"""
|
||||
GIVEN:
|
||||
- No parsers are declared
|
||||
WHEN:
|
||||
- Attempt to get parser for the mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
m.return_value = []
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
self.assertIsNone(get_parser_class("doc.pdf"))
|
||||
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def test_get_parser_class_no_valid_parser(self, m, *args):
|
||||
"""
|
||||
GIVEN:
|
||||
- No parser declared for a given mimetype
|
||||
- Parser declared for a different mimetype
|
||||
WHEN:
|
||||
- Attempt to get parser for the given mimetype
|
||||
THEN:
|
||||
- No parser class is returned
|
||||
"""
|
||||
|
||||
def fake_get_thumbnail(self, path, mimetype, file_name):
|
||||
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
|
||||
class DummyParser:
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(
|
||||
None,
|
||||
{
|
||||
"weight": 0,
|
||||
"parser": DummyParser,
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
def test_file_extensions(self):
|
||||
|
||||
for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
|
||||
self.assertIn(ext, get_supported_file_extensions())
|
||||
self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
|
||||
self.assertEqual(get_default_file_extension("image/png"), ".png")
|
||||
self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
|
||||
self.assertEqual(get_default_file_extension("text/plain"), ".txt")
|
||||
self.assertEqual(get_default_file_extension("text/csv"), ".csv")
|
||||
supported_mimes_and_exts = [
|
||||
("application/pdf", ".pdf"),
|
||||
("image/png", ".png"),
|
||||
("image/jpeg", ".jpg"),
|
||||
("image/tiff", ".tif"),
|
||||
("image/webp", ".webp"),
|
||||
("text/plain", ".txt"),
|
||||
("text/csv", ".csv"),
|
||||
]
|
||||
|
||||
supported_exts = get_supported_file_extensions()
|
||||
|
||||
for mime_type, ext in supported_mimes_and_exts:
|
||||
self.assertIn(ext, supported_exts)
|
||||
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||
|
||||
# Test no parser declared still returns an extension
|
||||
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
|
||||
|
||||
# Test invalid mimetype returns no extension
|
||||
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
|
||||
|
||||
self.assertIsInstance(
|
||||
@ -108,7 +156,7 @@ class TestParserAvailability(TestCase):
|
||||
get_parser_class_for_mime_type("text/plain")(logging_group=None),
|
||||
TextDocumentParser,
|
||||
)
|
||||
self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
|
||||
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||
|
||||
self.assertTrue(is_file_ext_supported(".pdf"))
|
||||
self.assertFalse(is_file_ext_supported(".hsdfh"))
|
||||
|
Loading…
x
Reference in New Issue
Block a user