mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-11 10:00:48 -05:00
Cleans up and improves parser discovery testing, simplifies the determination of supported or not supported extensions and mime types
This commit is contained in:
parent
a340b9c8a1
commit
d19bf59f47
@ -6,12 +6,12 @@ import re
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from functools import cache
|
||||||
from typing import Iterator
|
from typing import Iterator
|
||||||
from typing import Match
|
from typing import Match
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from typing import Set
|
from typing import Set
|
||||||
|
|
||||||
import magic
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from documents.loggers import LoggingMixin
|
from documents.loggers import LoggingMixin
|
||||||
@ -45,11 +45,20 @@ DATE_REGEX = re.compile(
|
|||||||
logger = logging.getLogger("paperless.parsing")
|
logger = logging.getLogger("paperless.parsing")
|
||||||
|
|
||||||
|
|
||||||
def is_mime_type_supported(mime_type) -> bool:
|
@cache
|
||||||
|
def is_mime_type_supported(mime_type: str) -> bool:
|
||||||
|
"""
|
||||||
|
Returns True if the mime type is supported, False otherwise
|
||||||
|
"""
|
||||||
return get_parser_class_for_mime_type(mime_type) is not None
|
return get_parser_class_for_mime_type(mime_type) is not None
|
||||||
|
|
||||||
|
|
||||||
def get_default_file_extension(mime_type) -> str:
|
@cache
|
||||||
|
def get_default_file_extension(mime_type: str) -> str:
|
||||||
|
"""
|
||||||
|
Returns the default file extension for a mimetype, or
|
||||||
|
an empty string if it could not be determined
|
||||||
|
"""
|
||||||
for response in document_consumer_declaration.send(None):
|
for response in document_consumer_declaration.send(None):
|
||||||
parser_declaration = response[1]
|
parser_declaration = response[1]
|
||||||
supported_mime_types = parser_declaration["mime_types"]
|
supported_mime_types = parser_declaration["mime_types"]
|
||||||
@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def is_file_ext_supported(ext) -> bool:
|
@cache
|
||||||
|
def is_file_ext_supported(ext: str) -> bool:
|
||||||
|
"""
|
||||||
|
Returns True if the file extension is supported, False otherwise
|
||||||
|
TODO: Investigate why this really exists, why not use mimetype
|
||||||
|
"""
|
||||||
if ext:
|
if ext:
|
||||||
return ext.lower() in get_supported_file_extensions()
|
return ext.lower() in get_supported_file_extensions()
|
||||||
else:
|
else:
|
||||||
@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]:
|
|||||||
|
|
||||||
for mime_type in supported_mime_types:
|
for mime_type in supported_mime_types:
|
||||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||||
|
# Python's stdlib might be behind, so also add what the parser
|
||||||
|
# says is the default extension
|
||||||
|
# This makes image/webp supported on Python < 3.11
|
||||||
|
extensions.add(supported_mime_types[mime_type])
|
||||||
|
|
||||||
return extensions
|
return extensions
|
||||||
|
|
||||||
|
|
||||||
def get_parser_class_for_mime_type(mime_type):
|
def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]:
|
||||||
|
"""
|
||||||
|
Returns the best parser (by weight) for the given mimetype or
|
||||||
|
None if no parser exists
|
||||||
|
"""
|
||||||
|
|
||||||
options = []
|
options = []
|
||||||
|
|
||||||
@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type):
|
|||||||
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||||
|
|
||||||
|
|
||||||
def get_parser_class(path):
|
|
||||||
"""
|
|
||||||
Determine the appropriate parser class based on the file
|
|
||||||
"""
|
|
||||||
|
|
||||||
mime_type = magic.from_file(path, mime=True)
|
|
||||||
|
|
||||||
return get_parser_class_for_mime_type(mime_type)
|
|
||||||
|
|
||||||
|
|
||||||
def run_convert(
|
def run_convert(
|
||||||
input_file,
|
input_file,
|
||||||
output_file,
|
output_file,
|
||||||
|
@ -1,14 +1,8 @@
|
|||||||
import os
|
|
||||||
import shutil
|
|
||||||
import tempfile
|
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
from django.test import override_settings
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from documents.parsers import DocumentParser
|
|
||||||
from documents.parsers import get_default_file_extension
|
from documents.parsers import get_default_file_extension
|
||||||
from documents.parsers import get_parser_class
|
|
||||||
from documents.parsers import get_parser_class_for_mime_type
|
from documents.parsers import get_parser_class_for_mime_type
|
||||||
from documents.parsers import get_supported_file_extensions
|
from documents.parsers import get_supported_file_extensions
|
||||||
from documents.parsers import is_file_ext_supported
|
from documents.parsers import is_file_ext_supported
|
||||||
@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser
|
|||||||
from paperless_text.parsers import TextDocumentParser
|
from paperless_text.parsers import TextDocumentParser
|
||||||
|
|
||||||
|
|
||||||
def fake_magic_from_file(file, mime=False):
|
|
||||||
|
|
||||||
if mime:
|
|
||||||
if os.path.splitext(file)[1] == ".pdf":
|
|
||||||
return "application/pdf"
|
|
||||||
else:
|
|
||||||
return "unknown"
|
|
||||||
else:
|
|
||||||
return "A verbose string that describes the contents of the file"
|
|
||||||
|
|
||||||
|
|
||||||
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
|
|
||||||
class TestParserDiscovery(TestCase):
|
class TestParserDiscovery(TestCase):
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||||
def test__get_parser_class_1_parser(self, m, *args):
|
def test_get_parser_class_1_parser(self, m, *args):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Parser declared for a given mimetype
|
||||||
|
WHEN:
|
||||||
|
- Attempt to get parser for the mimetype
|
||||||
|
THEN:
|
||||||
|
- Declared parser class is returned
|
||||||
|
"""
|
||||||
|
|
||||||
class DummyParser:
|
class DummyParser:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(get_parser_class("doc.pdf"), DummyParser)
|
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||||
def test__get_parser_class_n_parsers(self, m, *args):
|
def test_get_parser_class_n_parsers(self, m, *args):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- Two parsers declared for a given mimetype
|
||||||
|
- Second parser has a higher weight
|
||||||
|
WHEN:
|
||||||
|
- Attempt to get parser for the mimetype
|
||||||
|
THEN:
|
||||||
|
- Second parser class is returned
|
||||||
|
"""
|
||||||
|
|
||||||
class DummyParser1:
|
class DummyParser1:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)
|
self.assertEqual(
|
||||||
|
get_parser_class_for_mime_type("application/pdf"),
|
||||||
|
DummyParser2,
|
||||||
|
)
|
||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||||
def test__get_parser_class_0_parsers(self, m, *args):
|
def test_get_parser_class_0_parsers(self, m, *args):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- No parsers are declared
|
||||||
|
WHEN:
|
||||||
|
- Attempt to get parser for the mimetype
|
||||||
|
THEN:
|
||||||
|
- No parser class is returned
|
||||||
|
"""
|
||||||
m.return_value = []
|
m.return_value = []
|
||||||
with TemporaryDirectory() as tmpdir:
|
with TemporaryDirectory() as tmpdir:
|
||||||
self.assertIsNone(get_parser_class("doc.pdf"))
|
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
|
||||||
|
|
||||||
|
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||||
|
def test_get_parser_class_no_valid_parser(self, m, *args):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- No parser declared for a given mimetype
|
||||||
|
- Parser declared for a different mimetype
|
||||||
|
WHEN:
|
||||||
|
- Attempt to get parser for the given mimetype
|
||||||
|
THEN:
|
||||||
|
- No parser class is returned
|
||||||
|
"""
|
||||||
|
|
||||||
def fake_get_thumbnail(self, path, mimetype, file_name):
|
class DummyParser:
|
||||||
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
|
pass
|
||||||
|
|
||||||
|
m.return_value = (
|
||||||
|
(
|
||||||
|
None,
|
||||||
|
{
|
||||||
|
"weight": 0,
|
||||||
|
"parser": DummyParser,
|
||||||
|
"mime_types": {"application/pdf": ".pdf"},
|
||||||
|
},
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
||||||
|
|
||||||
|
|
||||||
class TestParserAvailability(TestCase):
|
class TestParserAvailability(TestCase):
|
||||||
def test_file_extensions(self):
|
def test_file_extensions(self):
|
||||||
|
|
||||||
for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
|
supported_mimes_and_exts = [
|
||||||
self.assertIn(ext, get_supported_file_extensions())
|
("application/pdf", ".pdf"),
|
||||||
self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
|
("image/png", ".png"),
|
||||||
self.assertEqual(get_default_file_extension("image/png"), ".png")
|
("image/jpeg", ".jpg"),
|
||||||
self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
|
("image/tiff", ".tif"),
|
||||||
self.assertEqual(get_default_file_extension("text/plain"), ".txt")
|
("image/webp", ".webp"),
|
||||||
self.assertEqual(get_default_file_extension("text/csv"), ".csv")
|
("text/plain", ".txt"),
|
||||||
|
("text/csv", ".csv"),
|
||||||
|
]
|
||||||
|
|
||||||
|
supported_exts = get_supported_file_extensions()
|
||||||
|
|
||||||
|
for mime_type, ext in supported_mimes_and_exts:
|
||||||
|
self.assertIn(ext, supported_exts)
|
||||||
|
self.assertEqual(get_default_file_extension(mime_type), ext)
|
||||||
|
|
||||||
|
# Test no parser declared still returns an extension
|
||||||
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
|
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
|
||||||
|
|
||||||
|
# Test invalid mimetype returns no extension
|
||||||
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
|
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
|
||||||
|
|
||||||
self.assertIsInstance(
|
self.assertIsInstance(
|
||||||
@ -108,7 +156,7 @@ class TestParserAvailability(TestCase):
|
|||||||
get_parser_class_for_mime_type("text/plain")(logging_group=None),
|
get_parser_class_for_mime_type("text/plain")(logging_group=None),
|
||||||
TextDocumentParser,
|
TextDocumentParser,
|
||||||
)
|
)
|
||||||
self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None)
|
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
|
||||||
|
|
||||||
self.assertTrue(is_file_ext_supported(".pdf"))
|
self.assertTrue(is_file_ext_supported(".pdf"))
|
||||||
self.assertFalse(is_file_ext_supported(".hsdfh"))
|
self.assertFalse(is_file_ext_supported(".hsdfh"))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user