paperless-ngx/src/documents/tests/test_parsers.py

228 lines
7.1 KiB
Python

from tempfile import TemporaryDirectory
from unittest import mock
from django.apps import apps
from django.test import override_settings
from django.test import TestCase
from documents.parsers import get_default_file_extension
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import get_supported_file_extensions
from documents.parsers import is_file_ext_supported
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser
from paperless_tika.parsers import TikaDocumentParser
class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_get_parser_class_1_parser(self, m, *args):
"""
GIVEN:
- Parser declared for a given mimetype
WHEN:
- Attempt to get parser for the mimetype
THEN:
- Declared parser class is returned
"""
class DummyParser:
pass
m.return_value = (
(
None,
{
"weight": 0,
"parser": DummyParser,
"mime_types": {"application/pdf": ".pdf"},
},
),
)
self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_get_parser_class_n_parsers(self, m, *args):
"""
GIVEN:
- Two parsers declared for a given mimetype
- Second parser has a higher weight
WHEN:
- Attempt to get parser for the mimetype
THEN:
- Second parser class is returned
"""
class DummyParser1:
pass
class DummyParser2:
pass
m.return_value = (
(
None,
{
"weight": 0,
"parser": DummyParser1,
"mime_types": {"application/pdf": ".pdf"},
},
),
(
None,
{
"weight": 1,
"parser": DummyParser2,
"mime_types": {"application/pdf": ".pdf"},
},
),
)
self.assertEqual(
get_parser_class_for_mime_type("application/pdf"),
DummyParser2,
)
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_get_parser_class_0_parsers(self, m, *args):
"""
GIVEN:
- No parsers are declared
WHEN:
- Attempt to get parser for the mimetype
THEN:
- No parser class is returned
"""
m.return_value = []
with TemporaryDirectory():
self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_get_parser_class_no_valid_parser(self, m, *args):
"""
GIVEN:
- No parser declared for a given mimetype
- Parser declared for a different mimetype
WHEN:
- Attempt to get parser for the given mimetype
THEN:
- No parser class is returned
"""
class DummyParser:
pass
m.return_value = (
(
None,
{
"weight": 0,
"parser": DummyParser,
"mime_types": {"application/pdf": ".pdf"},
},
),
)
self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
class TestParserAvailability(TestCase):
def test_tesseract_parser(self):
"""
GIVEN:
- Various mime types
WHEN:
- The parser class is instantiated
THEN:
- The Tesseract based parser is return
"""
supported_mimes_and_exts = [
("application/pdf", ".pdf"),
("image/png", ".png"),
("image/jpeg", ".jpg"),
("image/tiff", ".tif"),
("image/webp", ".webp"),
]
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
self.assertIsInstance(
get_parser_class_for_mime_type(mime_type)(logging_group=None),
RasterisedDocumentParser,
)
def test_text_parser(self):
"""
GIVEN:
- Various mime types of a text form
WHEN:
- The parser class is instantiated
THEN:
- The text based parser is return
"""
supported_mimes_and_exts = [
("text/plain", ".txt"),
("text/csv", ".csv"),
]
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
self.assertIsInstance(
get_parser_class_for_mime_type(mime_type)(logging_group=None),
TextDocumentParser,
)
def test_tika_parser(self):
"""
GIVEN:
- Various mime types of a office document form
WHEN:
- The parser class is instantiated
THEN:
- The Tika/Gotenberg based parser is return
"""
supported_mimes_and_exts = [
("application/vnd.oasis.opendocument.text", ".odt"),
("text/rtf", ".rtf"),
("application/msword", ".doc"),
(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
".docx",
),
]
# Force the app ready to notice the settings override
with override_settings(TIKA_ENABLED=True, INSTALLED_APPS=["paperless_tika"]):
app = apps.get_app_config("paperless_tika")
app.ready()
supported_exts = get_supported_file_extensions()
for mime_type, ext in supported_mimes_and_exts:
self.assertIn(ext, supported_exts)
self.assertEqual(get_default_file_extension(mime_type), ext)
self.assertIsInstance(
get_parser_class_for_mime_type(mime_type)(logging_group=None),
TikaDocumentParser,
)
def test_no_parser_for_mime(self):
self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))
def test_default_extension(self):
# Test no parser declared still returns a an extension
self.assertEqual(get_default_file_extension("application/zip"), ".zip")
# Test invalid mimetype returns no extension
self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")
def test_file_extension_support(self):
self.assertTrue(is_file_ext_supported(".pdf"))
self.assertFalse(is_file_ext_supported(".hsdfh"))
self.assertFalse(is_file_ext_supported(""))