mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Cleans up and improves parser discovery testing, and simplifies determining whether file extensions and mime types are supported
This commit is contained in:
		| @@ -6,12 +6,12 @@ import re | ||||
| import shutil | ||||
| import subprocess | ||||
| import tempfile | ||||
| from functools import cache | ||||
| from typing import Iterator | ||||
| from typing import Match | ||||
| from typing import Optional | ||||
| from typing import Set | ||||
|  | ||||
| import magic | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from documents.loggers import LoggingMixin | ||||
| @@ -45,11 +45,20 @@ DATE_REGEX = re.compile( | ||||
| logger = logging.getLogger("paperless.parsing") | ||||
|  | ||||
|  | ||||
| def is_mime_type_supported(mime_type) -> bool: | ||||
| @cache | ||||
| def is_mime_type_supported(mime_type: str) -> bool: | ||||
|     """ | ||||
|     Returns True if the mime type is supported, False otherwise | ||||
|     """ | ||||
|     return get_parser_class_for_mime_type(mime_type) is not None | ||||
|  | ||||
|  | ||||
| def get_default_file_extension(mime_type) -> str: | ||||
| @cache | ||||
| def get_default_file_extension(mime_type: str) -> str: | ||||
|     """ | ||||
|     Returns the default file extension for a mimetype, or | ||||
|     an empty string if it could not be determined | ||||
|     """ | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
|         supported_mime_types = parser_declaration["mime_types"] | ||||
| @@ -64,7 +73,12 @@ def get_default_file_extension(mime_type) -> str: | ||||
|         return "" | ||||
|  | ||||
|  | ||||
| def is_file_ext_supported(ext) -> bool: | ||||
| @cache | ||||
| def is_file_ext_supported(ext: str) -> bool: | ||||
|     """ | ||||
|     Returns True if the file extension is supported, False otherwise | ||||
|     TODO: Investigate why this really exists, why not use mimetype | ||||
|     """ | ||||
|     if ext: | ||||
|         return ext.lower() in get_supported_file_extensions() | ||||
|     else: | ||||
| @@ -79,11 +93,19 @@ def get_supported_file_extensions() -> Set[str]: | ||||
|  | ||||
|         for mime_type in supported_mime_types: | ||||
|             extensions.update(mimetypes.guess_all_extensions(mime_type)) | ||||
|             # Python's stdlib might be behind, so also add what the parser | ||||
|             # says is the default extension | ||||
|             # This makes image/webp supported on Python < 3.11 | ||||
|             extensions.add(supported_mime_types[mime_type]) | ||||
|  | ||||
|     return extensions | ||||
|  | ||||
|  | ||||
| def get_parser_class_for_mime_type(mime_type): | ||||
| def get_parser_class_for_mime_type(mime_type: str) -> Optional["DocumentParser"]: | ||||
|     """ | ||||
|     Returns the best parser (by weight) for the given mimetype or | ||||
|     None if no parser exists | ||||
|     """ | ||||
|  | ||||
|     options = [] | ||||
|  | ||||
| @@ -103,16 +125,6 @@ def get_parser_class_for_mime_type(mime_type): | ||||
|     return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"] | ||||
|  | ||||
|  | ||||
| def get_parser_class(path): | ||||
|     """ | ||||
|     Determine the appropriate parser class based on the file | ||||
|     """ | ||||
|  | ||||
|     mime_type = magic.from_file(path, mime=True) | ||||
|  | ||||
|     return get_parser_class_for_mime_type(mime_type) | ||||
|  | ||||
|  | ||||
| def run_convert( | ||||
|     input_file, | ||||
|     output_file, | ||||
|   | ||||
| @@ -1,14 +1,8 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from tempfile import TemporaryDirectory | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import override_settings | ||||
| from django.test import TestCase | ||||
| from documents.parsers import DocumentParser | ||||
| from documents.parsers import get_default_file_extension | ||||
| from documents.parsers import get_parser_class | ||||
| from documents.parsers import get_parser_class_for_mime_type | ||||
| from documents.parsers import get_supported_file_extensions | ||||
| from documents.parsers import is_file_ext_supported | ||||
| @@ -16,21 +10,18 @@ from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
| from paperless_text.parsers import TextDocumentParser | ||||
|  | ||||
|  | ||||
| def fake_magic_from_file(file, mime=False): | ||||
|  | ||||
|     if mime: | ||||
|         if os.path.splitext(file)[1] == ".pdf": | ||||
|             return "application/pdf" | ||||
|         else: | ||||
|             return "unknown" | ||||
|     else: | ||||
|         return "A verbose string that describes the contents of the file" | ||||
|  | ||||
|  | ||||
| @mock.patch("documents.parsers.magic.from_file", fake_magic_from_file) | ||||
| class TestParserDiscovery(TestCase): | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def test__get_parser_class_1_parser(self, m, *args): | ||||
|     def test_get_parser_class_1_parser(self, m, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Parser declared for a given mimetype | ||||
|         WHEN: | ||||
|             - Attempt to get parser for the mimetype | ||||
|         THEN: | ||||
|             - Declared parser class is returned | ||||
|         """ | ||||
|  | ||||
|         class DummyParser: | ||||
|             pass | ||||
|  | ||||
| @@ -45,10 +36,20 @@ class TestParserDiscovery(TestCase): | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual(get_parser_class("doc.pdf"), DummyParser) | ||||
|         self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser) | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def test__get_parser_class_n_parsers(self, m, *args): | ||||
|     def test_get_parser_class_n_parsers(self, m, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Two parsers declared for a given mimetype | ||||
|             - Second parser has a higher weight | ||||
|         WHEN: | ||||
|             - Attempt to get parser for the mimetype | ||||
|         THEN: | ||||
|             - Second parser class is returned | ||||
|         """ | ||||
|  | ||||
|         class DummyParser1: | ||||
|             pass | ||||
|  | ||||
| @@ -74,30 +75,77 @@ class TestParserDiscovery(TestCase): | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual(get_parser_class("doc.pdf"), DummyParser2) | ||||
|         self.assertEqual( | ||||
|             get_parser_class_for_mime_type("application/pdf"), | ||||
|             DummyParser2, | ||||
|         ) | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def test__get_parser_class_0_parsers(self, m, *args): | ||||
|     def test_get_parser_class_0_parsers(self, m, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - No parsers are declared | ||||
|         WHEN: | ||||
|             - Attempt to get parser for the mimetype | ||||
|         THEN: | ||||
|             - No parser class is returned | ||||
|         """ | ||||
|         m.return_value = [] | ||||
|         with TemporaryDirectory() as tmpdir: | ||||
|             self.assertIsNone(get_parser_class("doc.pdf")) | ||||
|             self.assertIsNone(get_parser_class_for_mime_type("application/pdf")) | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def test_get_parser_class_no_valid_parser(self, m, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - No parser declared for a given mimetype | ||||
|             - Parser declared for a different mimetype | ||||
|         WHEN: | ||||
|             - Attempt to get parser for the given mimetype | ||||
|         THEN: | ||||
|             - No parser class is returned | ||||
|         """ | ||||
|  | ||||
| def fake_get_thumbnail(self, path, mimetype, file_name): | ||||
|     return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") | ||||
|         class DummyParser: | ||||
|             pass | ||||
|  | ||||
|         m.return_value = ( | ||||
|             ( | ||||
|                 None, | ||||
|                 { | ||||
|                     "weight": 0, | ||||
|                     "parser": DummyParser, | ||||
|                     "mime_types": {"application/pdf": ".pdf"}, | ||||
|                 }, | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|         self.assertIsNone(get_parser_class_for_mime_type("image/tiff")) | ||||
|  | ||||
|  | ||||
| class TestParserAvailability(TestCase): | ||||
|     def test_file_extensions(self): | ||||
|  | ||||
|         for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]: | ||||
|             self.assertIn(ext, get_supported_file_extensions()) | ||||
|         self.assertEqual(get_default_file_extension("application/pdf"), ".pdf") | ||||
|         self.assertEqual(get_default_file_extension("image/png"), ".png") | ||||
|         self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg") | ||||
|         self.assertEqual(get_default_file_extension("text/plain"), ".txt") | ||||
|         self.assertEqual(get_default_file_extension("text/csv"), ".csv") | ||||
|         supported_mimes_and_exts = [ | ||||
|             ("application/pdf", ".pdf"), | ||||
|             ("image/png", ".png"), | ||||
|             ("image/jpeg", ".jpg"), | ||||
|             ("image/tiff", ".tif"), | ||||
|             ("image/webp", ".webp"), | ||||
|             ("text/plain", ".txt"), | ||||
|             ("text/csv", ".csv"), | ||||
|         ] | ||||
|  | ||||
|         supported_exts = get_supported_file_extensions() | ||||
|  | ||||
|         for mime_type, ext in supported_mimes_and_exts: | ||||
|             self.assertIn(ext, supported_exts) | ||||
|             self.assertEqual(get_default_file_extension(mime_type), ext) | ||||
|  | ||||
|         # Test that a mimetype with no declared parser still returns an extension | ||||
|         self.assertEqual(get_default_file_extension("application/zip"), ".zip") | ||||
|  | ||||
|         # Test invalid mimetype returns no extension | ||||
|         self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "") | ||||
|  | ||||
|         self.assertIsInstance( | ||||
| @@ -108,7 +156,7 @@ class TestParserAvailability(TestCase): | ||||
|             get_parser_class_for_mime_type("text/plain")(logging_group=None), | ||||
|             TextDocumentParser, | ||||
|         ) | ||||
|         self.assertEqual(get_parser_class_for_mime_type("text/sdgsdf"), None) | ||||
|         self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf")) | ||||
|  | ||||
|         self.assertTrue(is_file_ext_supported(".pdf")) | ||||
|         self.assertFalse(is_file_ext_supported(".hsdfh")) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H