mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Added file extension checks to the consumer to prevent temporary files from being consumed. Also, parsers now declare the default file extension to use for each MIME type they support.
This commit is contained in:
		| @@ -9,10 +9,11 @@ from django.db import transaction | |||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
|  |  | ||||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||||
| from .file_handling import generate_filename, create_source_path_directory | from .file_handling import create_source_path_directory | ||||||
| from .loggers import LoggingMixin | from .loggers import LoggingMixin | ||||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||||
| from .parsers import ParseError, get_parser_class_for_mime_type | from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||||
|  |     get_supported_file_extensions | ||||||
| from .signals import ( | from .signals import ( | ||||||
|     document_consumption_finished, |     document_consumption_finished, | ||||||
|     document_consumption_started |     document_consumption_started | ||||||
| @@ -39,6 +40,21 @@ class Consumer(LoggingMixin): | |||||||
|             raise ConsumerError("Cannot consume {}: It is not a file".format( |             raise ConsumerError("Cannot consume {}: It is not a file".format( | ||||||
|                 self.path)) |                 self.path)) | ||||||
|  |  | ||||||
|  |     def pre_check_file_extension(self): | ||||||
|  |         extensions = get_supported_file_extensions() | ||||||
|  |         _, ext = os.path.splitext(self.filename) | ||||||
|  |  | ||||||
|  |         if not ext: | ||||||
|  |             raise ConsumerError( | ||||||
|  |                 f"Not consuming {self.filename}: File type unknown." | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |         if ext not in extensions: | ||||||
|  |             raise ConsumerError( | ||||||
|  |                 f"Not consuming {self.filename}: File extension {ext} does " | ||||||
|  |                 f"not map to any known file type ({str(extensions)})" | ||||||
|  |             ) | ||||||
|  |  | ||||||
|     def pre_check_duplicate(self): |     def pre_check_duplicate(self): | ||||||
|         with open(self.path, "rb") as f: |         with open(self.path, "rb") as f: | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |             checksum = hashlib.md5(f.read()).hexdigest() | ||||||
| @@ -80,6 +96,7 @@ class Consumer(LoggingMixin): | |||||||
|         # Make sure that preconditions for consuming the file are met. |         # Make sure that preconditions for consuming the file are met. | ||||||
|  |  | ||||||
|         self.pre_check_file_exists() |         self.pre_check_file_exists() | ||||||
|  |         self.pre_check_file_extension() | ||||||
|         self.pre_check_directories() |         self.pre_check_directories() | ||||||
|         self.pre_check_duplicate() |         self.pre_check_duplicate() | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,7 +1,6 @@ | |||||||
| # coding=utf-8 | # coding=utf-8 | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
| import mimetypes |  | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
| @@ -12,6 +11,8 @@ from django.db import models | |||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from django.utils.text import slugify | from django.utils.text import slugify | ||||||
|  |  | ||||||
|  | from documents.parsers import get_default_file_extension | ||||||
|  |  | ||||||
|  |  | ||||||
| class MatchingModel(models.Model): | class MatchingModel(models.Model): | ||||||
|  |  | ||||||
| @@ -230,8 +231,7 @@ class Document(models.Model): | |||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def file_type(self): |     def file_type(self): | ||||||
|         # TODO: this is not stable across python versions |         return get_default_file_extension(self.mime_type) | ||||||
|         return mimetypes.guess_extension(str(self.mime_type)) |  | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def thumbnail_path(self): |     def thumbnail_path(self): | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| import logging | import logging | ||||||
|  | import mimetypes | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import shutil | import shutil | ||||||
| @@ -42,6 +43,29 @@ def is_mime_type_supported(mime_type): | |||||||
|     return get_parser_class_for_mime_type(mime_type) is not None |     return get_parser_class_for_mime_type(mime_type) is not None | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def get_default_file_extension(mime_type): | ||||||
|  |     for response in document_consumer_declaration.send(None): | ||||||
|  |         parser_declaration = response[1] | ||||||
|  |         supported_mime_types = parser_declaration["mime_types"] | ||||||
|  |  | ||||||
|  |         if mime_type in supported_mime_types: | ||||||
|  |             return supported_mime_types[mime_type] | ||||||
|  |  | ||||||
|  |     return None | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def get_supported_file_extensions(): | ||||||
|  |     extensions = set() | ||||||
|  |     for response in document_consumer_declaration.send(None): | ||||||
|  |         parser_declaration = response[1] | ||||||
|  |         supported_mime_types = parser_declaration["mime_types"] | ||||||
|  |  | ||||||
|  |         for mime_type in supported_mime_types: | ||||||
|  |             extensions.update(mimetypes.guess_all_extensions(mime_type)) | ||||||
|  |  | ||||||
|  |     return extensions | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_parser_class_for_mime_type(mime_type): | def get_parser_class_for_mime_type(mime_type): | ||||||
|  |  | ||||||
|     options = [] |     options = [] | ||||||
|   | |||||||
| @@ -423,7 +423,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|         m = patcher.start() |         m = patcher.start() | ||||||
|         m.return_value = [(None, { |         m.return_value = [(None, { | ||||||
|             "parser": self.make_dummy_parser, |             "parser": self.make_dummy_parser, | ||||||
|             "mime_types": ["application/pdf"], |             "mime_types": {"application/pdf": ".pdf"}, | ||||||
|             "weight": 0 |             "weight": 0 | ||||||
|         })] |         })] | ||||||
|  |  | ||||||
| @@ -519,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|         try: |         try: | ||||||
|             self.consumer.try_consume_file(self.get_test_file()) |             self.consumer.try_consume_file(self.get_test_file()) | ||||||
|         except ConsumerError as e: |         except ConsumerError as e: | ||||||
|             self.assertTrue(str(e).startswith("No parsers abvailable")) |             self.assertTrue("File extension .pdf does not map to any" in str(e)) | ||||||
|             return |             return | ||||||
|  |  | ||||||
|         self.fail("Should throw exception") |         self.fail("Should throw exception") | ||||||
| @@ -528,7 +528,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|     def testFaultyParser(self, m): |     def testFaultyParser(self, m): | ||||||
|         m.return_value = [(None, { |         m.return_value = [(None, { | ||||||
|             "parser": self.make_faulty_parser, |             "parser": self.make_faulty_parser, | ||||||
|             "mime_types": ["application/pdf"], |             "mime_types": {"application/pdf": ".pdf"}, | ||||||
|             "weight": 0 |             "weight": 0 | ||||||
|         })] |         })] | ||||||
|  |  | ||||||
|   | |||||||
| @@ -4,7 +4,10 @@ from unittest import mock | |||||||
|  |  | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
|  |  | ||||||
| from documents.parsers import get_parser_class | from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | ||||||
|  |     get_parser_class_for_mime_type | ||||||
|  | from paperless_tesseract.parsers import RasterisedDocumentParser | ||||||
|  | from paperless_text.parsers import TextDocumentParser | ||||||
|  |  | ||||||
|  |  | ||||||
| def fake_magic_from_file(file, mime=False): | def fake_magic_from_file(file, mime=False): | ||||||
| @@ -27,7 +30,7 @@ class TestParserDiscovery(TestCase): | |||||||
|             pass |             pass | ||||||
|  |  | ||||||
|         m.return_value = ( |         m.return_value = ( | ||||||
|             (None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}), |             (None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}), | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         self.assertEqual( |         self.assertEqual( | ||||||
| @@ -45,8 +48,8 @@ class TestParserDiscovery(TestCase): | |||||||
|             pass |             pass | ||||||
|  |  | ||||||
|         m.return_value = ( |         m.return_value = ( | ||||||
|             (None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}), |             (None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}), | ||||||
|             (None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}), |             (None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}), | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         self.assertEqual( |         self.assertEqual( | ||||||
| @@ -61,3 +64,21 @@ class TestParserDiscovery(TestCase): | |||||||
|             self.assertIsNone( |             self.assertIsNone( | ||||||
|                 get_parser_class("doc.pdf") |                 get_parser_class("doc.pdf") | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestParserAvailability(TestCase): | ||||||
|  |  | ||||||
|  |     def test_file_extensions(self): | ||||||
|  |  | ||||||
|  |         for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]: | ||||||
|  |             self.assertIn(ext, get_supported_file_extensions()) | ||||||
|  |         self.assertEqual(get_default_file_extension('application/pdf'), ".pdf") | ||||||
|  |         self.assertEqual(get_default_file_extension('image/png'), ".png") | ||||||
|  |         self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg") | ||||||
|  |         self.assertEqual(get_default_file_extension('text/plain'), ".txt") | ||||||
|  |         self.assertEqual(get_default_file_extension('text/csv'), ".csv") | ||||||
|  |         self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), None) | ||||||
|  |  | ||||||
|  |         self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser) | ||||||
|  |         self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser) | ||||||
|  |         self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None) | ||||||
|   | |||||||
| @@ -5,9 +5,9 @@ def tesseract_consumer_declaration(sender, **kwargs): | |||||||
|     return { |     return { | ||||||
|         "parser": RasterisedDocumentParser, |         "parser": RasterisedDocumentParser, | ||||||
|         "weight": 0, |         "weight": 0, | ||||||
|         "mime_types": [ |         "mime_types": { | ||||||
|             "application/pdf", |             "application/pdf": ".pdf", | ||||||
|             "image/jpeg", |             "image/jpeg": ".jpg", | ||||||
|             "image/png" |             "image/png": ".png" | ||||||
|         ] |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -5,8 +5,8 @@ def text_consumer_declaration(sender, **kwargs): | |||||||
|     return { |     return { | ||||||
|         "parser": TextDocumentParser, |         "parser": TextDocumentParser, | ||||||
|         "weight": 10, |         "weight": 10, | ||||||
|         "mime_types": [ |         "mime_types": { | ||||||
|             "text/plain", |             "text/plain": ".txt", | ||||||
|             "text/comma-separated-values" |             "text/csv": ".csv", | ||||||
|         ] |         } | ||||||
|     } |     } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler