mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
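	Changed the way parsers are discovered: a parser's handler for the document_consumer_declaration signal now returns a declaration dict containing the parser class, its weight, and a test function for file names, instead of returning a test callable that itself produced the parser and weight. This also prepares for upcoming changes regarding content types and file types: parsers should declare what they support, and actual file extensions should not be hardcoded everywhere.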
This commit is contained in:
		| @@ -1,5 +1,7 @@ | ||||
| from django.apps import AppConfig | ||||
|  | ||||
| from paperless_tesseract.signals import tesseract_consumer_declaration | ||||
|  | ||||
|  | ||||
| class PaperlessTesseractConfig(AppConfig): | ||||
|  | ||||
| @@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig): | ||||
|  | ||||
|         from documents.signals import document_consumer_declaration | ||||
|  | ||||
|         from .signals import ConsumerDeclaration | ||||
|  | ||||
|         document_consumer_declaration.connect(ConsumerDeclaration.handle) | ||||
|         document_consumer_declaration.connect(tesseract_consumer_declaration) | ||||
|  | ||||
|         AppConfig.ready(self) | ||||
|   | ||||
| @@ -3,21 +3,16 @@ import re | ||||
| from .parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class ConsumerDeclaration: | ||||
| def tesseract_consumer_declaration(sender, **kwargs): | ||||
|     return { | ||||
|         "parser": RasterisedDocumentParser, | ||||
|         "weight": 0, | ||||
|         "test": tesseract_consumer_test | ||||
|     } | ||||
|  | ||||
|     MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") | ||||
|  | ||||
|     @classmethod | ||||
|     def handle(cls, sender, **kwargs): | ||||
|         return cls.test | ||||
| MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") | ||||
|  | ||||
|     @classmethod | ||||
|     def test(cls, doc): | ||||
|  | ||||
|         if cls.MATCHING_FILES.match(doc.lower()): | ||||
|             return { | ||||
|                 "parser": RasterisedDocumentParser, | ||||
|                 "weight": 0 | ||||
|             } | ||||
|  | ||||
|         return None | ||||
| def tesseract_consumer_test(doc): | ||||
|     return MATCHING_FILES.match(doc.lower()) | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| from django.test import TestCase | ||||
|  | ||||
| from ..signals import ConsumerDeclaration | ||||
| from paperless_tesseract.signals import tesseract_consumer_test | ||||
|  | ||||
|  | ||||
| class SignalsTestCase(TestCase): | ||||
| @@ -20,7 +20,7 @@ class SignalsTestCase(TestCase): | ||||
|         for prefix in prefixes: | ||||
|             for suffix in suffixes: | ||||
|                 name = "{}.{}".format(prefix, suffix) | ||||
|                 self.assertTrue(ConsumerDeclaration.test(name)) | ||||
|                 self.assertTrue(tesseract_consumer_test(name)) | ||||
|  | ||||
|     def test_test_handles_various_file_names_false(self): | ||||
|  | ||||
| @@ -30,7 +30,7 @@ class SignalsTestCase(TestCase): | ||||
|         for prefix in prefixes: | ||||
|             for suffix in suffixes: | ||||
|                 name = "{}.{}".format(prefix, suffix) | ||||
|                 self.assertFalse(ConsumerDeclaration.test(name)) | ||||
|                 self.assertFalse(tesseract_consumer_test(name)) | ||||
|  | ||||
|         self.assertFalse(ConsumerDeclaration.test("")) | ||||
|         self.assertFalse(ConsumerDeclaration.test("doc")) | ||||
|         self.assertFalse(tesseract_consumer_test("")) | ||||
|         self.assertFalse(tesseract_consumer_test("doc")) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler