From d2e22e3f27709d6f17c7b48c65665c5dde5f99c9 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Mon, 16 Nov 2020 23:53:12 +0100 Subject: [PATCH] Changed the way parsers are discovered. This also prepares for upcoming changes regarding content types and file types: parsers should declare what they support, and actual file extensions should not be hardcoded everywhere. --- src/documents/parsers.py | 17 +++++++------- src/documents/tests/test_parsers.py | 8 +++---- src/paperless_tesseract/apps.py | 6 ++--- src/paperless_tesseract/signals.py | 23 ++++++++----------- src/paperless_tesseract/tests/test_signals.py | 10 ++++---- src/paperless_text/apps.py | 6 ++--- src/paperless_text/signals.py | 23 ++++++++----------- 7 files changed, 42 insertions(+), 51 deletions(-) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index c33c1bbd4..600e4fc93 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -41,15 +41,16 @@ def get_parser_class(doc): Determine the appropriate parser class based on the file """ - parsers = [] - for response in document_consumer_declaration.send(None): - parsers.append(response[1]) - options = [] - for parser in parsers: - result = parser(doc) - if result: - options.append(result) + + # His last command was: COME! And they came. All of them. Even the parsers.
+ + for response in document_consumer_declaration.send(None): + parser_declaration = response[1] + parser_test = parser_declaration["test"] + + if parser_test(doc): + options.append(parser_declaration) if not options: return None diff --git a/src/documents/tests/test_parsers.py b/src/documents/tests/test_parsers.py index f49d6ca4d..5896f3ba3 100644 --- a/src/documents/tests/test_parsers.py +++ b/src/documents/tests/test_parsers.py @@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase): pass m.return_value = ( - (None, lambda _: {"weight": 0, "parser": DummyParser}), + (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}), ) self.assertEqual( @@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase): pass m.return_value = ( - (None, lambda _: {"weight": 0, "parser": DummyParser1}), - (None, lambda _: {"weight": 1, "parser": DummyParser2}), + (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}), + (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}), ) self.assertEqual( @@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase): @mock.patch("documents.parsers.document_consumer_declaration.send") def test__get_parser_class_0_parsers(self, m, *args): - m.return_value = ((None, lambda _: None),) + m.return_value = [] with TemporaryDirectory() as tmpdir: self.assertIsNone( get_parser_class("doc.pdf") diff --git a/src/paperless_tesseract/apps.py b/src/paperless_tesseract/apps.py index bdb430bea..67b90f006 100644 --- a/src/paperless_tesseract/apps.py +++ b/src/paperless_tesseract/apps.py @@ -1,5 +1,7 @@ from django.apps import AppConfig +from paperless_tesseract.signals import tesseract_consumer_declaration + class PaperlessTesseractConfig(AppConfig): @@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig): from documents.signals import document_consumer_declaration - from .signals import ConsumerDeclaration - - document_consumer_declaration.connect(ConsumerDeclaration.handle) + 
document_consumer_declaration.connect(tesseract_consumer_declaration) AppConfig.ready(self) diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 237f15c52..3fc6c2a11 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -3,21 +3,16 @@ import re from .parsers import RasterisedDocumentParser -class ConsumerDeclaration: +def tesseract_consumer_declaration(sender, **kwargs): + return { + "parser": RasterisedDocumentParser, + "weight": 0, + "test": tesseract_consumer_test + } - MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") - @classmethod - def handle(cls, sender, **kwargs): - return cls.test +MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$") - @classmethod - def test(cls, doc): - if cls.MATCHING_FILES.match(doc.lower()): - return { - "parser": RasterisedDocumentParser, - "weight": 0 - } - - return None +def tesseract_consumer_test(doc): + return MATCHING_FILES.match(doc.lower()) diff --git a/src/paperless_tesseract/tests/test_signals.py b/src/paperless_tesseract/tests/test_signals.py index b5ff4da59..354557732 100644 --- a/src/paperless_tesseract/tests/test_signals.py +++ b/src/paperless_tesseract/tests/test_signals.py @@ -1,6 +1,6 @@ from django.test import TestCase -from ..signals import ConsumerDeclaration +from paperless_tesseract.signals import tesseract_consumer_test class SignalsTestCase(TestCase): @@ -20,7 +20,7 @@ class SignalsTestCase(TestCase): for prefix in prefixes: for suffix in suffixes: name = "{}.{}".format(prefix, suffix) - self.assertTrue(ConsumerDeclaration.test(name)) + self.assertTrue(tesseract_consumer_test(name)) def test_test_handles_various_file_names_false(self): @@ -30,7 +30,7 @@ class SignalsTestCase(TestCase): for prefix in prefixes: for suffix in suffixes: name = "{}.{}".format(prefix, suffix) - self.assertFalse(ConsumerDeclaration.test(name)) + self.assertFalse(tesseract_consumer_test(name)) - 
self.assertFalse(ConsumerDeclaration.test("")) - self.assertFalse(ConsumerDeclaration.test("doc")) + self.assertFalse(tesseract_consumer_test("")) + self.assertFalse(tesseract_consumer_test("doc")) diff --git a/src/paperless_text/apps.py b/src/paperless_text/apps.py index 389167368..1acc361aa 100644 --- a/src/paperless_text/apps.py +++ b/src/paperless_text/apps.py @@ -1,5 +1,7 @@ from django.apps import AppConfig +from paperless_text.signals import text_consumer_declaration + class PaperlessTextConfig(AppConfig): @@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig): from documents.signals import document_consumer_declaration - from .signals import ConsumerDeclaration - - document_consumer_declaration.connect(ConsumerDeclaration.handle) + document_consumer_declaration.connect(text_consumer_declaration) AppConfig.ready(self) diff --git a/src/paperless_text/signals.py b/src/paperless_text/signals.py index ae5a005e1..784bfd45d 100644 --- a/src/paperless_text/signals.py +++ b/src/paperless_text/signals.py @@ -3,21 +3,16 @@ import re from .parsers import TextDocumentParser -class ConsumerDeclaration: +def text_consumer_declaration(sender, **kwargs): + return { + "parser": TextDocumentParser, + "weight": 10, + "test": text_consumer_test + } - MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$") - @classmethod - def handle(cls, sender, **kwargs): - return cls.test +MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$") - @classmethod - def test(cls, doc): - if cls.MATCHING_FILES.match(doc.lower()): - return { - "parser": TextDocumentParser, - "weight": 10 - } - - return None +def text_consumer_test(doc): + return MATCHING_FILES.match(doc.lower())