Changed the way parsers are discovered. This also prepares for upcoming changes regarding content types and file types: parsers should declare what they support, and actual file extensions should not be hardcoded everywhere.

This commit is contained in:
Jonas Winkler 2020-11-16 23:53:12 +01:00
parent 70d8e8bc56
commit d2e22e3f27
7 changed files with 42 additions and 51 deletions

View File

@ -41,15 +41,16 @@ def get_parser_class(doc):
Determine the appropriate parser class based on the file Determine the appropriate parser class based on the file
""" """
parsers = []
for response in document_consumer_declaration.send(None):
parsers.append(response[1])
options = [] options = []
for parser in parsers:
result = parser(doc) # His last command was: COME! And they came. All of them. Even the parsers.
if result:
options.append(result) for response in document_consumer_declaration.send(None):
parser_declaration = response[1]
parser_test = parser_declaration["test"]
if parser_test(doc):
options.append(parser_declaration)
if not options: if not options:
return None return None

View File

@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
pass pass
m.return_value = ( m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser}), (None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
) )
self.assertEqual( self.assertEqual(
@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
pass pass
m.return_value = ( m.return_value = (
(None, lambda _: {"weight": 0, "parser": DummyParser1}), (None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
(None, lambda _: {"weight": 1, "parser": DummyParser2}), (None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
) )
self.assertEqual( self.assertEqual(
@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):
@mock.patch("documents.parsers.document_consumer_declaration.send") @mock.patch("documents.parsers.document_consumer_declaration.send")
def test__get_parser_class_0_parsers(self, m, *args): def test__get_parser_class_0_parsers(self, m, *args):
m.return_value = ((None, lambda _: None),) m.return_value = []
with TemporaryDirectory() as tmpdir: with TemporaryDirectory() as tmpdir:
self.assertIsNone( self.assertIsNone(
get_parser_class("doc.pdf") get_parser_class("doc.pdf")

View File

@ -1,5 +1,7 @@
from django.apps import AppConfig from django.apps import AppConfig
from paperless_tesseract.signals import tesseract_consumer_declaration
class PaperlessTesseractConfig(AppConfig): class PaperlessTesseractConfig(AppConfig):
@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):
from documents.signals import document_consumer_declaration from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration document_consumer_declaration.connect(tesseract_consumer_declaration)
document_consumer_declaration.connect(ConsumerDeclaration.handle)
AppConfig.ready(self) AppConfig.ready(self)

View File

@ -3,21 +3,16 @@ import re
from .parsers import RasterisedDocumentParser from .parsers import RasterisedDocumentParser
class ConsumerDeclaration: def tesseract_consumer_declaration(sender, **kwargs):
return {
"parser": RasterisedDocumentParser,
"weight": 0,
"test": tesseract_consumer_test
}
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
@classmethod MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
def handle(cls, sender, **kwargs):
return cls.test
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()): def tesseract_consumer_test(doc):
return { return MATCHING_FILES.match(doc.lower())
"parser": RasterisedDocumentParser,
"weight": 0
}
return None

View File

@ -1,6 +1,6 @@
from django.test import TestCase from django.test import TestCase
from ..signals import ConsumerDeclaration from paperless_tesseract.signals import tesseract_consumer_test
class SignalsTestCase(TestCase): class SignalsTestCase(TestCase):
@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
for prefix in prefixes: for prefix in prefixes:
for suffix in suffixes: for suffix in suffixes:
name = "{}.{}".format(prefix, suffix) name = "{}.{}".format(prefix, suffix)
self.assertTrue(ConsumerDeclaration.test(name)) self.assertTrue(tesseract_consumer_test(name))
def test_test_handles_various_file_names_false(self): def test_test_handles_various_file_names_false(self):
@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
for prefix in prefixes: for prefix in prefixes:
for suffix in suffixes: for suffix in suffixes:
name = "{}.{}".format(prefix, suffix) name = "{}.{}".format(prefix, suffix)
self.assertFalse(ConsumerDeclaration.test(name)) self.assertFalse(tesseract_consumer_test(name))
self.assertFalse(ConsumerDeclaration.test("")) self.assertFalse(tesseract_consumer_test(""))
self.assertFalse(ConsumerDeclaration.test("doc")) self.assertFalse(tesseract_consumer_test("doc"))

View File

@ -1,5 +1,7 @@
from django.apps import AppConfig from django.apps import AppConfig
from paperless_text.signals import text_consumer_declaration
class PaperlessTextConfig(AppConfig): class PaperlessTextConfig(AppConfig):
@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):
from documents.signals import document_consumer_declaration from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration document_consumer_declaration.connect(text_consumer_declaration)
document_consumer_declaration.connect(ConsumerDeclaration.handle)
AppConfig.ready(self) AppConfig.ready(self)

View File

@ -3,21 +3,16 @@ import re
from .parsers import TextDocumentParser from .parsers import TextDocumentParser
class ConsumerDeclaration: def text_consumer_declaration(sender, **kwargs):
return {
"parser": TextDocumentParser,
"weight": 10,
"test": text_consumer_test
}
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
@classmethod MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
def handle(cls, sender, **kwargs):
return cls.test
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc.lower()): def text_consumer_test(doc):
return { return MATCHING_FILES.match(doc.lower())
"parser": TextDocumentParser,
"weight": 10
}
return None