mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Changed the way parsers are discovered. This also prepares for upcoming changes regarding content types and file types: parsers should declare what they support, and actual file extensions should not be hardcoded everywhere.
This commit is contained in:
parent
70d8e8bc56
commit
d2e22e3f27
@ -41,15 +41,16 @@ def get_parser_class(doc):
|
|||||||
Determine the appropriate parser class based on the file
|
Determine the appropriate parser class based on the file
|
||||||
"""
|
"""
|
||||||
|
|
||||||
parsers = []
|
|
||||||
for response in document_consumer_declaration.send(None):
|
|
||||||
parsers.append(response[1])
|
|
||||||
|
|
||||||
options = []
|
options = []
|
||||||
for parser in parsers:
|
|
||||||
result = parser(doc)
|
# His last command was: COME! And they came. All of them. Even the parsers.
|
||||||
if result:
|
|
||||||
options.append(result)
|
for response in document_consumer_declaration.send(None):
|
||||||
|
parser_declaration = response[1]
|
||||||
|
parser_test = parser_declaration["test"]
|
||||||
|
|
||||||
|
if parser_test(doc):
|
||||||
|
options.append(parser_declaration)
|
||||||
|
|
||||||
if not options:
|
if not options:
|
||||||
return None
|
return None
|
||||||
|
@ -14,7 +14,7 @@ class TestParserDiscovery(TestCase):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
m.return_value = (
|
m.return_value = (
|
||||||
(None, lambda _: {"weight": 0, "parser": DummyParser}),
|
(None, {"weight": 0, "parser": DummyParser, "test": lambda _: True}),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@ -32,8 +32,8 @@ class TestParserDiscovery(TestCase):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
m.return_value = (
|
m.return_value = (
|
||||||
(None, lambda _: {"weight": 0, "parser": DummyParser1}),
|
(None, {"weight": 0, "parser": DummyParser1, "test": lambda _: True}),
|
||||||
(None, lambda _: {"weight": 1, "parser": DummyParser2}),
|
(None, {"weight": 1, "parser": DummyParser2, "test": lambda _: True}),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@ -43,7 +43,7 @@ class TestParserDiscovery(TestCase):
|
|||||||
|
|
||||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||||
def test__get_parser_class_0_parsers(self, m, *args):
|
def test__get_parser_class_0_parsers(self, m, *args):
|
||||||
m.return_value = ((None, lambda _: None),)
|
m.return_value = []
|
||||||
with TemporaryDirectory() as tmpdir:
|
with TemporaryDirectory() as tmpdir:
|
||||||
self.assertIsNone(
|
self.assertIsNone(
|
||||||
get_parser_class("doc.pdf")
|
get_parser_class("doc.pdf")
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
from paperless_tesseract.signals import tesseract_consumer_declaration
|
||||||
|
|
||||||
|
|
||||||
class PaperlessTesseractConfig(AppConfig):
|
class PaperlessTesseractConfig(AppConfig):
|
||||||
|
|
||||||
@ -9,8 +11,6 @@ class PaperlessTesseractConfig(AppConfig):
|
|||||||
|
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
|
|
||||||
from .signals import ConsumerDeclaration
|
document_consumer_declaration.connect(tesseract_consumer_declaration)
|
||||||
|
|
||||||
document_consumer_declaration.connect(ConsumerDeclaration.handle)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
AppConfig.ready(self)
|
||||||
|
@ -3,21 +3,16 @@ import re
|
|||||||
from .parsers import RasterisedDocumentParser
|
from .parsers import RasterisedDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class ConsumerDeclaration:
|
def tesseract_consumer_declaration(sender, **kwargs):
|
||||||
|
return {
|
||||||
|
"parser": RasterisedDocumentParser,
|
||||||
|
"weight": 0,
|
||||||
|
"test": tesseract_consumer_test
|
||||||
|
}
|
||||||
|
|
||||||
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
|
||||||
|
|
||||||
@classmethod
|
MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")
|
||||||
def handle(cls, sender, **kwargs):
|
|
||||||
return cls.test
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def test(cls, doc):
|
|
||||||
|
|
||||||
if cls.MATCHING_FILES.match(doc.lower()):
|
def tesseract_consumer_test(doc):
|
||||||
return {
|
return MATCHING_FILES.match(doc.lower())
|
||||||
"parser": RasterisedDocumentParser,
|
|
||||||
"weight": 0
|
|
||||||
}
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
|
|
||||||
from ..signals import ConsumerDeclaration
|
from paperless_tesseract.signals import tesseract_consumer_test
|
||||||
|
|
||||||
|
|
||||||
class SignalsTestCase(TestCase):
|
class SignalsTestCase(TestCase):
|
||||||
@ -20,7 +20,7 @@ class SignalsTestCase(TestCase):
|
|||||||
for prefix in prefixes:
|
for prefix in prefixes:
|
||||||
for suffix in suffixes:
|
for suffix in suffixes:
|
||||||
name = "{}.{}".format(prefix, suffix)
|
name = "{}.{}".format(prefix, suffix)
|
||||||
self.assertTrue(ConsumerDeclaration.test(name))
|
self.assertTrue(tesseract_consumer_test(name))
|
||||||
|
|
||||||
def test_test_handles_various_file_names_false(self):
|
def test_test_handles_various_file_names_false(self):
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ class SignalsTestCase(TestCase):
|
|||||||
for prefix in prefixes:
|
for prefix in prefixes:
|
||||||
for suffix in suffixes:
|
for suffix in suffixes:
|
||||||
name = "{}.{}".format(prefix, suffix)
|
name = "{}.{}".format(prefix, suffix)
|
||||||
self.assertFalse(ConsumerDeclaration.test(name))
|
self.assertFalse(tesseract_consumer_test(name))
|
||||||
|
|
||||||
self.assertFalse(ConsumerDeclaration.test(""))
|
self.assertFalse(tesseract_consumer_test(""))
|
||||||
self.assertFalse(ConsumerDeclaration.test("doc"))
|
self.assertFalse(tesseract_consumer_test("doc"))
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
from paperless_text.signals import text_consumer_declaration
|
||||||
|
|
||||||
|
|
||||||
class PaperlessTextConfig(AppConfig):
|
class PaperlessTextConfig(AppConfig):
|
||||||
|
|
||||||
@ -9,8 +11,6 @@ class PaperlessTextConfig(AppConfig):
|
|||||||
|
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
|
|
||||||
from .signals import ConsumerDeclaration
|
document_consumer_declaration.connect(text_consumer_declaration)
|
||||||
|
|
||||||
document_consumer_declaration.connect(ConsumerDeclaration.handle)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
AppConfig.ready(self)
|
||||||
|
@ -3,21 +3,16 @@ import re
|
|||||||
from .parsers import TextDocumentParser
|
from .parsers import TextDocumentParser
|
||||||
|
|
||||||
|
|
||||||
class ConsumerDeclaration:
|
def text_consumer_declaration(sender, **kwargs):
|
||||||
|
return {
|
||||||
|
"parser": TextDocumentParser,
|
||||||
|
"weight": 10,
|
||||||
|
"test": text_consumer_test
|
||||||
|
}
|
||||||
|
|
||||||
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
|
||||||
|
|
||||||
@classmethod
|
MATCHING_FILES = re.compile(r"^.*\.(te?xt|md|csv)$")
|
||||||
def handle(cls, sender, **kwargs):
|
|
||||||
return cls.test
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def test(cls, doc):
|
|
||||||
|
|
||||||
if cls.MATCHING_FILES.match(doc.lower()):
|
def text_consumer_test(doc):
|
||||||
return {
|
return MATCHING_FILES.match(doc.lower())
|
||||||
"parser": TextDocumentParser,
|
|
||||||
"weight": 10
|
|
||||||
}
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user