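Changed the way parsers are discovered: the declaration handler in signals.py now returns a single dict containing the parser class, its weight, and a test callable, instead of returning a test function that in turn produced the parser/weight dict for matching files. This also prepares for upcoming changes regarding content types and file types: parsers should declare what they support, and actual file extensions should not be hard-coded everywhere.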

2026-02-26 01:09:34 -06:00 · 2020-11-16 23:53:12 +01:00
parent e30f0b274b
commit 9a48d6c577
7 changed files with 42 additions and 51 deletions
--- a/src/paperless_tesseract/signals.py
+++ b/src/paperless_tesseract/signals.py
@@ -3,21 +3,16 @@ import re
 from .parsers import RasterisedDocumentParser


-class ConsumerDeclaration:
+def tesseract_consumer_declaration(sender, **kwargs):
+    return {
+        "parser": RasterisedDocumentParser,
+        "weight": 0,
+        "test": tesseract_consumer_test
+    }

-    MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")

-    @classmethod
-    def handle(cls, sender, **kwargs):
-        return cls.test
+MATCHING_FILES = re.compile(r"^.*\.(pdf|jpe?g|gif|png|tiff?|pnm|bmp)$")

-    @classmethod
-    def test(cls, doc):

-        if cls.MATCHING_FILES.match(doc.lower()):
-            return {
-                "parser": RasterisedDocumentParser,
-                "weight": 0
-            }
-
-        return None
+def tesseract_consumer_test(doc):
+    return MATCHING_FILES.match(doc.lower())