Backend that supports ASGI and status-update websockets via Django Channels

This commit is contained in:
Jonas Winkler
2020-11-07 11:30:45 +01:00
parent e45208bf01
commit 572e40ca27
7 changed files with 613 additions and 92 deletions

View File

@@ -5,6 +5,8 @@ import os
import re
import uuid
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.utils import timezone
@@ -33,6 +35,17 @@ class Consumer:
5. Delete the document and image(s)
"""
def _send_progress(self, filename, current_progress, max_progress, status, message, document_id=None):
    """Broadcast a consumption progress update to all websocket clients.

    Builds a status payload and sends it to the ``status_updates`` channel
    group, where StatusConsumer instances relay it to connected browsers.

    Args:
        filename: path of the document being consumed; only the base name
            is sent to clients.
        current_progress: progress value reached so far.
        max_progress: value representing completion.
        status: pipeline state string (e.g. 'WORKING', 'SUCCESS', 'FAILED').
        message: human-readable description of the current step.
        document_id: id of the stored document, once known (None before
            storage completes).
    """
    data = {
        'filename': os.path.basename(filename),
        'current_progress': current_progress,
        'max_progress': max_progress,
        'status': status,
        'message': message,
        'document_id': document_id,
    }
    # group_send is a coroutine; bridge it into this synchronous consumer.
    group_send = async_to_sync(self.channel_layer.group_send)
    group_send("status_updates", {'type': 'status_update', 'data': data})
def __init__(self, consume=settings.CONSUMPTION_DIR,
scratch=settings.SCRATCH_DIR):
@@ -44,6 +57,8 @@ class Consumer:
self.classifier = DocumentClassifier()
self.channel_layer = get_channel_layer()
os.makedirs(self.scratch, exist_ok=True)
self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@@ -60,7 +75,6 @@ class Consumer:
raise ConsumerError(
"Consumption directory {} does not exist".format(self.consume))
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
@@ -88,6 +102,7 @@ class Consumer:
self.log("info", "Consuming {}".format(doc))
parser_class = get_parser_class(doc)
if not parser_class:
self.log(
@@ -96,6 +111,7 @@ class Consumer:
else:
self.log("info", "Parser: {}".format(parser_class.__name__))
self._send_progress(file, 0, 100, 'WORKING', 'Consumption started')
document_consumption_started.send(
sender=self.__class__,
@@ -103,20 +119,37 @@ class Consumer:
logging_group=self.logging_group
)
document_parser = parser_class(doc, self.logging_group)
def progress_callback(current_progress, max_progress, message):
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 60 + 20)
self._send_progress(file, p, 100, "WORKING", message)
document_parser = parser_class(doc, self.logging_group, progress_callback)
try:
self.log("info", "Generating thumbnail for {}...".format(doc))
self._send_progress(file, 10, 100, 'WORKING',
'Generating thumbnail...')
thumbnail = document_parser.get_optimised_thumbnail()
self._send_progress(file, 20, 100, 'WORKING',
'Getting text from document...')
text = document_parser.get_text()
self._send_progress(file, 80, 100, 'WORKING',
'Getting date from document...')
date = document_parser.get_date()
self._send_progress(file, 85, 100, 'WORKING',
'Storing the document...')
document = self._store(
document_parser.get_text(),
text,
doc,
thumbnail,
date
)
except ParseError as e:
self.log("fatal", "PARSE FAILURE for {}: {}".format(doc, e))
self._send_progress(file, 100, 100, 'FAILED',
"Failed: {}".format(e))
document_parser.cleanup()
return False
else:
@@ -136,12 +169,17 @@ class Consumer:
except (FileNotFoundError, IncompatibleClassifierVersionError) as e:
logging.getLogger(__name__).warning("Cannot classify documents: {}.".format(e))
self._send_progress(file, 90, 100, 'WORKING',
'Performing post-consumption tasks...')
document_consumption_finished.send(
sender=self.__class__,
document=document,
logging_group=self.logging_group,
classifier=classifier
)
self._send_progress(file, 100, 100, 'SUCCESS',
'Finished.', document.id)
return True
def _store(self, text, doc, thumbnail, date):

View File

@@ -106,11 +106,12 @@ class DocumentParser:
`paperless_tesseract.parsers` for inspiration.
"""
def __init__(self, path, logging_group):
def __init__(self, path, logging_group, progress_callback):
    """Set up parser state for a single document.

    Args:
        path: filesystem path of the document to parse.
        logging_group: uuid used to group related log records for this
            consumption run.
        progress_callback: callable(current, max, message) invoked by
            parser subclasses to report incremental progress.
    """
    self.document_path = path
    # Per-document scratch space; subclasses write intermediate images here.
    self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    self.logger = logging.getLogger(__name__)
    self.logging_group = logging_group
    self.progress_callback = progress_callback
def get_thumbnail(self):
"""

37
src/paperless/asgi.py Normal file
View File

@@ -0,0 +1,37 @@
import json
import os
from asgiref.sync import async_to_sync
from channels.auth import AuthMiddlewareStack
from channels.generic.websocket import WebsocketConsumer
from channels.routing import ProtocolTypeRouter, URLRouter
from django.core.asgi import get_asgi_application
from django.urls import re_path
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'paperless.settings')
class StatusConsumer(WebsocketConsumer):
    """Websocket consumer that relays document-consumption status updates.

    Clients connect to ws/status/ and join the ``status_updates`` channel
    group; the consumption pipeline broadcasts progress payloads to that
    group via group_send with type 'status_update'.
    """

    def _authenticated(self):
        # AuthMiddlewareStack (see routing below) populates scope["user"];
        # guard with a membership test in case the middleware is absent.
        return "user" in self.scope and self.scope["user"].is_authenticated

    def connect(self):
        # Security: status payloads include filenames and document ids, so
        # refuse anonymous connections instead of broadcasting to everyone.
        if not self._authenticated():
            self.close()
        else:
            self.accept()
            async_to_sync(self.channel_layer.group_add)(
                'status_updates', self.channel_name)

    def disconnect(self, close_code):
        # Leave the group; safe to call even if connect() never joined it.
        async_to_sync(self.channel_layer.group_discard)(
            'status_updates', self.channel_name)

    def status_update(self, event):
        # Handler for group messages of type 'status_update': forward the
        # payload to this client as JSON text.
        self.send(json.dumps(event['data']))
# Websocket routes handled by the ASGI application below.
websocket_urlpatterns = [
    re_path(r'ws/status/$', StatusConsumer.as_asgi()),
]

# Root ASGI application: plain Django handles HTTP, while websocket
# connections are routed through the auth middleware (which attaches the
# session user to the scope) before reaching StatusConsumer.
application = ProtocolTypeRouter({
    "http": get_asgi_application(),
    "websocket": AuthMiddlewareStack(
        URLRouter(
            websocket_urlpatterns
        )
    ),
})

View File

@@ -69,6 +69,8 @@ INSTALLED_APPS = [
"rest_framework.authtoken",
"django_filters",
"channels",
]
REST_FRAMEWORK = {
@@ -98,6 +100,7 @@ LOGIN_URL = "admin:login"
FORCE_SCRIPT_NAME = os.getenv("PAPERLESS_FORCE_SCRIPT_NAME")
WSGI_APPLICATION = 'paperless.wsgi.application'
ASGI_APPLICATION = "paperless.asgi.application"
STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", "/static/")
@@ -299,3 +302,12 @@ FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))
CHANNEL_LAYERS = {
"default": {
"BACKEND": "channels_redis.core.RedisChannelLayer",
"CONFIG": {
"hosts": [("127.0.0.1", 6379)],
},
},
}

View File

@@ -27,8 +27,8 @@ class RasterisedDocumentParser(DocumentParser):
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
def __init__(self, path, logging_group):
super().__init__(path, logging_group)
def __init__(self, path, logging_group, progress_callback):
    # Forward the progress callback to DocumentParser so OCR steps can
    # report per-page progress (see the report_progress flag in _ocr).
    super().__init__(path, logging_group, progress_callback)
    # Cached OCR result; populated lazily by get_text().
    self._text = None
def get_thumbnail(self):
@@ -91,6 +91,7 @@ class RasterisedDocumentParser(DocumentParser):
self._text = get_text_from_pdf(self.document_path)
return self._text
self.progress_callback(0,1,"Making greyscale images.")
images = self._get_greyscale()
if not images:
@@ -100,8 +101,10 @@ class RasterisedDocumentParser(DocumentParser):
sample_page_index = int(len(images) / 2)
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index+1, len(images)))
self.progress_callback(0.4, 1, "Language Detection.")
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
self.progress_callback(0.6, 1, "OCR all the pages.")
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed.")
@@ -117,7 +120,7 @@ class RasterisedDocumentParser(DocumentParser):
else:
self.log("info", "Detected language: {}".format(guessed_language))
ocr_pages = self._ocr(images, ISO639[guessed_language])
ocr_pages = self._ocr(images, ISO639[guessed_language], report_progress=True)
self.log("info", "OCR completed.")
self._text = strip_excess_whitespace(" ".join(ocr_pages))
@@ -151,6 +154,8 @@ class RasterisedDocumentParser(DocumentParser):
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
self.progress_callback(0.2,1, "Running unpaper on {} pages...".format(len(pnms)))
# Run unpaper in parallel on converted images
with Pool(processes=settings.OCR_THREADS) as pool:
pnms = pool.map(run_unpaper, pnms)
@@ -165,11 +170,16 @@ class RasterisedDocumentParser(DocumentParser):
self.log('debug', "Language detection failed with: {}".format(e))
return None
def _ocr(self, imgs, lang):
def _ocr(self, imgs, lang, report_progress=False):
    """Run OCR over the given page images in a worker pool.

    Args:
        imgs: list of page image paths to recognize.
        lang: tesseract language code handed to image_to_string.
        report_progress: when True, map per-page completion into the
            0.6-1.0 range of the overall pipeline and forward it to
            progress_callback after each page.

    Returns:
        List of recognized text strings, one per page, in input order.
    """
    self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
    pages = []
    with Pool(processes=settings.OCR_THREADS) as pool:
        # imap (rather than map) yields results one at a time, in order,
        # which lets us report progress after each page completes.
        for i, page in enumerate(pool.imap(image_to_string, itertools.product(imgs, [lang]))):
            if report_progress:
                self.progress_callback(0.6 + (i / len(imgs)) * 0.4, 1, "OCR'ed {} pages".format(i + 1))
            pages.append(page)
    return pages
def _complete_ocr_default_language(self, images, sample_page_index, sample_page):
"""
@@ -182,14 +192,13 @@ class RasterisedDocumentParser(DocumentParser):
del images_copy[sample_page_index]
if images_copy:
self.log('info', 'Continuing ocr with default language.')
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE, report_progress=True)
ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages
else:
return [sample_page]
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(