New logging appears to work

2026-01-18 22:14:22 -06:00 · 2016-02-27 20:18:50 +00:00
parent e149baec4e
commit 2fe9b0cbc1
16 changed files with 346 additions and 188 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,5 +1,8 @@
 import datetime
+import logging
 import tempfile
+import uuid
+
 from multiprocessing.pool import Pool

 import itertools
@@ -19,10 +22,9 @@ from django.utils import timezone
 from django.template.defaultfilters import slugify
 from pyocr.tesseract import TesseractError

-from logger.models import Log
 from paperless.db import GnuPG

-from .models import Sender, Tag, Document
+from .models import Sender, Tag, Document, Log
 from .languages import ISO639


@@ -67,6 +69,8 @@ class Consumer(object):
    def __init__(self, verbosity=1):

        self.verbosity = verbosity
+        self.logger = logging.getLogger(__name__)
+        self.logging_group = None

        try:
            os.makedirs(self.SCRATCH)
@@ -86,6 +90,12 @@ class Consumer(object):
            raise ConsumerError(
                "Consumption directory {} does not exist".format(self.CONSUME))

+    def log(self, level, message):
+        getattr(self.logger, level)(message, extra={
+            "group": self.logging_group,
+            "component": Log.COMPONENT_CONSUMER
+        })
+
    def consume(self):

        for doc in os.listdir(self.CONSUME):
@@ -104,7 +114,9 @@ class Consumer(object):
            if self._is_ready(doc):
                continue

-            Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
+            self.logging_group = uuid.uuid4()
+
+            self.log("info", "Consuming {}".format(doc))

            tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
            pngs = self._get_greyscale(tempdir, doc)
@@ -114,8 +126,7 @@ class Consumer(object):
                self._store(text, doc)
            except OCRError:
                self._ignore.append(doc)
-                Log.error(
-                    "OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
+                self.log("error", "OCR FAILURE: {}".format(doc))
                self._cleanup_tempdir(tempdir)
                continue
            else:
@@ -124,10 +135,7 @@ class Consumer(object):

    def _get_greyscale(self, tempdir, doc):

-        Log.debug(
-            "Generating greyscale image from {}".format(doc),
-            Log.COMPONENT_CONSUMER
-        )
+        self.log("info", "Generating greyscale image from {}".format(doc))

        png = os.path.join(tempdir, "convert-%04d.jpg")

@@ -143,18 +151,13 @@ class Consumer(object):

        return sorted(filter(lambda __: os.path.isfile(__), pngs))

-    @staticmethod
-    def _guess_language(text):
+    def _guess_language(self, text):
        try:
            guess = langdetect.detect(text)
-            Log.debug(
-                "Language detected: {}".format(guess),
-                Log.COMPONENT_CONSUMER
-            )
+            self.log("debug", "Language detected: {}".format(guess))
            return guess
        except Exception as e:
-            Log.warning(
-                "Language detection error: {}".format(e), Log.COMPONENT_MAIL)
+            self.log("warning", "Language detection error: {}".format(e))

    def _get_ocr(self, pngs):
        """
@@ -165,7 +168,7 @@ class Consumer(object):
        if not pngs:
            raise OCRError

-        Log.debug("OCRing the document", Log.COMPONENT_CONSUMER)
+        self.log("info", "OCRing the document")

        # Since the division gets rounded down by int, this calculation works
        # for every edge-case, i.e. 1
@@ -175,12 +178,12 @@ class Consumer(object):
        guessed_language = self._guess_language(raw_text)

        if not guessed_language or guessed_language not in ISO639:
-            Log.warning("Language detection failed!", Log.COMPONENT_CONSUMER)
+            self.log("warning", "Language detection failed!")
            if settings.FORGIVING_OCR:
-                Log.warning(
+                self.log(
+                    "warning",
                    "As FORGIVING_OCR is enabled, we're going to make the "
-                    "best with what we have.",
-                    Log.COMPONENT_CONSUMER
+                    "best with what we have."
                )
                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
                return raw_text
@@ -194,12 +197,12 @@ class Consumer(object):
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
-                Log.warning(
+                self.log(
+                    "warning",
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
-                    ),
-                    Log.COMPONENT_CONSUMER
+                    )
                )
                raw_text = self._assemble_ocr_sections(pngs, middle, raw_text)
                return raw_text
@@ -222,28 +225,15 @@ class Consumer(object):
        if not pngs:
            return ""

-        Log.debug("Parsing for {}".format(lang), Log.COMPONENT_CONSUMER)
+        self.log("info", "Parsing for {}".format(lang))

        with Pool(processes=self.THREADS) as pool:
-            r = pool.map(
-                self.image_to_string, itertools.product(pngs, [lang]))
+            r = pool.map(image_to_string, itertools.product(pngs, [lang]))
            r = " ".join(r)

        # Strip out excess white space to allow matching to go smoother
        return re.sub(r"\s+", " ", r)

-    def image_to_string(self, args):
-        png, lang = args
-        ocr = pyocr.get_available_tools()[0]
-        with Image.open(os.path.join(self.SCRATCH, png)) as f:
-            if ocr.can_detect_orientation():
-                try:
-                    orientation = ocr.detect_orientation(f, lang=lang)
-                    f = f.rotate(orientation["angle"], expand=1)
-                except TesseractError:
-                    pass
-            return ocr.image_to_string(f, lang=lang)
-
    def _guess_attributes_from_name(self, parseable):
        """
        We use a crude naming convention to make handling the sender, title,
@@ -301,7 +291,7 @@ class Consumer(object):

        stats = os.stat(doc)

-        Log.debug("Saving record to database", Log.COMPONENT_CONSUMER)
+        self.log("debug", "Saving record to database")

        document = Document.objects.create(
            sender=sender,
@@ -316,23 +306,22 @@ class Consumer(object):

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
-            Log.debug(
-                "Tagging with {}".format(tag_names), Log.COMPONENT_CONSUMER)
+            self.log("debug", "Tagging with {}".format(tag_names))
            document.tags.add(*relevant_tags)

        with open(doc, "rb") as unencrypted:
            with open(document.source_path, "wb") as encrypted:
-                Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
+                self.log("debug", "Encrypting")
                encrypted.write(GnuPG.encrypted(unencrypted))

-    @staticmethod
-    def _cleanup_tempdir(d):
-        Log.debug("Deleting directory {}".format(d), Log.COMPONENT_CONSUMER)
+        self.log("info", "Completed")
+
+    def _cleanup_tempdir(self, d):
+        self.log("debug", "Deleting directory {}".format(d))
        shutil.rmtree(d)

-    @staticmethod
-    def _cleanup_doc(doc):
-        Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
+    def _cleanup_doc(self, doc):
+        self.log("debug", "Deleting document {}".format(doc))
        os.unlink(doc)

    def _is_ready(self, doc):
@@ -350,3 +339,23 @@ class Consumer(object):
        self.stats[doc] = t

        return False
+
+
+def image_to_string(args):
+    """
+    Must stay at module level: as a Consumer method, Pool.map would pickle
+    self (presumably incl. its logger's stream handlers) and explode with:
+
+      `TypeError: cannot serialize '_io.TextIOWrapper' object`.
+    """
+
+    png, lang = args
+    ocr = pyocr.get_available_tools()[0]
+    with Image.open(os.path.join(Consumer.SCRATCH, png)) as f:
+        if ocr.can_detect_orientation():
+            try:
+                orientation = ocr.detect_orientation(f, lang=lang)
+                f = f.rotate(orientation["angle"], expand=1)
+            except TesseractError:
+                pass
+        return ocr.image_to_string(f, lang=lang)