From 11701391279b17995bd2becd7cc742f8ba3ff3aa Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Mon, 14 Mar 2016 21:20:44 +0000 Subject: [PATCH] Added a consume-start and consume-finish signal --- src/documents/consumer.py | 19 +++++++++++++++++-- src/documents/signals.py | 4 ++++ 2 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 src/documents/signals.py diff --git a/src/documents/consumer.py b/src/documents/consumer.py index fbdbbc276..244383211 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -26,6 +26,8 @@ from paperless.db import GnuPG from .models import Correspondent, Tag, Document, Log from .languages import ISO639 +from .signals import ( + document_consumption_started, document_consumption_finished) class OCRError(Exception): @@ -118,22 +120,33 @@ class Consumer(object): self.log("info", "Consuming {}".format(doc)) + document_consumption_started.send( + sender=self.__class__, filename=doc) + tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH) imgs = self._get_greyscale(tempdir, doc) thumbnail = self._get_thumbnail(tempdir, doc) try: - text = self._get_ocr(imgs) - self._store(text, doc, thumbnail) + + document = self._store(self._get_ocr(imgs), doc, thumbnail) + except OCRError as e: + self._ignore.append(doc) self.log("error", "OCR FAILURE for {}: {}".format(doc, e)) self._cleanup_tempdir(tempdir) + continue + else: + self._cleanup_tempdir(tempdir) self._cleanup_doc(doc) + document_consumption_finished.send( + sender=self.__class__, document=document) + def _get_greyscale(self, tempdir, doc): """ Greyscale images are easier for Tesseract to OCR @@ -360,6 +373,8 @@ class Consumer(object): self.log("info", "Completed") + return document + def _cleanup_tempdir(self, d): self.log("debug", "Deleting directory {}".format(d)) shutil.rmtree(d) diff --git a/src/documents/signals.py b/src/documents/signals.py new file mode 100644 index 000000000..257a20d46 --- /dev/null +++ b/src/documents/signals.py @@ -0,0 
+1,4 @@ +from django.dispatch import Signal + +document_consumption_started = Signal(providing_args=["filename"]) +document_consumption_finished = Signal(providing_args=["document"])