Added a consume-start and consume-finish signal

This commit is contained in:
Daniel Quinn 2016-03-14 21:20:44 +00:00
parent 8f9e34078b
commit 1170139127
2 changed files with 21 additions and 2 deletions

View File

@ -26,6 +26,8 @@ from paperless.db import GnuPG
from .models import Correspondent, Tag, Document, Log
from .languages import ISO639
from .signals import (
document_consumption_started, document_consumption_finished)
class OCRError(Exception):
@ -118,22 +120,33 @@ class Consumer(object):
self.log("info", "Consuming {}".format(doc))
document_consumption_started.send(
sender=self.__class__, filename=doc)
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
imgs = self._get_greyscale(tempdir, doc)
thumbnail = self._get_thumbnail(tempdir, doc)
try:
text = self._get_ocr(imgs)
self._store(text, doc, thumbnail)
document = self._store(self._get_ocr(imgs), doc, thumbnail)
except OCRError as e:
self._ignore.append(doc)
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
self._cleanup_tempdir(tempdir)
continue
else:
self._cleanup_tempdir(tempdir)
self._cleanup_doc(doc)
document_consumption_finished.send(
sender=self.__class__, filename=document)
def _get_greyscale(self, tempdir, doc):
"""
Greyscale images are easier for Tesseract to OCR
@ -360,6 +373,8 @@ class Consumer(object):
self.log("info", "Completed")
return document
def _cleanup_tempdir(self, d):
self.log("debug", "Deleting directory {}".format(d))
shutil.rmtree(d)

4
src/documents/signals.py Normal file
View File

@ -0,0 +1,4 @@
from django.dispatch import Signal
document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])