Restructured the consumer into a loop and added a check for scans in-progress

This commit is contained in:
Daniel Quinn 2015-12-21 02:44:24 +00:00
parent 855ee64097
commit 802e9d7045

View File

@ -4,6 +4,7 @@ import random
import re
import shutil
import subprocess
import time
import pyocr
@ -35,8 +36,11 @@ class Command(BaseCommand):
MEDIA_IMG = os.path.join(settings.MEDIA_ROOT, "documents", "img")
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")
def __init__(self, *args, **kwargs):
self.verbosity = 0
self.stats = {}
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
@ -45,18 +49,32 @@ class Command(BaseCommand):
self._setup()
try:
while True:
self.loop()
time.sleep(10)
print(".")
except KeyboardInterrupt:
print("Exiting")
def loop(self):
for pdf in os.listdir(self.CONSUME):
if not os.path.isfile(os.path.join(self.CONSUME, pdf)):
pdf = os.path.join(self.CONSUME, pdf)
if not os.path.isfile(pdf):
continue
if not pdf.endswith(".pdf"):
continue
if self._is_ready(pdf):
continue
if self.verbosity > 1:
print("Consuming {}".format(pdf))
pdf = os.path.join(self.CONSUME, pdf)
pngs = self._get_greyscale(pdf)
jpgs = self._get_colour(pdf)
text = self._get_ocr(pngs)
@ -71,6 +89,22 @@ class Command(BaseCommand):
except FileExistsError:
pass
def _is_ready(self, pdf):
"""
Detect whether `pdf` is ready to consume or if it's still being written
to by the scanner.
"""
t = os.stat(pdf).st_mtime
if self.stats.get(pdf) == t:
del(self.stats[pdf])
return True
self.stats[pdf] = t
return False
def _get_greyscale(self, pdf):
i = random.randint(1000000, 4999999)
@ -104,13 +138,28 @@ class Command(BaseCommand):
def _store(self, text, jpgs, pdf):
doc = Document.objects.create(content=text)
sender, title = self._parse_file_name(pdf)
doc = Document.objects.create(sender=sender, title=title, content=text)
shutil.move(jpgs[0], os.path.join(
self.MEDIA_IMG, "{:07}.jpg".format(doc.pk)))
shutil.move(pdf, os.path.join(
self.MEDIA_PDF, "{:07}.pdf".format(doc.pk)))
def _parse_file_name(self, pdf):
"""
We use a crude naming convention to make handling the sender and title
easier:
"sender - title.pdf"
"""
m = re.match(self.PARSER_REGEX, pdf)
if m:
return m.group(1), m.group(2)
return "", ""
def _cleanup(self, pngs, jpgs):
jpg_glob = os.path.join(