Merge branch 'pitkley-fix/secure-temporary-files'

This commit is contained in:
Daniel Quinn 2016-02-17 00:10:54 +00:00
commit 52f242574f

View File

@@ -1,15 +1,16 @@
import datetime import datetime
import glob import tempfile
from multiprocessing.pool import Pool from multiprocessing.pool import Pool
import itertools import itertools
import langdetect import langdetect
import os import os
import random
import re import re
import subprocess import subprocess
import pyocr import pyocr
import shutil
from PIL import Image from PIL import Image
@@ -111,34 +112,35 @@ class Consumer(object):
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
pngs = self._get_greyscale(doc) tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
pngs = self._get_greyscale(tempdir, doc)
try: try:
text = self._get_ocr(pngs) text = self._get_ocr(pngs)
self._store(text, doc)
except OCRError: except OCRError:
self._ignore.append(doc) self._ignore.append(doc)
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
continue continue
finally:
self._cleanup(tempdir, doc)
self._store(text, doc) def _get_greyscale(self, tempdir, doc):
self._cleanup(pngs, doc)
def _get_greyscale(self, doc):
Log.debug( Log.debug(
"Generating greyscale image from {}".format(doc), "Generating greyscale image from {}".format(doc),
Log.COMPONENT_CONSUMER Log.COMPONENT_CONSUMER
) )
i = random.randint(1000000, 9999999) png = os.path.join(tempdir, "convert-%04d.jpg")
png = os.path.join(self.SCRATCH, "{}.png".format(i))
subprocess.Popen(( subprocess.Popen((
self.CONVERT, "-density", "300", "-depth", "8", self.CONVERT, "-density", "300", "-depth", "8",
"-type", "grayscale", doc, png "-type", "grayscale", doc, png
)).wait() )).wait()
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")]
return sorted(filter(lambda f: os.path.isfile(f), pngs))
@staticmethod @staticmethod
def _guess_language(text): def _guess_language(text):
@@ -303,14 +305,14 @@ class Consumer(object):
Log.debug("Encrypting", Log.COMPONENT_CONSUMER) Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
encrypted.write(GnuPG.encrypted(unencrypted)) encrypted.write(GnuPG.encrypted(unencrypted))
def _cleanup(self, pngs, doc): def _cleanup(self, tempdir, doc):
# Remove temporary directory recursively
Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER)
shutil.rmtree(tempdir)
png_glob = os.path.join( # Remove doc
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
os.unlink(doc)
for f in list(glob.glob(png_glob)) + [doc]:
Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
os.unlink(f)
def _is_ready(self, doc): def _is_ready(self, doc):
""" """