Safely and non-randomly create scratch directory

Creating the scratch files in `_get_greyscale` using a random integer
is, for one, inherently unsafe and can cause a collision. For another,
it should be unnecessary, given that the files will be cleaned up after
the OCR run.

Since we don't know whether OCR runs might be parallel in the future,
this commit creates a dedicated scratch directory per document in a
thread-safe way (via `tempfile.mkdtemp`) and uses deterministic file
names inside it.

Additionally, it fixes the call to `_cleanup` by `consume`. In the
current implementation, `_cleanup` will not be called if the last
consumed document failed with an `OCRError`; this commit fixes that by
calling `_cleanup` in a `finally` block.
This commit is contained in:
Pit Kleyersburg 2016-02-14 17:40:37 +01:00
parent bbe7a02b4d
commit 46f8f492f5

View File

@ -1,15 +1,16 @@
import datetime import datetime
import glob import tempfile
from multiprocessing.pool import Pool from multiprocessing.pool import Pool
import itertools import itertools
import langdetect import langdetect
import os import os
import random
import re import re
import subprocess import subprocess
import pyocr import pyocr
import shutil
from PIL import Image from PIL import Image
@ -111,34 +112,35 @@ class Consumer(object):
Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER) Log.info("Consuming {}".format(doc), Log.COMPONENT_CONSUMER)
pngs = self._get_greyscale(doc) tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
pngs = self._get_greyscale(tempdir, doc)
try: try:
text = self._get_ocr(pngs) text = self._get_ocr(pngs)
self._store(text, doc)
except OCRError: except OCRError:
self._ignore.append(doc) self._ignore.append(doc)
Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER) Log.error("OCR FAILURE: {}".format(doc), Log.COMPONENT_CONSUMER)
continue continue
finally:
self._cleanup(tempdir, doc)
self._store(text, doc) def _get_greyscale(self, tempdir, doc):
self._cleanup(pngs, doc)
def _get_greyscale(self, doc):
Log.debug( Log.debug(
"Generating greyscale image from {}".format(doc), "Generating greyscale image from {}".format(doc),
Log.COMPONENT_CONSUMER Log.COMPONENT_CONSUMER
) )
i = random.randint(1000000, 9999999) png = os.path.join(tempdir, "convert.png")
png = os.path.join(self.SCRATCH, "{}.png".format(i))
subprocess.Popen(( subprocess.Popen((
self.CONVERT, "-density", "300", "-depth", "8", self.CONVERT, "-density", "300", "-depth", "8",
"-type", "grayscale", doc, png "-type", "grayscale", doc, png
)).wait() )).wait()
return sorted(glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))) pngs = [os.path.join(tempdir, f) for f in os.listdir(tempdir) if f.startswith("convert")]
return sorted(filter(lambda f: os.path.isfile(f), pngs))
@staticmethod @staticmethod
def _guess_language(text): def _guess_language(text):
@ -303,14 +305,14 @@ class Consumer(object):
Log.debug("Encrypting", Log.COMPONENT_CONSUMER) Log.debug("Encrypting", Log.COMPONENT_CONSUMER)
encrypted.write(GnuPG.encrypted(unencrypted)) encrypted.write(GnuPG.encrypted(unencrypted))
def _cleanup(self, pngs, doc): def _cleanup(self, tempdir, doc):
# Remove temporary directory recursively
Log.debug("Deleting directory {}".format(tempdir), Log.COMPONENT_CONSUMER)
shutil.rmtree(tempdir)
png_glob = os.path.join( # Remove doc
self.SCRATCH, re.sub(r"^.*/(\d+)-\d+.png$", "\\1*", pngs[0])) Log.debug("Deleting document {}".format(doc), Log.COMPONENT_CONSUMER)
os.unlink(doc)
for f in list(glob.glob(png_glob)) + [doc]:
Log.debug("Deleting {}".format(f), Log.COMPONENT_CONSUMER)
os.unlink(f)
def _is_ready(self, doc): def _is_ready(self, doc):
""" """