Merge pull request #31 from pitkley/feature/paralellism

This is great.  It seriously sped up the OCR time.
This commit is contained in:
Daniel Quinn 2016-02-14 15:29:05 +00:00
commit 88acf50fe0
2 changed files with 18 additions and 5 deletions

View File

@ -1,5 +1,8 @@
import datetime
import glob
from multiprocessing.pool import Pool
import itertools
import langdetect
import os
import random
@ -21,6 +24,13 @@ from .models import Sender, Tag, Document
from .languages import ISO639
def image_to_string(args):
    """Run OCR on one page image and return the recognised text.

    This is a module-level function taking a single packed argument so
    that it can be handed to ``Pool.map``, which passes exactly one item
    per call.

    ``args`` is a three-item sequence ``(consumer, png, lang)``:

    * ``consumer`` -- object providing ``SCRATCH``, ``OCR`` and ``_render``
    * ``png`` -- image file name, joined onto ``consumer.SCRATCH``
    * ``lang`` -- language passed through to the OCR tool
    """
    consumer, page, language = args
    page_path = os.path.join(consumer.SCRATCH, page)
    with Image.open(page_path) as image:
        # Report which file is being processed before the OCR call.
        consumer._render(" {}".format(image.filename), 3)
        text = consumer.OCR.image_to_string(image, lang=language)
    return text
class OCRError(Exception):
    """OCR-specific exception type; adds nothing to ``Exception``."""
@ -42,6 +52,7 @@ class Consumer(Renderable):
SCRATCH = settings.SCRATCH_DIR
CONVERT = settings.CONVERT_BINARY
CONSUME = settings.CONSUMPTION_DIR
THREADS = settings.OCR_THREADS
OCR = pyocr.get_available_tools()[0]
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
@ -182,11 +193,10 @@ class Consumer(Renderable):
self._render(" Parsing for {}".format(lang), 2)
r = ""
for png in pngs:
with Image.open(os.path.join(self.SCRATCH, png)) as f:
self._render(" {}".format(f.filename), 3)
r += self.OCR.image_to_string(f, lang=lang)
with Pool(processes=self.THREADS) as pool:
r = pool.map(image_to_string,
itertools.product([self], pngs, [lang]))
r = "".join(r)
# Strip out excess white space to allow matching to go smoother
return re.sub(r"\s+", " ", r)

View File

@ -144,6 +144,9 @@ MEDIA_URL = "/media/"
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = "eng"
# The number of worker processes to use for OCR, read from the
# PAPERLESS_OCR_THREADS environment variable.  Environment values are always
# strings, but multiprocessing's Pool(processes=...) needs an int or None, so
# convert here.  Unset, empty or "0" becomes None, which lets Pool choose its
# own default.  A non-numeric value raises ValueError at start-up.
OCR_THREADS = int(os.environ.get("PAPERLESS_OCR_THREADS") or 0) or None
# If this is true, any failed attempts to OCR a PDF will result in the PDF being
# indexed anyway, with whatever we could get. If it's False, the file will
# simply be left in the CONSUMPTION_DIR.