diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 4ea2e0215..3f3b9e9a3 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,5 +1,8 @@
 import datetime
 import glob
+from multiprocessing.pool import Pool
+
+import itertools
 import langdetect
 import os
 import random
@@ -21,6 +24,14 @@ from .models import Sender, Tag, Document
 from .languages import ISO639
 
 
+def image_to_string(args):
+    # Module-level and single-argument so Pool.map can pickle and call it.
+    self, png, lang = args
+    with Image.open(os.path.join(self.SCRATCH, png)) as f:
+        self._render(" {}".format(f.filename), 3)
+        return self.OCR.image_to_string(f, lang=lang)
+
+
 class OCRError(Exception):
     pass
 
@@ -42,6 +53,7 @@ class Consumer(Renderable):
     SCRATCH = settings.SCRATCH_DIR
     CONVERT = settings.CONVERT_BINARY
     CONSUME = settings.CONSUMPTION_DIR
+    THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
 
     OCR = pyocr.get_available_tools()[0]
     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
@@ -182,11 +194,10 @@ class Consumer(Renderable):
 
         self._render(" Parsing for {}".format(lang), 2)
 
-        r = ""
-        for png in pngs:
-            with Image.open(os.path.join(self.SCRATCH, png)) as f:
-                self._render(" {}".format(f.filename), 3)
-                r += self.OCR.image_to_string(f, lang=lang)
+        with Pool(processes=self.THREADS) as pool:
+            r = pool.map(image_to_string,
+                         itertools.product([self], pngs, [lang]))
+            r = "".join(r)
 
         # Strip out excess white space to allow matching to go smoother
         return re.sub(r"\s+", " ", r)
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index f9a124049..88be33a8c 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -144,6 +144,9 @@ MEDIA_URL = "/media/"
 # documents. It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = "eng"
 
+# Number of OCR worker processes; unset means multiprocessing's default.
+OCR_THREADS = os.environ.get("PAPERLESS_OCR_THREADS")
+
 # If this is true, any failed attempts to OCR a PDF will result in the PDF being
 # indexed anyway, with whatever we could get. If it's False, the file will
 # simply be left in the CONSUMPTION_DIR.