mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #31 from pitkley/feature/paralellism
This is great. It seriously sped up the OCR time.
This commit is contained in:
commit
88acf50fe0
@ -1,5 +1,8 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import glob
|
import glob
|
||||||
|
from multiprocessing.pool import Pool
|
||||||
|
|
||||||
|
import itertools
|
||||||
import langdetect
|
import langdetect
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
@ -21,6 +24,13 @@ from .models import Sender, Tag, Document
|
|||||||
from .languages import ISO639
|
from .languages import ISO639
|
||||||
|
|
||||||
|
|
||||||
|
def image_to_string(args):
    """Run OCR on one scratch-directory image and return the recognised text.

    ``args`` is a single 3-tuple ``(consumer, png, lang)`` because this
    function is invoked through ``Pool.map``, which passes exactly one
    argument per work item:

    * ``consumer`` -- object providing ``SCRATCH``, ``OCR`` and ``_render``
    * ``png`` -- image file name, joined onto ``consumer.SCRATCH``
    * ``lang`` -- language forwarded to the OCR tool's ``image_to_string``
    """
    consumer, png_name, language = args
    image_path = os.path.join(consumer.SCRATCH, png_name)

    with Image.open(image_path) as page:
        # Report which file is being processed (verbosity level 3).
        consumer._render(" {}".format(page.filename), 3)
        text = consumer.OCR.image_to_string(page, lang=language)

    return text
|
||||||
|
|
||||||
|
|
||||||
class OCRError(Exception):
|
class OCRError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -42,6 +52,7 @@ class Consumer(Renderable):
|
|||||||
SCRATCH = settings.SCRATCH_DIR
|
SCRATCH = settings.SCRATCH_DIR
|
||||||
CONVERT = settings.CONVERT_BINARY
|
CONVERT = settings.CONVERT_BINARY
|
||||||
CONSUME = settings.CONSUMPTION_DIR
|
CONSUME = settings.CONSUMPTION_DIR
|
||||||
|
THREADS = settings.OCR_THREADS
|
||||||
|
|
||||||
OCR = pyocr.get_available_tools()[0]
|
OCR = pyocr.get_available_tools()[0]
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
@ -182,11 +193,10 @@ class Consumer(Renderable):
|
|||||||
|
|
||||||
self._render(" Parsing for {}".format(lang), 2)
|
self._render(" Parsing for {}".format(lang), 2)
|
||||||
|
|
||||||
r = ""
|
with Pool(processes=self.THREADS) as pool:
|
||||||
for png in pngs:
|
r = pool.map(image_to_string,
|
||||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
itertools.product([self], pngs, [lang]))
|
||||||
self._render(" {}".format(f.filename), 3)
|
r = "".join(r)
|
||||||
r += self.OCR.image_to_string(f, lang=lang)
|
|
||||||
|
|
||||||
# Strip out excess white space to allow matching to go smoother
|
# Strip out excess white space to allow matching to go smoother
|
||||||
return re.sub(r"\s+", " ", r)
|
return re.sub(r"\s+", " ", r)
|
||||||
|
@ -144,6 +144,9 @@ MEDIA_URL = "/media/"
|
|||||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||||
OCR_LANGUAGE = "eng"
|
OCR_LANGUAGE = "eng"
|
||||||
|
|
||||||
|
# The number of worker processes to use for OCR. The consumer hands this value
# to multiprocessing's Pool(processes=...), which requires an int or None, but
# environment variables are always strings, so convert it here. When the
# variable is unset or empty the value is None and Pool picks its own default.
# A non-numeric value raises ValueError at start-up rather than failing later
# in the middle of a consume run.
_ocr_threads = os.environ.get("PAPERLESS_OCR_THREADS")
OCR_THREADS = int(_ocr_threads) if _ocr_threads else None
|
||||||
|
|
||||||
# If this is true, any failed attempts to OCR a PDF will result in the PDF being
|
# If this is true, any failed attempts to OCR a PDF will result in the PDF being
|
||||||
# indexed anyway, with whatever we could get. If it's False, the file will
|
# indexed anyway, with whatever we could get. If it's False, the file will
|
||||||
# simply be left in the CONSUMPTION_DIR.
|
# simply be left in the CONSUMPTION_DIR.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user