mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Enable parallel OCR processing
At the moment, every page in a PDF is processed one at a time using tesseract. Since the processing of a single page is independent of every other page, this work can take advantage of multi-core machines. This PR introduces a multiprocessing pool to process multiple pages simultaneously. The number of worker processes to use can be specified in the environment variable `PAPERLESS_OCR_THREADS`. It defaults to the number of cores/hyperthreads Python detects for your system.
This commit is contained in:
		| @@ -1,5 +1,8 @@ | ||||
| import datetime | ||||
| import glob | ||||
| from multiprocessing.pool import Pool | ||||
|  | ||||
| import itertools | ||||
| import langdetect | ||||
| import os | ||||
| import random | ||||
| @@ -21,6 +24,13 @@ from .models import Sender, Tag, Document | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
| def image_to_string(args): | ||||
|     self, png, lang = args | ||||
|     with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||
|         self._render("    {}".format(f.filename), 3) | ||||
|         return self.OCR.image_to_string(f, lang=lang) | ||||
|  | ||||
|  | ||||
| class OCRError(Exception): | ||||
|     pass | ||||
|  | ||||
| @@ -42,6 +52,7 @@ class Consumer(Renderable): | ||||
|     SCRATCH = settings.SCRATCH_DIR | ||||
|     CONVERT = settings.CONVERT_BINARY | ||||
|     CONSUME = settings.CONSUMPTION_DIR | ||||
|     THREADS = settings.OCR_THREADS | ||||
|  | ||||
|     OCR = pyocr.get_available_tools()[0] | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
| @@ -182,11 +193,10 @@ class Consumer(Renderable): | ||||
|  | ||||
|         self._render("    Parsing for {}".format(lang), 2) | ||||
|  | ||||
|         r = "" | ||||
|         for png in pngs: | ||||
|             with Image.open(os.path.join(self.SCRATCH, png)) as f: | ||||
|                 self._render("    {}".format(f.filename), 3) | ||||
|                 r += self.OCR.image_to_string(f, lang=lang) | ||||
|         with Pool(processes=self.THREADS) as pool: | ||||
|             r = pool.map(image_to_string, | ||||
|                          itertools.product([self], pngs, [lang])) | ||||
|             r = "".join(r) | ||||
|  | ||||
|         # Strip out excess white space to allow matching to go smoother | ||||
|         return re.sub(r"\s+", " ", r) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Pit Kleyersburg
					Pit Kleyersburg