mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Moved pyocr.get_available_tools() into a method
This commit is contained in:
parent
5f0962bc3e
commit
3a7923e32d
@ -8,7 +8,8 @@ matrix:
|
|||||||
env: TOXENV=py34
|
env: TOXENV=py34
|
||||||
- python: 3.5
|
- python: 3.5
|
||||||
env: TOXENV=py35
|
env: TOXENV=py35
|
||||||
- env: TOXENV=pep8
|
- python: 3.5
|
||||||
|
env: TOXENV=pep8
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- pip install --requirement requirements.txt
|
- pip install --requirement requirements.txt
|
||||||
|
@ -26,18 +26,6 @@ from .models import Sender, Tag, Document
|
|||||||
from .languages import ISO639
|
from .languages import ISO639
|
||||||
|
|
||||||
|
|
||||||
def image_to_string(args):
|
|
||||||
self, png, lang = args
|
|
||||||
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
|
||||||
if self.OCR.can_detect_orientation():
|
|
||||||
try:
|
|
||||||
orientation = self.OCR.detect_orientation(f, lang=lang)
|
|
||||||
f = f.rotate(orientation["angle"], expand=1)
|
|
||||||
except TesseractError:
|
|
||||||
pass
|
|
||||||
return self.OCR.image_to_string(f, lang=lang)
|
|
||||||
|
|
||||||
|
|
||||||
class OCRError(Exception):
|
class OCRError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -61,7 +49,6 @@ class Consumer(object):
|
|||||||
CONSUME = settings.CONSUMPTION_DIR
|
CONSUME = settings.CONSUMPTION_DIR
|
||||||
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
|
||||||
|
|
||||||
OCR = pyocr.get_available_tools()[0]
|
|
||||||
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
|
||||||
|
|
||||||
REGEX_TITLE = re.compile(
|
REGEX_TITLE = re.compile(
|
||||||
@ -239,12 +226,24 @@ class Consumer(object):
|
|||||||
|
|
||||||
with Pool(processes=self.THREADS) as pool:
|
with Pool(processes=self.THREADS) as pool:
|
||||||
r = pool.map(
|
r = pool.map(
|
||||||
image_to_string, itertools.product([self], pngs, [lang]))
|
self.image_to_string, itertools.product(pngs, [lang]))
|
||||||
r = " ".join(r)
|
r = " ".join(r)
|
||||||
|
|
||||||
# Strip out excess white space to allow matching to go smoother
|
# Strip out excess white space to allow matching to go smoother
|
||||||
return re.sub(r"\s+", " ", r)
|
return re.sub(r"\s+", " ", r)
|
||||||
|
|
||||||
|
def image_to_string(self, args):
|
||||||
|
png, lang = args
|
||||||
|
ocr = pyocr.get_available_tools()[0]
|
||||||
|
with Image.open(os.path.join(self.SCRATCH, png)) as f:
|
||||||
|
if ocr.can_detect_orientation():
|
||||||
|
try:
|
||||||
|
orientation = ocr.detect_orientation(f, lang=lang)
|
||||||
|
f = f.rotate(orientation["angle"], expand=1)
|
||||||
|
except TesseractError:
|
||||||
|
pass
|
||||||
|
return ocr.image_to_string(f, lang=lang)
|
||||||
|
|
||||||
def _guess_attributes_from_name(self, parseable):
|
def _guess_attributes_from_name(self, parseable):
|
||||||
"""
|
"""
|
||||||
We use a crude naming convention to make handling the sender, title,
|
We use a crude naming convention to make handling the sender, title,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user