Merge pull request #32 from pitkley/feature/single-page-langdetect

Detect language on only a single page (the middle one) of the PDF
This commit is contained in:
Daniel Quinn 2016-02-14 16:56:30 +00:00
commit 4689e2b975

View File

@ -9,6 +9,7 @@ import random
import re
import subprocess
import math
import pyocr
from PIL import Image
@ -152,9 +153,14 @@ class Consumer(Renderable):
simple language detection trial & error.
"""
if not pngs:
raise OCRError
self._render(" OCRing the document", 2)
raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
# int() truncates the quotient, so `middle` is always a valid index for a non-empty list, e.g. a single page gives middle == 0
middle = int(len(pngs) / 2)
raw_text = self._ocr([pngs[middle]], self.DEFAULT_OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text)
@ -166,10 +172,14 @@ class Consumer(Renderable):
"with what we have.",
1
)
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
return raw_text
raise OCRError
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
return raw_text
try:
@ -183,6 +193,8 @@ class Consumer(Renderable):
),
0
)
raw_text = self._ocr(pngs[:middle], self.DEFAULT_OCR_LANGUAGE) + raw_text
raw_text += self._ocr(pngs[middle+1:], self.DEFAULT_OCR_LANGUAGE)
return raw_text
raise OCRError
@ -191,6 +203,9 @@ class Consumer(Renderable):
Performs a single OCR attempt.
"""
if not pngs:
return ""
self._render(" Parsing for {}".format(lang), 2)
with Pool(processes=self.THREADS) as pool: