Added language detection

This commit is contained in:
Daniel Quinn 2016-01-23 02:33:29 +00:00
parent bcdcfbaee0
commit fdb29f739f

View File

@ -16,10 +16,15 @@ from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from django.utils import timezone from django.utils import timezone
from documents.models import Document, Sender from ...languages import ISO639
from ...models import Document, Sender
from paperless.db import GnuPG from paperless.db import GnuPG
class OCRError(BaseException):
pass
class Command(BaseCommand): class Command(BaseCommand):
""" """
Loop over every file found in CONSUMPTION_DIR and: Loop over every file found in CONSUMPTION_DIR and:
@ -37,7 +42,7 @@ class Command(BaseCommand):
CONSUME = settings.CONSUMPTION_DIR CONSUME = settings.CONSUMPTION_DIR
OCR = pyocr.get_available_tools()[0] OCR = pyocr.get_available_tools()[0]
OCR_LANG = settings.TESSERACT_LANGUAGE DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")
PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$") PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
@ -47,6 +52,7 @@ class Command(BaseCommand):
self.verbosity = 0 self.verbosity = 0
self.stats = {} self.stats = {}
self._ignore = []
BaseCommand.__init__(self, *args, **kwargs) BaseCommand.__init__(self, *args, **kwargs)
@ -81,13 +87,22 @@ class Command(BaseCommand):
if not re.match(self.PARSER_REGEX_TITLE, pdf): if not re.match(self.PARSER_REGEX_TITLE, pdf):
continue continue
if pdf in self._ignore:
continue
if self._is_ready(pdf): if self._is_ready(pdf):
continue continue
self._render("Consuming {}".format(pdf), 1) self._render("Consuming {}".format(pdf), 1)
pngs = self._get_greyscale(pdf) pngs = self._get_greyscale(pdf)
text = self._get_ocr(pngs)
try:
text = self._get_ocr(pngs)
except OCRError:
self._ignore.append(pdf)
self._render("OCR FAILURE: {}".format(pdf), 0)
continue
self._store(text, pdf) self._store(text, pdf)
self._cleanup(pngs, pdf) self._cleanup(pngs, pdf)
@ -131,23 +146,51 @@ class Command(BaseCommand):
def _get_ocr(self, pngs): def _get_ocr(self, pngs):
self._render(" OCRing the PDF", 2) self._render(" OCRing the PDF", 1)
raw_text = self._ocr(pngs, self.OCR_LANG) raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)
guessed_language = langdetect.detect(raw_text) guessed_language = langdetect.detect(raw_text)
if guessed_language == self.OCR_LANG:
self._render(" Language detected: {}".format(guessed_language), 2)
if guessed_language not in ISO639:
self._render("Language detection failed!", 0)
if settings.FORGIVING_OCR:
self._render(
"As FORGIVING_OCR is enabled, we're going to make the best "
"with what we have.",
1
)
return raw_text
raise OCRError
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
return raw_text return raw_text
return self._ocr(pngs, guessed_language) try:
return self._ocr(pngs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
self._render(
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
),
0
)
return raw_text
raise OCRError
def _ocr(self, pngs, lang): def _ocr(self, pngs, lang):
self._render(" Parsing for {}".format(lang), 2)
r = "" r = ""
for png in pngs: for png in pngs:
with Image.open(os.path.join(self.SCRATCH, png)) as f: with Image.open(os.path.join(self.SCRATCH, png)) as f:
self._render(" {}".format(f.filename), 3) self._render(" {}".format(f.filename), 3)
r += self.OCR.image_to_string(f, lang=self.OCR_LANG) r += self.OCR.image_to_string(f, lang=lang)
r += "\n\n\n\n\n\n\n\n" r += "\n\n\n\n\n\n\n\n"
return r return r