From fdb29f739f03192ee3b20b393826cf89ba566778 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Sat, 23 Jan 2016 02:33:29 +0000 Subject: [PATCH] Added language detection --- .../management/commands/document_consumer.py | 59 ++++++++++++++++--- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 9fcc3ee49..2cb8e52a3 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -16,10 +16,15 @@ from django.core.management.base import BaseCommand, CommandError from django.template.defaultfilters import slugify from django.utils import timezone -from documents.models import Document, Sender +from ...languages import ISO639 +from ...models import Document, Sender from paperless.db import GnuPG +class OCRError(BaseException): + pass + + class Command(BaseCommand): """ Loop over every file found in CONSUMPTION_DIR and: @@ -37,7 +42,7 @@ class Command(BaseCommand): CONSUME = settings.CONSUMPTION_DIR OCR = pyocr.get_available_tools()[0] - OCR_LANG = settings.TESSERACT_LANGUAGE + DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf") PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$") @@ -47,6 +52,7 @@ class Command(BaseCommand): self.verbosity = 0 self.stats = {} + self._ignore = [] BaseCommand.__init__(self, *args, **kwargs) @@ -81,13 +87,22 @@ class Command(BaseCommand): if not re.match(self.PARSER_REGEX_TITLE, pdf): continue + if pdf in self._ignore: + continue + if self._is_ready(pdf): continue self._render("Consuming {}".format(pdf), 1) pngs = self._get_greyscale(pdf) - text = self._get_ocr(pngs) + + try: + text = self._get_ocr(pngs) + except OCRError: + self._ignore.append(pdf) + self._render("OCR FAILURE: {}".format(pdf), 0) + continue self._store(text, pdf) self._cleanup(pngs, pdf) @@ -131,23 +146,51 @@ class Command(BaseCommand): def _get_ocr(self, pngs): - self._render(" OCRing the PDF", 2) + self._render(" OCRing the PDF", 1) - raw_text = self._ocr(pngs, self.OCR_LANG) + raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE) guessed_language = langdetect.detect(raw_text) - if guessed_language == self.OCR_LANG: + + self._render(" Language detected: {}".format(guessed_language), 2) + + if guessed_language not in ISO639: + self._render("Language detection failed!", 0) + if settings.FORGIVING_OCR: + self._render( + "As FORGIVING_OCR is enabled, we're going to make the best " + "with what we have.", + 1 + ) + return raw_text + raise OCRError + + if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE: return raw_text - return self._ocr(pngs, guessed_language) + try: + return self._ocr(pngs, ISO639[guessed_language]) + except pyocr.pyocr.tesseract.TesseractError: + if settings.FORGIVING_OCR: + self._render( + "OCR for {} failed, but we're going to stick with what " + "we've got since FORGIVING_OCR is enabled.".format( + guessed_language + ), + 0 + ) + return raw_text + raise OCRError def _ocr(self, pngs, lang): + self._render(" Parsing for {}".format(lang), 2) + r = "" for png in pngs: with Image.open(os.path.join(self.SCRATCH, png)) as f: self._render(" {}".format(f.filename), 3) - r += self.OCR.image_to_string(f, lang=self.OCR_LANG) + r += self.OCR.image_to_string(f, lang=lang) r += "\n\n\n\n\n\n\n\n" return r