diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 0dbf0988a..9fcc3ee49 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -1,6 +1,6 @@ import datetime import glob -import gnupg +import langdetect import os import random import re @@ -44,9 +44,10 @@ class Command(BaseCommand): PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$") def __init__(self, *args, **kwargs): + self.verbosity = 0 self.stats = {} - self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME) + BaseCommand.__init__(self, *args, **kwargs) def handle(self, *args, **options): @@ -132,6 +133,16 @@ class Command(BaseCommand): self._render(" OCRing the PDF", 2) + raw_text = self._ocr(pngs, self.OCR_LANG) + + guessed_language = langdetect.detect(raw_text) + if guessed_language == self.OCR_LANG: + return raw_text + + return self._ocr(pngs, guessed_language) + + def _ocr(self, pngs, lang): + r = "" for png in pngs: with Image.open(os.path.join(self.SCRATCH, png)) as f: