From 8908bc259e2e41b74285882c2d9c9a7dec577bb0 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 18 Nov 2020 13:23:30 +0100 Subject: [PATCH] updated logging, logging for the mail consumer to see what's happening --- docs/advanced_usage.rst | 4 +- src/documents/consumer.py | 14 +- src/documents/loggers.py | 17 ++ src/documents/parsers.py | 12 +- src/paperless/settings.py | 8 + src/paperless_mail/admin.py | 11 -- src/paperless_mail/mail.py | 237 ++++++++++++++++---------- src/paperless_mail/tasks.py | 5 +- src/paperless_mail/tests/test_mail.py | 38 +++-- src/paperless_tesseract/parsers.py | 20 +-- 10 files changed, 214 insertions(+), 152 deletions(-) diff --git a/docs/advanced_usage.rst b/docs/advanced_usage.rst index 3b48ea582..218cfa8b7 100644 --- a/docs/advanced_usage.rst +++ b/docs/advanced_usage.rst @@ -175,8 +175,6 @@ then put the path to that script in ``paperless.conf`` with the variable name of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or ``PAPERLESS_POST_CONSUME_SCRIPT``. -.. TODO HYPEREF TO CONFIG - .. important:: These scripts are executed in a **blocking** process, which means that if @@ -319,6 +317,6 @@ for use in filenames. .. code:: PAPERLESS_FILENAME_FORMAT=../../my/custom/location/{title} - + However, keep in mind that inside docker, if files get stored outside of the predefined volumes, they will be lost after a restart of paperless. 
diff --git a/src/documents/consumer.py b/src/documents/consumer.py index f0cd4dd67..913f324c7 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -12,6 +12,7 @@ from django.utils import timezone from paperless.db import GnuPG from .classifier import DocumentClassifier, IncompatibleClassifierVersionError from .file_handling import generate_filename, create_source_path_directory +from .loggers import LoggingMixin from .models import Document, FileInfo, Correspondent, DocumentType, Tag from .parsers import ParseError, get_parser_class from .signals import ( @@ -24,12 +25,10 @@ class ConsumerError(Exception): pass -class Consumer: +class Consumer(LoggingMixin): def __init__(self): - - self.logger = logging.getLogger(__name__) - self.logging_group = None + super().__init__() self.path = None self.filename = None self.override_title = None @@ -74,11 +73,6 @@ class Consumer: os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) - def log(self, level, message): - getattr(self.logger, level)(message, extra={ - "group": self.logging_group - }) - def try_consume_file(self, path, override_filename=None, @@ -100,7 +94,7 @@ class Consumer: # this is for grouping logging entries for this particular file # together. - self.logging_group = uuid.uuid4() + self.renew_logging_group() # Make sure that preconditions for consuming the file are met. 
diff --git a/src/documents/loggers.py b/src/documents/loggers.py index d9c90ab16..fd20e1288 100644 --- a/src/documents/loggers.py +++ b/src/documents/loggers.py @@ -1,4 +1,5 @@ import logging +import uuid class PaperlessHandler(logging.Handler): @@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler): kwargs["group"] = record.group Log.objects.create(**kwargs) + + +class LoggingMixin: + + logging_group = None + + def renew_logging_group(self): + self.logging_group = uuid.uuid4() + + def log(self, level, message): + target = ".".join([self.__class__.__module__, self.__class__.__name__]) + logger = logging.getLogger(target) + + getattr(logger, level)(message, extra={ + "group": self.logging_group + }) diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 600e4fc93..2fab6bc44 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -20,6 +20,7 @@ from django.utils import timezone # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits +from documents.loggers import LoggingMixin from documents.signals import document_consumer_declaration # TODO: isnt there a date parsing library for this? @@ -101,17 +102,17 @@ class ParseError(Exception): pass -class DocumentParser: +class DocumentParser(LoggingMixin): """ Subclass this to make your own parser. Have a look at `paperless_tesseract.parsers` for inspiration. 
""" def __init__(self, path, logging_group): + super().__init__() + self.logging_group = logging_group self.document_path = path self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) - self.logger = logging.getLogger(__name__) - self.logging_group = logging_group def get_thumbnail(self): """ @@ -222,11 +223,6 @@ class DocumentParser: return date - def log(self, level, message): - getattr(self.logger, level)(message, extra={ - "group": self.logging_group - }) - def cleanup(self): self.log("debug", "Deleting directory {}".format(self.tempdir)) shutil.rmtree(self.tempdir) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 311913c3e..3661c3d02 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -257,6 +257,14 @@ LOGGING = { "handlers": ["dbhandler", "streamhandler"], "level": "DEBUG" }, + "paperless_mail": { + "handlers": ["dbhandler", "streamhandler"], + "level": "DEBUG" + }, + "paperless_tesseract": { + "handlers": ["dbhandler", "streamhandler"], + "level": "DEBUG" + }, }, } diff --git a/src/paperless_mail/admin.py b/src/paperless_mail/admin.py index b64a68637..130e34ad1 100644 --- a/src/paperless_mail/admin.py +++ b/src/paperless_mail/admin.py @@ -1,18 +1,7 @@ from django.contrib import admin -from django import forms - from paperless_mail.models import MailAccount, MailRule -class MailAccountForm(forms.ModelForm): - - password = forms.CharField(widget=forms.PasswordInput) - - class Meta: - fields = '__all__' - model = MailAccount - - class MailAccountAdmin(admin.ModelAdmin): list_display = ("name", "imap_server", "username") diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index ce8bf9459..dd1e68b35 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -8,6 +8,7 @@ from django_q.tasks import async_task from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \ MailboxFolderSelectError +from documents.loggers import LoggingMixin from 
documents.models import Correspondent from paperless_mail.models import MailAccount, MailRule @@ -83,72 +84,6 @@ def make_criterias(rule): return {**criterias, **get_rule_action(rule).get_criteria()} -def handle_mail_account(account): - - if account.imap_security == MailAccount.IMAP_SECURITY_NONE: - mailbox = MailBoxUnencrypted(account.imap_server, account.imap_port) - elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS: - mailbox = MailBox(account.imap_server, account.imap_port, starttls=True) - elif account.imap_security == MailAccount.IMAP_SECURITY_SSL: - mailbox = MailBox(account.imap_server, account.imap_port) - else: - raise ValueError("Unknown IMAP security") - - total_processed_files = 0 - - with mailbox as M: - - try: - M.login(account.username, account.password) - except Exception: - raise MailError( - f"Error while authenticating account {account.name}") - - for rule in account.rules.all(): - - try: - M.folder.set(rule.folder) - except MailboxFolderSelectError: - raise MailError( - f"Rule {rule.name}: Folder {rule.folder} does not exist " - f"in account {account.name}") - - criterias = make_criterias(rule) - - try: - messages = M.fetch(criteria=AND(**criterias), mark_seen=False) - except Exception: - raise MailError( - f"Rule {rule.name}: Error while fetching folder " - f"{rule.folder} of account {account.name}") - - post_consume_messages = [] - - for message in messages: - try: - processed_files = handle_message(message, rule) - except Exception: - raise MailError( - f"Rule {rule.name}: Error while processing mail " - f"{message.uid} of account {account.name}") - if processed_files > 0: - post_consume_messages.append(message.uid) - - total_processed_files += processed_files - try: - get_rule_action(rule).post_consume( - M, - post_consume_messages, - rule.action_parameter) - - except Exception: - raise MailError( - f"Rule {rule.name}: Error while processing post-consume " - f"actions for account {account.name}") - - return 
total_processed_files - - def get_title(message, att, rule): if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: title = message.subject @@ -189,39 +124,155 @@ def get_correspondent(message, rule): return correspondent -def handle_message(message, rule): - if not message.attachments: - return 0 +def get_mailbox(server, port, security): + if security == MailAccount.IMAP_SECURITY_NONE: + mailbox = MailBoxUnencrypted(server, port) + elif security == MailAccount.IMAP_SECURITY_STARTTLS: + mailbox = MailBox(server, port, starttls=True) + elif security == MailAccount.IMAP_SECURITY_SSL: + mailbox = MailBox(server, port) + else: + raise ValueError("Unknown IMAP security") + return mailbox - correspondent = get_correspondent(message, rule) - tag = rule.assign_tag - doc_type = rule.assign_document_type +class MailAccountHandler(LoggingMixin): - processed_attachments = 0 + def handle_mail_account(self, account): - for att in message.attachments: + self.renew_logging_group() - title = get_title(message, att, rule) + self.log('debug', f"Processing mail account {account}") - # TODO: check with parsers what files types are supported - if att.content_type == 'application/pdf': + total_processed_files = 0 - os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR) - with open(temp_filename, 'wb') as f: - f.write(att.payload) + with get_mailbox(account.imap_server, + account.imap_port, + account.imap_security) as M: - async_task( - "documents.tasks.consume_file", - path=temp_filename, - override_filename=att.filename, - override_title=title, - override_correspondent_id=correspondent.id if correspondent else None, - override_document_type_id=doc_type.id if doc_type else None, - override_tag_ids=[tag.id] if tag else None, - task_name=f"Mail: {att.filename}" - ) + try: + M.login(account.username, account.password) + except Exception: + raise MailError( + f"Error while authenticating account 
{account.name}") - processed_attachments += 1 + self.log('debug', f"Account {account}: Processing " + f"{account.rules.count()} rule(s)") - return processed_attachments + for rule in account.rules.all(): + self.log( + 'debug', + f"Account {account}: Processing rule {rule.name}") + + self.log( + 'debug', + f"Rule {account}.{rule}: Selecting folder {rule.folder}") + + try: + M.folder.set(rule.folder) + except MailboxFolderSelectError: + raise MailError( + f"Rule {rule.name}: Folder {rule.folder} does not exist " + f"in account {account.name}") + + criterias = make_criterias(rule) + + self.log( + 'debug', + f"Rule {account}.{rule}: Searching folder with criteria " + f"{str(AND(**criterias))}") + + try: + messages = M.fetch(criteria=AND(**criterias), mark_seen=False) + except Exception: + raise MailError( + f"Rule {rule.name}: Error while fetching folder " + f"{rule.folder} of account {account.name}") + + post_consume_messages = [] + + mails_processed = 0 + + for message in messages: + try: + processed_files = self.handle_message(message, rule) + except Exception: + raise MailError( + f"Rule {rule.name}: Error while processing mail " + f"{message.uid} of account {account.name}") + if processed_files > 0: + post_consume_messages.append(message.uid) + + total_processed_files += processed_files + mails_processed += 1 + + self.log( + 'debug', + f"Rule {account}.{rule}: Processed {mails_processed} " + f"matching mail(s)") + + self.log( + 'debug', + f"Rule {account}.{rule}: Running mail actions on " + f"{len(post_consume_messages)} mails") + + try: + get_rule_action(rule).post_consume( + M, + post_consume_messages, + rule.action_parameter) + + except Exception: + raise MailError( + f"Rule {rule.name}: Error while processing post-consume " + f"actions for account {account.name}") + + return total_processed_files + + def handle_message(self, message, rule): + if not message.attachments: + return 0 + + self.log( + 'debug', + f"Rule {rule.account}.{rule}: " + f"Processing mail 
{message.subject} from {message.from_} with " + f"{len(message.attachments)} attachment(s)") + + correspondent = get_correspondent(message, rule) + tag = rule.assign_tag + doc_type = rule.assign_document_type + + processed_attachments = 0 + + for att in message.attachments: + + title = get_title(message, att, rule) + + # TODO: check with parsers what files types are supported + if att.content_type == 'application/pdf': + + os.makedirs(settings.SCRATCH_DIR, exist_ok=True) + _, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR) + with open(temp_filename, 'wb') as f: + f.write(att.payload) + + self.log( + 'info', + f"Rule {rule.account}.{rule}: " + f"Consuming attachment {att.filename} from mail " + f"{message.subject} from {message.from_}") + + async_task( + "documents.tasks.consume_file", + path=temp_filename, + override_filename=att.filename, + override_title=title, + override_correspondent_id=correspondent.id if correspondent else None, + override_document_type_id=doc_type.id if doc_type else None, + override_tag_ids=[tag.id] if tag else None, + task_name=f"Mail: {att.filename}" + ) + + processed_attachments += 1 + + return processed_attachments diff --git a/src/paperless_mail/tasks.py b/src/paperless_mail/tasks.py index d34941a8a..dbef91c94 100644 --- a/src/paperless_mail/tasks.py +++ b/src/paperless_mail/tasks.py @@ -1,13 +1,14 @@ import logging from paperless_mail import mail +from paperless_mail.mail import MailAccountHandler from paperless_mail.models import MailAccount def process_mail_accounts(): total_new_documents = 0 for account in MailAccount.objects.all(): - total_new_documents += mail.handle_mail_account(account) + total_new_documents += MailAccountHandler().handle_mail_account(account) if total_new_documents > 0: return f"Added {total_new_documents} document(s)." 
@@ -18,6 +19,6 @@ def process_mail_accounts(): def process_mail_account(name): account = MailAccount.objects.find(name=name) if account: - mail.handle_mail_account(account) + MailAccountHandler().handle_mail_account(account) else: logging.error("Unknown mail acccount: {}".format(name)) diff --git a/src/paperless_mail/tests/test_mail.py b/src/paperless_mail/tests/test_mail.py index 20cf17ec7..a9d57fcb8 100644 --- a/src/paperless_mail/tests/test_mail.py +++ b/src/paperless_mail/tests/test_mail.py @@ -7,7 +7,7 @@ from django.test import TestCase from imap_tools import MailMessageFlags, MailboxFolderSelectError from documents.models import Correspondent -from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError +from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title from paperless_mail.models import MailRule, MailAccount @@ -126,6 +126,8 @@ class TestMail(TestCase): self.reset_bogus_mailbox() + self.mail_account_handler = MailAccountHandler() + def reset_bogus_mailbox(self): self.bogus_mailbox.messages = [] self.bogus_mailbox.messages_spam = [] @@ -182,6 +184,7 @@ class TestMail(TestCase): def test_handle_message(self): message = namedtuple('MailMessage', []) message.subject = "the message title" + message.from_ = "Myself" att = namedtuple('Attachment', []) att.filename = "test1.pdf" @@ -200,9 +203,10 @@ class TestMail(TestCase): message.attachments = [att, att2, att3] - rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME) + account = MailAccount() + rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account) - result = handle_message(message, rule) + result = self.mail_account_handler.handle_message(message, rule) self.assertEqual(result, 2) @@ -224,7 +228,7 @@ class TestMail(TestCase): message.attachments = [] rule = MailRule() - result = handle_message(message, rule) + result = self.mail_account_handler.handle_message(message, rule) 
self.assertFalse(m.called) self.assertEqual(result, 0) @@ -235,11 +239,13 @@ class TestMail(TestCase): rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ) + self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(self.async_task.call_count, 0) self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2) - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(self.async_task.call_count, 2) self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0) + self.assertEqual(len(self.bogus_mailbox.messages), 3) def test_handle_mail_account_delete(self): @@ -249,7 +255,7 @@ class TestMail(TestCase): self.assertEqual(self.async_task.call_count, 0) self.assertEqual(len(self.bogus_mailbox.messages), 3) - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(self.async_task.call_count, 2) self.assertEqual(len(self.bogus_mailbox.messages), 1) @@ -258,11 +264,13 @@ class TestMail(TestCase): rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice") + self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(self.async_task.call_count, 0) self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2) - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(self.async_task.call_count, 1) self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1) + self.assertEqual(len(self.bogus_mailbox.messages), 3) def test_handle_mail_account_move(self): account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret") @@ -272,7 +280,7 @@ class TestMail(TestCase): self.assertEqual(self.async_task.call_count, 0) self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(len(self.bogus_mailbox.messages_spam), 0) - 
handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(self.async_task.call_count, 1) self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) @@ -281,7 +289,7 @@ class TestMail(TestCase): account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong") try: - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) except MailError as e: self.assertTrue(str(e).startswith("Error while authenticating account")) else: @@ -291,7 +299,7 @@ class TestMail(TestCase): rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh") try: - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) except MailError as e: self.assertTrue("uuuh does not exist" in str(e)) else: @@ -302,7 +310,7 @@ class TestMail(TestCase): rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim") try: - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) except MailError as e: self.assertTrue("Error while processing post-consume actions" in str(e)) else: @@ -316,7 +324,7 @@ class TestMail(TestCase): self.assertEqual(self.async_task.call_count, 0) self.assertEqual(len(self.bogus_mailbox.messages), 3) - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(self.async_task.call_count, 1) @@ -326,7 +334,7 @@ class TestMail(TestCase): rule.filter_body = "electronic" rule.save() self.assertEqual(len(self.bogus_mailbox.messages), 3) - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(self.async_task.call_count, 2) @@ -336,7 +344,7 @@ class TestMail(TestCase): 
rule.filter_body = None rule.save() self.assertEqual(len(self.bogus_mailbox.messages), 3) - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(len(self.bogus_mailbox.messages), 1) self.assertEqual(self.async_task.call_count, 4) @@ -347,6 +355,6 @@ class TestMail(TestCase): rule.filter_subject = "Invoice" rule.save() self.assertEqual(len(self.bogus_mailbox.messages), 3) - handle_mail_account(account) + self.mail_account_handler.handle_mail_account(account) self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(self.async_task.call_count, 5) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index d07f9e4b3..73b2414d5 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -86,7 +86,7 @@ class RasterisedDocumentParser(DocumentParser): return self._text if not settings.OCR_ALWAYS and self._is_ocred(): - self.log("info", "Skipping OCR, using Text from PDF") + self.log("debug", "Skipping OCR, using Text from PDF") self._text = get_text_from_pdf(self.document_path) return self._text @@ -98,7 +98,7 @@ class RasterisedDocumentParser(DocumentParser): try: sample_page_index = int(len(images) / 2) - self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images))) + self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images))) sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0] guessed_language = self._guess_language(sample_page_text) @@ -107,7 +107,7 @@ class RasterisedDocumentParser(DocumentParser): ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) elif ISO639[guessed_language] == settings.OCR_LANGUAGE: - self.log("info", "Detected language: {} (default language)".format(guessed_language)) + self.log("debug", "Detected language: {} (default 
language)".format(guessed_language)) ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): @@ -115,10 +115,10 @@ class RasterisedDocumentParser(DocumentParser): ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) else: - self.log("info", "Detected language: {}".format(guessed_language)) + self.log("debug", "Detected language: {}".format(guessed_language)) ocr_pages = self._ocr(images, ISO639[guessed_language]) - self.log("info", "OCR completed.") + self.log("debug", "OCR completed.") self._text = strip_excess_whitespace(" ".join(ocr_pages)) return self._text @@ -130,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser): Greyscale images are easier for Tesseract to OCR """ - self.log("info", "Converting document {} into greyscale images...".format(self.document_path)) + self.log("debug", "Converting document {} into greyscale images...".format(self.document_path)) # Convert PDF to multiple PNMs pnm = os.path.join(self.tempdir, "convert-%04d.pnm") @@ -148,7 +148,7 @@ class RasterisedDocumentParser(DocumentParser): if f.endswith(".pnm"): pnms.append(os.path.join(self.tempdir, f)) - self.log("info", "Running unpaper on {} pages...".format(len(pnms))) + self.log("debug", "Running unpaper on {} pages...".format(len(pnms))) # Run unpaper in parallel on converted images with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: @@ -161,11 +161,11 @@ class RasterisedDocumentParser(DocumentParser): guess = langdetect.detect(text) return guess except Exception as e: - self.log('debug', "Language detection failed with: {}".format(e)) + self.log('warning', "Language detection failed with: {}".format(e)) return None def _ocr(self, imgs, lang): - self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang)) + self.log("debug", "Performing OCR on {} page(s) with language 
{}".format(len(imgs), lang)) with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: r = pool.map(image_to_string, itertools.product(imgs, [lang])) return r @@ -180,7 +180,7 @@ class RasterisedDocumentParser(DocumentParser): images_copy = list(images) del images_copy[sample_page_index] if images_copy: - self.log('info', 'Continuing ocr with default language.') + self.log('debug', 'Continuing ocr with default language.') ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) ocr_pages.insert(sample_page_index, sample_page) return ocr_pages