Updated logging; added logging for the mail consumer to see what's happening

This commit is contained in:
Jonas Winkler 2020-11-18 13:23:30 +01:00
parent 7408d3855c
commit 680ab3d56b
10 changed files with 214 additions and 152 deletions

View File

@ -175,8 +175,6 @@ then put the path to that script in ``paperless.conf`` with the variable name
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
``PAPERLESS_POST_CONSUME_SCRIPT``. ``PAPERLESS_POST_CONSUME_SCRIPT``.
.. TODO HYPEREF TO CONFIG
.. important:: .. important::
These scripts are executed in a **blocking** process, which means that if These scripts are executed in a **blocking** process, which means that if
@ -319,6 +317,6 @@ for use in filenames.
.. code:: .. code::
PAPERLESS_FILENAME_FORMAT=../../my/custom/location/{title} PAPERLESS_FILENAME_FORMAT=../../my/custom/location/{title}
However, keep in mind that inside docker, if files get stored outside of the However, keep in mind that inside docker, if files get stored outside of the
predefined volumes, they will be lost after a restart of paperless. predefined volumes, they will be lost after a restart of paperless.

View File

@ -12,6 +12,7 @@ from django.utils import timezone
from paperless.db import GnuPG from paperless.db import GnuPG
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
from .file_handling import generate_filename, create_source_path_directory from .file_handling import generate_filename, create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class from .parsers import ParseError, get_parser_class
from .signals import ( from .signals import (
@ -24,12 +25,10 @@ class ConsumerError(Exception):
pass pass
class Consumer: class Consumer(LoggingMixin):
def __init__(self): def __init__(self):
super().__init__()
self.logger = logging.getLogger(__name__)
self.logging_group = None
self.path = None self.path = None
self.filename = None self.filename = None
self.override_title = None self.override_title = None
@ -74,11 +73,6 @@ class Consumer:
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def try_consume_file(self, def try_consume_file(self,
path, path,
override_filename=None, override_filename=None,
@ -100,7 +94,7 @@ class Consumer:
# this is for grouping logging entries for this particular file # this is for grouping logging entries for this particular file
# together. # together.
self.logging_group = uuid.uuid4() self.renew_logging_group()
# Make sure that preconditions for consuming the file are met. # Make sure that preconditions for consuming the file are met.

View File

@ -1,4 +1,5 @@
import logging import logging
import uuid
class PaperlessHandler(logging.Handler): class PaperlessHandler(logging.Handler):
@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler):
kwargs["group"] = record.group kwargs["group"] = record.group
Log.objects.create(**kwargs) Log.objects.create(**kwargs)
class LoggingMixin:
    """
    Mixin providing grouped logging for the class that inherits it.

    Records are sent to the logger named ``<module>.<ClassName>`` of the
    concrete class, and each record carries the current ``logging_group``
    in its ``group`` attribute.
    """

    # Identifier attached to every emitted record; stays None until
    # renew_logging_group() is called or a subclass assigns a value.
    logging_group = None

    def renew_logging_group(self):
        """Begin a new logging group by assigning a fresh random UUID."""
        self.logging_group = uuid.uuid4()

    def log(self, level, message):
        """
        Emit ``message`` via the logger method named by ``level``.

        ``level`` is the name of a logger method such as "debug" or "info";
        it is looked up on the logger with ``getattr``.
        """
        cls = type(self)
        logger_name = "{}.{}".format(cls.__module__, cls.__name__)
        emit = getattr(logging.getLogger(logger_name), level)
        emit(message, extra={"group": self.logging_group})

View File

@ -20,6 +20,7 @@ from django.utils import timezone
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits # - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ, with ZZZZ being 4 digits # - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits # - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration from documents.signals import document_consumer_declaration
# TODO: isnt there a date parsing library for this? # TODO: isnt there a date parsing library for this?
@ -101,17 +102,17 @@ class ParseError(Exception):
pass pass
class DocumentParser: class DocumentParser(LoggingMixin):
""" """
Subclass this to make your own parser. Have a look at Subclass this to make your own parser. Have a look at
`paperless_tesseract.parsers` for inspiration. `paperless_tesseract.parsers` for inspiration.
""" """
def __init__(self, path, logging_group): def __init__(self, path, logging_group):
super().__init__()
self.logging_group = logging_group
self.document_path = path self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
self.logger = logging.getLogger(__name__)
self.logging_group = logging_group
def get_thumbnail(self): def get_thumbnail(self):
""" """
@ -222,11 +223,6 @@ class DocumentParser:
return date return date
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def cleanup(self): def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir)) self.log("debug", "Deleting directory {}".format(self.tempdir))
shutil.rmtree(self.tempdir) shutil.rmtree(self.tempdir)

View File

@ -257,6 +257,14 @@ LOGGING = {
"handlers": ["dbhandler", "streamhandler"], "handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG" "level": "DEBUG"
}, },
"paperless_mail": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
"paperless_tesseract": {
"handlers": ["dbhandler", "streamhandler"],
"level": "DEBUG"
},
}, },
} }

View File

@ -1,18 +1,7 @@
from django.contrib import admin from django.contrib import admin
from django import forms
from paperless_mail.models import MailAccount, MailRule from paperless_mail.models import MailAccount, MailRule
class MailAccountForm(forms.ModelForm):
password = forms.CharField(widget=forms.PasswordInput)
class Meta:
fields = '__all__'
model = MailAccount
class MailAccountAdmin(admin.ModelAdmin): class MailAccountAdmin(admin.ModelAdmin):
list_display = ("name", "imap_server", "username") list_display = ("name", "imap_server", "username")

View File

@ -8,6 +8,7 @@ from django_q.tasks import async_task
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \ from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
MailboxFolderSelectError MailboxFolderSelectError
from documents.loggers import LoggingMixin
from documents.models import Correspondent from documents.models import Correspondent
from paperless_mail.models import MailAccount, MailRule from paperless_mail.models import MailAccount, MailRule
@ -83,72 +84,6 @@ def make_criterias(rule):
return {**criterias, **get_rule_action(rule).get_criteria()} return {**criterias, **get_rule_action(rule).get_criteria()}
def handle_mail_account(account):
if account.imap_security == MailAccount.IMAP_SECURITY_NONE:
mailbox = MailBoxUnencrypted(account.imap_server, account.imap_port)
elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS:
mailbox = MailBox(account.imap_server, account.imap_port, starttls=True)
elif account.imap_security == MailAccount.IMAP_SECURITY_SSL:
mailbox = MailBox(account.imap_server, account.imap_port)
else:
raise ValueError("Unknown IMAP security")
total_processed_files = 0
with mailbox as M:
try:
M.login(account.username, account.password)
except Exception:
raise MailError(
f"Error while authenticating account {account.name}")
for rule in account.rules.all():
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
f"Rule {rule.name}: Folder {rule.folder} does not exist "
f"in account {account.name}")
criterias = make_criterias(rule)
try:
messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while fetching folder "
f"{rule.folder} of account {account.name}")
post_consume_messages = []
for message in messages:
try:
processed_files = handle_message(message, rule)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing mail "
f"{message.uid} of account {account.name}")
if processed_files > 0:
post_consume_messages.append(message.uid)
total_processed_files += processed_files
try:
get_rule_action(rule).post_consume(
M,
post_consume_messages,
rule.action_parameter)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing post-consume "
f"actions for account {account.name}")
return total_processed_files
def get_title(message, att, rule): def get_title(message, att, rule):
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
title = message.subject title = message.subject
@ -189,39 +124,155 @@ def get_correspondent(message, rule):
return correspondent return correspondent
def handle_message(message, rule): def get_mailbox(server, port, security):
if not message.attachments: if security == MailAccount.IMAP_SECURITY_NONE:
return 0 mailbox = MailBoxUnencrypted(server, port)
elif security == MailAccount.IMAP_SECURITY_STARTTLS:
mailbox = MailBox(server, port, starttls=True)
elif security == MailAccount.IMAP_SECURITY_SSL:
mailbox = MailBox(server, port)
else:
raise ValueError("Unknown IMAP security")
return mailbox
correspondent = get_correspondent(message, rule) class MailAccountHandler(LoggingMixin):
tag = rule.assign_tag
doc_type = rule.assign_document_type
processed_attachments = 0 def handle_mail_account(self, account):
for att in message.attachments: self.renew_logging_group()
title = get_title(message, att, rule) self.log('debug', f"Processing mail account {account}")
# TODO: check with parsers what files types are supported total_processed_files = 0
if att.content_type == 'application/pdf':
os.makedirs(settings.SCRATCH_DIR, exist_ok=True) with get_mailbox(account.imap_server,
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR) account.imap_port,
with open(temp_filename, 'wb') as f: account.imap_security) as M:
f.write(att.payload)
async_task( try:
"documents.tasks.consume_file", M.login(account.username, account.password)
path=temp_filename, except Exception:
override_filename=att.filename, raise MailError(
override_title=title, f"Error while authenticating account {account.name}")
override_correspondent_id=correspondent.id if correspondent else None,
override_document_type_id=doc_type.id if doc_type else None,
override_tag_ids=[tag.id] if tag else None,
task_name=f"Mail: {att.filename}"
)
processed_attachments += 1 self.log('debug', f"Account {account}: Processing "
f"{account.rules.count()} rule(s)")
return processed_attachments for rule in account.rules.all():
self.log(
'debug',
f"Account {account}: Processing rule {rule.name}")
self.log(
'debug',
f"Rule {account}.{rule}: Selecting folder {rule.folder}")
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
raise MailError(
f"Rule {rule.name}: Folder {rule.folder} does not exist "
f"in account {account.name}")
criterias = make_criterias(rule)
self.log(
'debug',
f"Rule {account}.{rule}: Searching folder with criteria "
f"{str(AND(**criterias))}")
try:
messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while fetching folder "
f"{rule.folder} of account {account.name}")
post_consume_messages = []
mails_processed = 0
for message in messages:
try:
processed_files = self.handle_message(message, rule)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing mail "
f"{message.uid} of account {account.name}")
if processed_files > 0:
post_consume_messages.append(message.uid)
total_processed_files += processed_files
mails_processed += 1
self.log(
'debug',
f"Rule {account}.{rule}: Processed {mails_processed} "
f"matching mail(s)")
self.log(
'debug',
f"Rule {account}.{rule}: Running mail actions on "
f"{len(post_consume_messages)} mails")
try:
get_rule_action(rule).post_consume(
M,
post_consume_messages,
rule.action_parameter)
except Exception:
raise MailError(
f"Rule {rule.name}: Error while processing post-consume "
f"actions for account {account.name}")
return total_processed_files
def handle_message(self, message, rule):
if not message.attachments:
return 0
self.log(
'debug',
f"Rule {rule.account}.{rule}: "
f"Processing mail {message.subject} from {message.from_} with "
f"{len(message.attachments)} attachment(s)")
correspondent = get_correspondent(message, rule)
tag = rule.assign_tag
doc_type = rule.assign_document_type
processed_attachments = 0
for att in message.attachments:
title = get_title(message, att, rule)
# TODO: check with parsers what files types are supported
if att.content_type == 'application/pdf':
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
with open(temp_filename, 'wb') as f:
f.write(att.payload)
self.log(
'info',
f"Rule {rule.account}.{rule}: "
f"Consuming attachment {att.filename} from mail "
f"{message.subject} from {message.from_}")
async_task(
"documents.tasks.consume_file",
path=temp_filename,
override_filename=att.filename,
override_title=title,
override_correspondent_id=correspondent.id if correspondent else None,
override_document_type_id=doc_type.id if doc_type else None,
override_tag_ids=[tag.id] if tag else None,
task_name=f"Mail: {att.filename}"
)
processed_attachments += 1
return processed_attachments

View File

@ -1,13 +1,14 @@
import logging import logging
from paperless_mail import mail from paperless_mail import mail
from paperless_mail.mail import MailAccountHandler
from paperless_mail.models import MailAccount from paperless_mail.models import MailAccount
def process_mail_accounts(): def process_mail_accounts():
total_new_documents = 0 total_new_documents = 0
for account in MailAccount.objects.all(): for account in MailAccount.objects.all():
total_new_documents += mail.handle_mail_account(account) total_new_documents += MailAccountHandler().handle_mail_account(account)
if total_new_documents > 0: if total_new_documents > 0:
return f"Added {total_new_documents} document(s)." return f"Added {total_new_documents} document(s)."
@ -18,6 +19,6 @@ def process_mail_accounts():
def process_mail_account(name): def process_mail_account(name):
account = MailAccount.objects.find(name=name) account = MailAccount.objects.find(name=name)
if account: if account:
mail.handle_mail_account(account) MailAccountHandler().handle_mail_account(account)
else: else:
logging.error("Unknown mail acccount: {}".format(name)) logging.error("Unknown mail acccount: {}".format(name))

View File

@ -7,7 +7,7 @@ from django.test import TestCase
from imap_tools import MailMessageFlags, MailboxFolderSelectError from imap_tools import MailMessageFlags, MailboxFolderSelectError
from documents.models import Correspondent from documents.models import Correspondent
from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
from paperless_mail.models import MailRule, MailAccount from paperless_mail.models import MailRule, MailAccount
@ -126,6 +126,8 @@ class TestMail(TestCase):
self.reset_bogus_mailbox() self.reset_bogus_mailbox()
self.mail_account_handler = MailAccountHandler()
def reset_bogus_mailbox(self): def reset_bogus_mailbox(self):
self.bogus_mailbox.messages = [] self.bogus_mailbox.messages = []
self.bogus_mailbox.messages_spam = [] self.bogus_mailbox.messages_spam = []
@ -182,6 +184,7 @@ class TestMail(TestCase):
def test_handle_message(self): def test_handle_message(self):
message = namedtuple('MailMessage', []) message = namedtuple('MailMessage', [])
message.subject = "the message title" message.subject = "the message title"
message.from_ = "Myself"
att = namedtuple('Attachment', []) att = namedtuple('Attachment', [])
att.filename = "test1.pdf" att.filename = "test1.pdf"
@ -200,9 +203,10 @@ class TestMail(TestCase):
message.attachments = [att, att2, att3] message.attachments = [att, att2, att3]
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME) account = MailAccount()
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)
result = handle_message(message, rule) result = self.mail_account_handler.handle_message(message, rule)
self.assertEqual(result, 2) self.assertEqual(result, 2)
@ -224,7 +228,7 @@ class TestMail(TestCase):
message.attachments = [] message.attachments = []
rule = MailRule() rule = MailRule()
result = handle_message(message, rule) result = self.mail_account_handler.handle_message(message, rule)
self.assertFalse(m.called) self.assertFalse(m.called)
self.assertEqual(result, 0) self.assertEqual(result, 0)
@ -235,11 +239,13 @@ class TestMail(TestCase):
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ) rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(self.async_task.call_count, 0) self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2) self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2) self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0) self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_handle_mail_account_delete(self): def test_handle_mail_account_delete(self):
@ -249,7 +255,7 @@ class TestMail(TestCase):
self.assertEqual(self.async_task.call_count, 0) self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2) self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.messages), 1) self.assertEqual(len(self.bogus_mailbox.messages), 1)
@ -258,11 +264,13 @@ class TestMail(TestCase):
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice") rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(self.async_task.call_count, 0) self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2) self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 1) self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1) self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_handle_mail_account_move(self): def test_handle_mail_account_move(self):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret") account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
@ -272,7 +280,7 @@ class TestMail(TestCase):
self.assertEqual(self.async_task.call_count, 0) self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 0) self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 1) self.assertEqual(self.async_task.call_count, 1)
self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
@ -281,7 +289,7 @@ class TestMail(TestCase):
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong") account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")
try: try:
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
except MailError as e: except MailError as e:
self.assertTrue(str(e).startswith("Error while authenticating account")) self.assertTrue(str(e).startswith("Error while authenticating account"))
else: else:
@ -291,7 +299,7 @@ class TestMail(TestCase):
rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh") rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
try: try:
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
except MailError as e: except MailError as e:
self.assertTrue("uuuh does not exist" in str(e)) self.assertTrue("uuuh does not exist" in str(e))
else: else:
@ -302,7 +310,7 @@ class TestMail(TestCase):
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim") rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
try: try:
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
except MailError as e: except MailError as e:
self.assertTrue("Error while processing post-consume actions" in str(e)) self.assertTrue("Error while processing post-consume actions" in str(e))
else: else:
@ -316,7 +324,7 @@ class TestMail(TestCase):
self.assertEqual(self.async_task.call_count, 0) self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 1) self.assertEqual(self.async_task.call_count, 1)
@ -326,7 +334,7 @@ class TestMail(TestCase):
rule.filter_body = "electronic" rule.filter_body = "electronic"
rule.save() rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 2) self.assertEqual(self.async_task.call_count, 2)
@ -336,7 +344,7 @@ class TestMail(TestCase):
rule.filter_body = None rule.filter_body = None
rule.save() rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 1) self.assertEqual(len(self.bogus_mailbox.messages), 1)
self.assertEqual(self.async_task.call_count, 4) self.assertEqual(self.async_task.call_count, 4)
@ -347,6 +355,6 @@ class TestMail(TestCase):
rule.filter_subject = "Invoice" rule.filter_subject = "Invoice"
rule.save() rule.save()
self.assertEqual(len(self.bogus_mailbox.messages), 3) self.assertEqual(len(self.bogus_mailbox.messages), 3)
handle_mail_account(account) self.mail_account_handler.handle_mail_account(account)
self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(len(self.bogus_mailbox.messages), 2)
self.assertEqual(self.async_task.call_count, 5) self.assertEqual(self.async_task.call_count, 5)

View File

@ -86,7 +86,7 @@ class RasterisedDocumentParser(DocumentParser):
return self._text return self._text
if not settings.OCR_ALWAYS and self._is_ocred(): if not settings.OCR_ALWAYS and self._is_ocred():
self.log("info", "Skipping OCR, using Text from PDF") self.log("debug", "Skipping OCR, using Text from PDF")
self._text = get_text_from_pdf(self.document_path) self._text = get_text_from_pdf(self.document_path)
return self._text return self._text
@ -98,7 +98,7 @@ class RasterisedDocumentParser(DocumentParser):
try: try:
sample_page_index = int(len(images) / 2) sample_page_index = int(len(images) / 2)
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images))) self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0] sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text) guessed_language = self._guess_language(sample_page_text)
@ -107,7 +107,7 @@ class RasterisedDocumentParser(DocumentParser):
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif ISO639[guessed_language] == settings.OCR_LANGUAGE: elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
self.log("info", "Detected language: {} (default language)".format(guessed_language)) self.log("debug", "Detected language: {} (default language)".format(guessed_language))
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
@ -115,10 +115,10 @@ class RasterisedDocumentParser(DocumentParser):
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text) ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
else: else:
self.log("info", "Detected language: {}".format(guessed_language)) self.log("debug", "Detected language: {}".format(guessed_language))
ocr_pages = self._ocr(images, ISO639[guessed_language]) ocr_pages = self._ocr(images, ISO639[guessed_language])
self.log("info", "OCR completed.") self.log("debug", "OCR completed.")
self._text = strip_excess_whitespace(" ".join(ocr_pages)) self._text = strip_excess_whitespace(" ".join(ocr_pages))
return self._text return self._text
@ -130,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser):
Greyscale images are easier for Tesseract to OCR Greyscale images are easier for Tesseract to OCR
""" """
self.log("info", "Converting document {} into greyscale images...".format(self.document_path)) self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
# Convert PDF to multiple PNMs # Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm") pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
@ -148,7 +148,7 @@ class RasterisedDocumentParser(DocumentParser):
if f.endswith(".pnm"): if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f)) pnms.append(os.path.join(self.tempdir, f))
self.log("info", "Running unpaper on {} pages...".format(len(pnms))) self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
# Run unpaper in parallel on converted images # Run unpaper in parallel on converted images
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
@ -161,11 +161,11 @@ class RasterisedDocumentParser(DocumentParser):
guess = langdetect.detect(text) guess = langdetect.detect(text)
return guess return guess
except Exception as e: except Exception as e:
self.log('debug', "Language detection failed with: {}".format(e)) self.log('warning', "Language detection failed with: {}".format(e))
return None return None
def _ocr(self, imgs, lang): def _ocr(self, imgs, lang):
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang)) self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang])) r = pool.map(image_to_string, itertools.product(imgs, [lang]))
return r return r
@ -180,7 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
images_copy = list(images) images_copy = list(images)
del images_copy[sample_page_index] del images_copy[sample_page_index]
if images_copy: if images_copy:
self.log('info', 'Continuing ocr with default language.') self.log('debug', 'Continuing ocr with default language.')
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
ocr_pages.insert(sample_page_index, sample_page) ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages return ocr_pages