mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
updated logging, logging for the mail consumer to see whats happening
This commit is contained in:
parent
1b0233418b
commit
8908bc259e
@ -175,8 +175,6 @@ then put the path to that script in ``paperless.conf`` with the variable name
|
|||||||
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
|
of either ``PAPERLESS_PRE_CONSUME_SCRIPT`` or
|
||||||
``PAPERLESS_POST_CONSUME_SCRIPT``.
|
``PAPERLESS_POST_CONSUME_SCRIPT``.
|
||||||
|
|
||||||
.. TODO HYPEREF TO CONFIG
|
|
||||||
|
|
||||||
.. important::
|
.. important::
|
||||||
|
|
||||||
These scripts are executed in a **blocking** process, which means that if
|
These scripts are executed in a **blocking** process, which means that if
|
||||||
|
@ -12,6 +12,7 @@ from django.utils import timezone
|
|||||||
from paperless.db import GnuPG
|
from paperless.db import GnuPG
|
||||||
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
||||||
from .file_handling import generate_filename, create_source_path_directory
|
from .file_handling import generate_filename, create_source_path_directory
|
||||||
|
from .loggers import LoggingMixin
|
||||||
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
||||||
from .parsers import ParseError, get_parser_class
|
from .parsers import ParseError, get_parser_class
|
||||||
from .signals import (
|
from .signals import (
|
||||||
@ -24,12 +25,10 @@ class ConsumerError(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Consumer:
|
class Consumer(LoggingMixin):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
self.logger = logging.getLogger(__name__)
|
|
||||||
self.logging_group = None
|
|
||||||
self.path = None
|
self.path = None
|
||||||
self.filename = None
|
self.filename = None
|
||||||
self.override_title = None
|
self.override_title = None
|
||||||
@ -74,11 +73,6 @@ class Consumer:
|
|||||||
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
|
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
|
||||||
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
|
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
|
||||||
|
|
||||||
def log(self, level, message):
|
|
||||||
getattr(self.logger, level)(message, extra={
|
|
||||||
"group": self.logging_group
|
|
||||||
})
|
|
||||||
|
|
||||||
def try_consume_file(self,
|
def try_consume_file(self,
|
||||||
path,
|
path,
|
||||||
override_filename=None,
|
override_filename=None,
|
||||||
@ -100,7 +94,7 @@ class Consumer:
|
|||||||
# this is for grouping logging entries for this particular file
|
# this is for grouping logging entries for this particular file
|
||||||
# together.
|
# together.
|
||||||
|
|
||||||
self.logging_group = uuid.uuid4()
|
self.renew_logging_group()
|
||||||
|
|
||||||
# Make sure that preconditions for consuming the file are met.
|
# Make sure that preconditions for consuming the file are met.
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
|
||||||
class PaperlessHandler(logging.Handler):
|
class PaperlessHandler(logging.Handler):
|
||||||
@ -13,3 +14,19 @@ class PaperlessHandler(logging.Handler):
|
|||||||
kwargs["group"] = record.group
|
kwargs["group"] = record.group
|
||||||
|
|
||||||
Log.objects.create(**kwargs)
|
Log.objects.create(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class LoggingMixin:
|
||||||
|
|
||||||
|
logging_group = None
|
||||||
|
|
||||||
|
def renew_logging_group(self):
|
||||||
|
self.logging_group = uuid.uuid4()
|
||||||
|
|
||||||
|
def log(self, level, message):
|
||||||
|
target = ".".join([self.__class__.__module__, self.__class__.__name__])
|
||||||
|
logger = logging.getLogger(target)
|
||||||
|
|
||||||
|
getattr(logger, level)(message, extra={
|
||||||
|
"group": self.logging_group
|
||||||
|
})
|
||||||
|
@ -20,6 +20,7 @@ from django.utils import timezone
|
|||||||
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
# - XX. MONTH ZZZZ with XX being 1 or 2 and ZZZZ being 2 or 4 digits
|
||||||
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
# - MONTH ZZZZ, with ZZZZ being 4 digits
|
||||||
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
|
||||||
|
from documents.loggers import LoggingMixin
|
||||||
from documents.signals import document_consumer_declaration
|
from documents.signals import document_consumer_declaration
|
||||||
|
|
||||||
# TODO: isnt there a date parsing library for this?
|
# TODO: isnt there a date parsing library for this?
|
||||||
@ -101,17 +102,17 @@ class ParseError(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class DocumentParser:
|
class DocumentParser(LoggingMixin):
|
||||||
"""
|
"""
|
||||||
Subclass this to make your own parser. Have a look at
|
Subclass this to make your own parser. Have a look at
|
||||||
`paperless_tesseract.parsers` for inspiration.
|
`paperless_tesseract.parsers` for inspiration.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, path, logging_group):
|
def __init__(self, path, logging_group):
|
||||||
|
super().__init__()
|
||||||
|
self.logging_group = logging_group
|
||||||
self.document_path = path
|
self.document_path = path
|
||||||
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||||
self.logger = logging.getLogger(__name__)
|
|
||||||
self.logging_group = logging_group
|
|
||||||
|
|
||||||
def get_thumbnail(self):
|
def get_thumbnail(self):
|
||||||
"""
|
"""
|
||||||
@ -222,11 +223,6 @@ class DocumentParser:
|
|||||||
|
|
||||||
return date
|
return date
|
||||||
|
|
||||||
def log(self, level, message):
|
|
||||||
getattr(self.logger, level)(message, extra={
|
|
||||||
"group": self.logging_group
|
|
||||||
})
|
|
||||||
|
|
||||||
def cleanup(self):
|
def cleanup(self):
|
||||||
self.log("debug", "Deleting directory {}".format(self.tempdir))
|
self.log("debug", "Deleting directory {}".format(self.tempdir))
|
||||||
shutil.rmtree(self.tempdir)
|
shutil.rmtree(self.tempdir)
|
||||||
|
@ -257,6 +257,14 @@ LOGGING = {
|
|||||||
"handlers": ["dbhandler", "streamhandler"],
|
"handlers": ["dbhandler", "streamhandler"],
|
||||||
"level": "DEBUG"
|
"level": "DEBUG"
|
||||||
},
|
},
|
||||||
|
"paperless_mail": {
|
||||||
|
"handlers": ["dbhandler", "streamhandler"],
|
||||||
|
"level": "DEBUG"
|
||||||
|
},
|
||||||
|
"paperless_tesseract": {
|
||||||
|
"handlers": ["dbhandler", "streamhandler"],
|
||||||
|
"level": "DEBUG"
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,18 +1,7 @@
|
|||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django import forms
|
|
||||||
|
|
||||||
from paperless_mail.models import MailAccount, MailRule
|
from paperless_mail.models import MailAccount, MailRule
|
||||||
|
|
||||||
|
|
||||||
class MailAccountForm(forms.ModelForm):
|
|
||||||
|
|
||||||
password = forms.CharField(widget=forms.PasswordInput)
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
fields = '__all__'
|
|
||||||
model = MailAccount
|
|
||||||
|
|
||||||
|
|
||||||
class MailAccountAdmin(admin.ModelAdmin):
|
class MailAccountAdmin(admin.ModelAdmin):
|
||||||
|
|
||||||
list_display = ("name", "imap_server", "username")
|
list_display = ("name", "imap_server", "username")
|
||||||
|
@ -8,6 +8,7 @@ from django_q.tasks import async_task
|
|||||||
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
|
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
|
||||||
MailboxFolderSelectError
|
MailboxFolderSelectError
|
||||||
|
|
||||||
|
from documents.loggers import LoggingMixin
|
||||||
from documents.models import Correspondent
|
from documents.models import Correspondent
|
||||||
from paperless_mail.models import MailAccount, MailRule
|
from paperless_mail.models import MailAccount, MailRule
|
||||||
|
|
||||||
@ -83,72 +84,6 @@ def make_criterias(rule):
|
|||||||
return {**criterias, **get_rule_action(rule).get_criteria()}
|
return {**criterias, **get_rule_action(rule).get_criteria()}
|
||||||
|
|
||||||
|
|
||||||
def handle_mail_account(account):
|
|
||||||
|
|
||||||
if account.imap_security == MailAccount.IMAP_SECURITY_NONE:
|
|
||||||
mailbox = MailBoxUnencrypted(account.imap_server, account.imap_port)
|
|
||||||
elif account.imap_security == MailAccount.IMAP_SECURITY_STARTTLS:
|
|
||||||
mailbox = MailBox(account.imap_server, account.imap_port, starttls=True)
|
|
||||||
elif account.imap_security == MailAccount.IMAP_SECURITY_SSL:
|
|
||||||
mailbox = MailBox(account.imap_server, account.imap_port)
|
|
||||||
else:
|
|
||||||
raise ValueError("Unknown IMAP security")
|
|
||||||
|
|
||||||
total_processed_files = 0
|
|
||||||
|
|
||||||
with mailbox as M:
|
|
||||||
|
|
||||||
try:
|
|
||||||
M.login(account.username, account.password)
|
|
||||||
except Exception:
|
|
||||||
raise MailError(
|
|
||||||
f"Error while authenticating account {account.name}")
|
|
||||||
|
|
||||||
for rule in account.rules.all():
|
|
||||||
|
|
||||||
try:
|
|
||||||
M.folder.set(rule.folder)
|
|
||||||
except MailboxFolderSelectError:
|
|
||||||
raise MailError(
|
|
||||||
f"Rule {rule.name}: Folder {rule.folder} does not exist "
|
|
||||||
f"in account {account.name}")
|
|
||||||
|
|
||||||
criterias = make_criterias(rule)
|
|
||||||
|
|
||||||
try:
|
|
||||||
messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
|
|
||||||
except Exception:
|
|
||||||
raise MailError(
|
|
||||||
f"Rule {rule.name}: Error while fetching folder "
|
|
||||||
f"{rule.folder} of account {account.name}")
|
|
||||||
|
|
||||||
post_consume_messages = []
|
|
||||||
|
|
||||||
for message in messages:
|
|
||||||
try:
|
|
||||||
processed_files = handle_message(message, rule)
|
|
||||||
except Exception:
|
|
||||||
raise MailError(
|
|
||||||
f"Rule {rule.name}: Error while processing mail "
|
|
||||||
f"{message.uid} of account {account.name}")
|
|
||||||
if processed_files > 0:
|
|
||||||
post_consume_messages.append(message.uid)
|
|
||||||
|
|
||||||
total_processed_files += processed_files
|
|
||||||
try:
|
|
||||||
get_rule_action(rule).post_consume(
|
|
||||||
M,
|
|
||||||
post_consume_messages,
|
|
||||||
rule.action_parameter)
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
raise MailError(
|
|
||||||
f"Rule {rule.name}: Error while processing post-consume "
|
|
||||||
f"actions for account {account.name}")
|
|
||||||
|
|
||||||
return total_processed_files
|
|
||||||
|
|
||||||
|
|
||||||
def get_title(message, att, rule):
|
def get_title(message, att, rule):
|
||||||
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
|
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
|
||||||
title = message.subject
|
title = message.subject
|
||||||
@ -189,39 +124,155 @@ def get_correspondent(message, rule):
|
|||||||
return correspondent
|
return correspondent
|
||||||
|
|
||||||
|
|
||||||
def handle_message(message, rule):
|
def get_mailbox(server, port, security):
|
||||||
if not message.attachments:
|
if security == MailAccount.IMAP_SECURITY_NONE:
|
||||||
return 0
|
mailbox = MailBoxUnencrypted(server, port)
|
||||||
|
elif security == MailAccount.IMAP_SECURITY_STARTTLS:
|
||||||
|
mailbox = MailBox(server, port, starttls=True)
|
||||||
|
elif security == MailAccount.IMAP_SECURITY_SSL:
|
||||||
|
mailbox = MailBox(server, port)
|
||||||
|
else:
|
||||||
|
raise ValueError("Unknown IMAP security")
|
||||||
|
return mailbox
|
||||||
|
|
||||||
correspondent = get_correspondent(message, rule)
|
class MailAccountHandler(LoggingMixin):
|
||||||
tag = rule.assign_tag
|
|
||||||
doc_type = rule.assign_document_type
|
|
||||||
|
|
||||||
processed_attachments = 0
|
def handle_mail_account(self, account):
|
||||||
|
|
||||||
for att in message.attachments:
|
self.renew_logging_group()
|
||||||
|
|
||||||
title = get_title(message, att, rule)
|
self.log('debug', f"Processing mail account {account}")
|
||||||
|
|
||||||
# TODO: check with parsers what files types are supported
|
total_processed_files = 0
|
||||||
if att.content_type == 'application/pdf':
|
|
||||||
|
|
||||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
with get_mailbox(account.imap_server,
|
||||||
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
|
account.imap_port,
|
||||||
with open(temp_filename, 'wb') as f:
|
account.imap_security) as M:
|
||||||
f.write(att.payload)
|
|
||||||
|
|
||||||
async_task(
|
try:
|
||||||
"documents.tasks.consume_file",
|
M.login(account.username, account.password)
|
||||||
path=temp_filename,
|
except Exception:
|
||||||
override_filename=att.filename,
|
raise MailError(
|
||||||
override_title=title,
|
f"Error while authenticating account {account.name}")
|
||||||
override_correspondent_id=correspondent.id if correspondent else None,
|
|
||||||
override_document_type_id=doc_type.id if doc_type else None,
|
|
||||||
override_tag_ids=[tag.id] if tag else None,
|
|
||||||
task_name=f"Mail: {att.filename}"
|
|
||||||
)
|
|
||||||
|
|
||||||
processed_attachments += 1
|
self.log('debug', f"Account {account}: Processing "
|
||||||
|
f"{account.rules.count()} rule(s)")
|
||||||
|
|
||||||
return processed_attachments
|
for rule in account.rules.all():
|
||||||
|
self.log(
|
||||||
|
'debug',
|
||||||
|
f"Account {account}: Processing rule {rule.name}")
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
'debug',
|
||||||
|
f"Rule {account}.{rule}: Selecting folder {rule.folder}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
M.folder.set(rule.folder)
|
||||||
|
except MailboxFolderSelectError:
|
||||||
|
raise MailError(
|
||||||
|
f"Rule {rule.name}: Folder {rule.folder} does not exist "
|
||||||
|
f"in account {account.name}")
|
||||||
|
|
||||||
|
criterias = make_criterias(rule)
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
'debug',
|
||||||
|
f"Rule {account}.{rule}: Searching folder with criteria "
|
||||||
|
f"{str(AND(**criterias))}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
messages = M.fetch(criteria=AND(**criterias), mark_seen=False)
|
||||||
|
except Exception:
|
||||||
|
raise MailError(
|
||||||
|
f"Rule {rule.name}: Error while fetching folder "
|
||||||
|
f"{rule.folder} of account {account.name}")
|
||||||
|
|
||||||
|
post_consume_messages = []
|
||||||
|
|
||||||
|
mails_processed = 0
|
||||||
|
|
||||||
|
for message in messages:
|
||||||
|
try:
|
||||||
|
processed_files = self.handle_message(message, rule)
|
||||||
|
except Exception:
|
||||||
|
raise MailError(
|
||||||
|
f"Rule {rule.name}: Error while processing mail "
|
||||||
|
f"{message.uid} of account {account.name}")
|
||||||
|
if processed_files > 0:
|
||||||
|
post_consume_messages.append(message.uid)
|
||||||
|
|
||||||
|
total_processed_files += processed_files
|
||||||
|
mails_processed += 1
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
'debug',
|
||||||
|
f"Rule {account}.{rule}: Processed {mails_processed} "
|
||||||
|
f"matching mail(s)")
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
'debug',
|
||||||
|
f"Rule {account}.{rule}: Running mail actions on "
|
||||||
|
f"{len(post_consume_messages)} mails")
|
||||||
|
|
||||||
|
try:
|
||||||
|
get_rule_action(rule).post_consume(
|
||||||
|
M,
|
||||||
|
post_consume_messages,
|
||||||
|
rule.action_parameter)
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
raise MailError(
|
||||||
|
f"Rule {rule.name}: Error while processing post-consume "
|
||||||
|
f"actions for account {account.name}")
|
||||||
|
|
||||||
|
return total_processed_files
|
||||||
|
|
||||||
|
def handle_message(self, message, rule):
|
||||||
|
if not message.attachments:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
'debug',
|
||||||
|
f"Rule {rule.account}.{rule}: "
|
||||||
|
f"Processing mail {message.subject} from {message.from_} with "
|
||||||
|
f"{len(message.attachments)} attachment(s)")
|
||||||
|
|
||||||
|
correspondent = get_correspondent(message, rule)
|
||||||
|
tag = rule.assign_tag
|
||||||
|
doc_type = rule.assign_document_type
|
||||||
|
|
||||||
|
processed_attachments = 0
|
||||||
|
|
||||||
|
for att in message.attachments:
|
||||||
|
|
||||||
|
title = get_title(message, att, rule)
|
||||||
|
|
||||||
|
# TODO: check with parsers what files types are supported
|
||||||
|
if att.content_type == 'application/pdf':
|
||||||
|
|
||||||
|
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||||
|
_, temp_filename = tempfile.mkstemp(prefix="paperless-mail-", dir=settings.SCRATCH_DIR)
|
||||||
|
with open(temp_filename, 'wb') as f:
|
||||||
|
f.write(att.payload)
|
||||||
|
|
||||||
|
self.log(
|
||||||
|
'info',
|
||||||
|
f"Rule {rule.account}.{rule}: "
|
||||||
|
f"Consuming attachment {att.filename} from mail "
|
||||||
|
f"{message.subject} from {message.from_}")
|
||||||
|
|
||||||
|
async_task(
|
||||||
|
"documents.tasks.consume_file",
|
||||||
|
path=temp_filename,
|
||||||
|
override_filename=att.filename,
|
||||||
|
override_title=title,
|
||||||
|
override_correspondent_id=correspondent.id if correspondent else None,
|
||||||
|
override_document_type_id=doc_type.id if doc_type else None,
|
||||||
|
override_tag_ids=[tag.id] if tag else None,
|
||||||
|
task_name=f"Mail: {att.filename}"
|
||||||
|
)
|
||||||
|
|
||||||
|
processed_attachments += 1
|
||||||
|
|
||||||
|
return processed_attachments
|
||||||
|
@ -1,13 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
from paperless_mail import mail
|
from paperless_mail import mail
|
||||||
|
from paperless_mail.mail import MailAccountHandler
|
||||||
from paperless_mail.models import MailAccount
|
from paperless_mail.models import MailAccount
|
||||||
|
|
||||||
|
|
||||||
def process_mail_accounts():
|
def process_mail_accounts():
|
||||||
total_new_documents = 0
|
total_new_documents = 0
|
||||||
for account in MailAccount.objects.all():
|
for account in MailAccount.objects.all():
|
||||||
total_new_documents += mail.handle_mail_account(account)
|
total_new_documents += MailAccountHandler().handle_mail_account(account)
|
||||||
|
|
||||||
if total_new_documents > 0:
|
if total_new_documents > 0:
|
||||||
return f"Added {total_new_documents} document(s)."
|
return f"Added {total_new_documents} document(s)."
|
||||||
@ -18,6 +19,6 @@ def process_mail_accounts():
|
|||||||
def process_mail_account(name):
|
def process_mail_account(name):
|
||||||
account = MailAccount.objects.find(name=name)
|
account = MailAccount.objects.find(name=name)
|
||||||
if account:
|
if account:
|
||||||
mail.handle_mail_account(account)
|
MailAccountHandler().handle_mail_account(account)
|
||||||
else:
|
else:
|
||||||
logging.error("Unknown mail acccount: {}".format(name))
|
logging.error("Unknown mail acccount: {}".format(name))
|
||||||
|
@ -7,7 +7,7 @@ from django.test import TestCase
|
|||||||
from imap_tools import MailMessageFlags, MailboxFolderSelectError
|
from imap_tools import MailMessageFlags, MailboxFolderSelectError
|
||||||
|
|
||||||
from documents.models import Correspondent
|
from documents.models import Correspondent
|
||||||
from paperless_mail.mail import get_correspondent, get_title, handle_message, handle_mail_account, MailError
|
from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
|
||||||
from paperless_mail.models import MailRule, MailAccount
|
from paperless_mail.models import MailRule, MailAccount
|
||||||
|
|
||||||
|
|
||||||
@ -126,6 +126,8 @@ class TestMail(TestCase):
|
|||||||
|
|
||||||
self.reset_bogus_mailbox()
|
self.reset_bogus_mailbox()
|
||||||
|
|
||||||
|
self.mail_account_handler = MailAccountHandler()
|
||||||
|
|
||||||
def reset_bogus_mailbox(self):
|
def reset_bogus_mailbox(self):
|
||||||
self.bogus_mailbox.messages = []
|
self.bogus_mailbox.messages = []
|
||||||
self.bogus_mailbox.messages_spam = []
|
self.bogus_mailbox.messages_spam = []
|
||||||
@ -182,6 +184,7 @@ class TestMail(TestCase):
|
|||||||
def test_handle_message(self):
|
def test_handle_message(self):
|
||||||
message = namedtuple('MailMessage', [])
|
message = namedtuple('MailMessage', [])
|
||||||
message.subject = "the message title"
|
message.subject = "the message title"
|
||||||
|
message.from_ = "Myself"
|
||||||
|
|
||||||
att = namedtuple('Attachment', [])
|
att = namedtuple('Attachment', [])
|
||||||
att.filename = "test1.pdf"
|
att.filename = "test1.pdf"
|
||||||
@ -200,9 +203,10 @@ class TestMail(TestCase):
|
|||||||
|
|
||||||
message.attachments = [att, att2, att3]
|
message.attachments = [att, att2, att3]
|
||||||
|
|
||||||
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME)
|
account = MailAccount()
|
||||||
|
rule = MailRule(assign_title_from=MailRule.TITLE_FROM_FILENAME, account=account)
|
||||||
|
|
||||||
result = handle_message(message, rule)
|
result = self.mail_account_handler.handle_message(message, rule)
|
||||||
|
|
||||||
self.assertEqual(result, 2)
|
self.assertEqual(result, 2)
|
||||||
|
|
||||||
@ -224,7 +228,7 @@ class TestMail(TestCase):
|
|||||||
message.attachments = []
|
message.attachments = []
|
||||||
rule = MailRule()
|
rule = MailRule()
|
||||||
|
|
||||||
result = handle_message(message, rule)
|
result = self.mail_account_handler.handle_message(message, rule)
|
||||||
|
|
||||||
self.assertFalse(m.called)
|
self.assertFalse(m.called)
|
||||||
self.assertEqual(result, 0)
|
self.assertEqual(result, 0)
|
||||||
@ -235,11 +239,13 @@ class TestMail(TestCase):
|
|||||||
|
|
||||||
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)
|
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MARK_READ)
|
||||||
|
|
||||||
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
self.assertEqual(self.async_task.call_count, 0)
|
self.assertEqual(self.async_task.call_count, 0)
|
||||||
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
|
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 2)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(self.async_task.call_count, 2)
|
self.assertEqual(self.async_task.call_count, 2)
|
||||||
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)
|
self.assertEqual(len(self.bogus_mailbox.fetch("UNSEEN", False)), 0)
|
||||||
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
|
|
||||||
def test_handle_mail_account_delete(self):
|
def test_handle_mail_account_delete(self):
|
||||||
|
|
||||||
@ -249,7 +255,7 @@ class TestMail(TestCase):
|
|||||||
|
|
||||||
self.assertEqual(self.async_task.call_count, 0)
|
self.assertEqual(self.async_task.call_count, 0)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(self.async_task.call_count, 2)
|
self.assertEqual(self.async_task.call_count, 2)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 1)
|
self.assertEqual(len(self.bogus_mailbox.messages), 1)
|
||||||
|
|
||||||
@ -258,11 +264,13 @@ class TestMail(TestCase):
|
|||||||
|
|
||||||
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")
|
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_FLAG, filter_subject="Invoice")
|
||||||
|
|
||||||
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
self.assertEqual(self.async_task.call_count, 0)
|
self.assertEqual(self.async_task.call_count, 0)
|
||||||
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
|
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(self.async_task.call_count, 1)
|
self.assertEqual(self.async_task.call_count, 1)
|
||||||
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)
|
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 1)
|
||||||
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
|
|
||||||
def test_handle_mail_account_move(self):
|
def test_handle_mail_account_move(self):
|
||||||
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
|
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="secret")
|
||||||
@ -272,7 +280,7 @@ class TestMail(TestCase):
|
|||||||
self.assertEqual(self.async_task.call_count, 0)
|
self.assertEqual(self.async_task.call_count, 0)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
|
self.assertEqual(len(self.bogus_mailbox.messages_spam), 0)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(self.async_task.call_count, 1)
|
self.assertEqual(self.async_task.call_count, 1)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
|
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
|
||||||
@ -281,7 +289,7 @@ class TestMail(TestCase):
|
|||||||
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")
|
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
except MailError as e:
|
except MailError as e:
|
||||||
self.assertTrue(str(e).startswith("Error while authenticating account"))
|
self.assertTrue(str(e).startswith("Error while authenticating account"))
|
||||||
else:
|
else:
|
||||||
@ -291,7 +299,7 @@ class TestMail(TestCase):
|
|||||||
rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
|
rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
except MailError as e:
|
except MailError as e:
|
||||||
self.assertTrue("uuuh does not exist" in str(e))
|
self.assertTrue("uuuh does not exist" in str(e))
|
||||||
else:
|
else:
|
||||||
@ -302,7 +310,7 @@ class TestMail(TestCase):
|
|||||||
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
|
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
except MailError as e:
|
except MailError as e:
|
||||||
self.assertTrue("Error while processing post-consume actions" in str(e))
|
self.assertTrue("Error while processing post-consume actions" in str(e))
|
||||||
else:
|
else:
|
||||||
@ -316,7 +324,7 @@ class TestMail(TestCase):
|
|||||||
self.assertEqual(self.async_task.call_count, 0)
|
self.assertEqual(self.async_task.call_count, 0)
|
||||||
|
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||||
self.assertEqual(self.async_task.call_count, 1)
|
self.assertEqual(self.async_task.call_count, 1)
|
||||||
|
|
||||||
@ -326,7 +334,7 @@ class TestMail(TestCase):
|
|||||||
rule.filter_body = "electronic"
|
rule.filter_body = "electronic"
|
||||||
rule.save()
|
rule.save()
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||||
self.assertEqual(self.async_task.call_count, 2)
|
self.assertEqual(self.async_task.call_count, 2)
|
||||||
|
|
||||||
@ -336,7 +344,7 @@ class TestMail(TestCase):
|
|||||||
rule.filter_body = None
|
rule.filter_body = None
|
||||||
rule.save()
|
rule.save()
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 1)
|
self.assertEqual(len(self.bogus_mailbox.messages), 1)
|
||||||
self.assertEqual(self.async_task.call_count, 4)
|
self.assertEqual(self.async_task.call_count, 4)
|
||||||
|
|
||||||
@ -347,6 +355,6 @@ class TestMail(TestCase):
|
|||||||
rule.filter_subject = "Invoice"
|
rule.filter_subject = "Invoice"
|
||||||
rule.save()
|
rule.save()
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
self.assertEqual(len(self.bogus_mailbox.messages), 3)
|
||||||
handle_mail_account(account)
|
self.mail_account_handler.handle_mail_account(account)
|
||||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||||
self.assertEqual(self.async_task.call_count, 5)
|
self.assertEqual(self.async_task.call_count, 5)
|
||||||
|
@ -86,7 +86,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
if not settings.OCR_ALWAYS and self._is_ocred():
|
if not settings.OCR_ALWAYS and self._is_ocred():
|
||||||
self.log("info", "Skipping OCR, using Text from PDF")
|
self.log("debug", "Skipping OCR, using Text from PDF")
|
||||||
self._text = get_text_from_pdf(self.document_path)
|
self._text = get_text_from_pdf(self.document_path)
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
@ -98,7 +98,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
try:
|
try:
|
||||||
|
|
||||||
sample_page_index = int(len(images) / 2)
|
sample_page_index = int(len(images) / 2)
|
||||||
self.log("info", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
|
self.log("debug", "Attempting language detection on page {} of {}...".format(sample_page_index + 1, len(images)))
|
||||||
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
|
sample_page_text = self._ocr([images[sample_page_index]], settings.OCR_LANGUAGE)[0]
|
||||||
guessed_language = self._guess_language(sample_page_text)
|
guessed_language = self._guess_language(sample_page_text)
|
||||||
|
|
||||||
@ -107,7 +107,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
||||||
|
|
||||||
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
|
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
|
||||||
self.log("info", "Detected language: {} (default language)".format(guessed_language))
|
self.log("debug", "Detected language: {} (default language)".format(guessed_language))
|
||||||
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
||||||
|
|
||||||
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
|
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
|
||||||
@ -115,10 +115,10 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
ocr_pages = self._complete_ocr_default_language(images, sample_page_index, sample_page_text)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.log("info", "Detected language: {}".format(guessed_language))
|
self.log("debug", "Detected language: {}".format(guessed_language))
|
||||||
ocr_pages = self._ocr(images, ISO639[guessed_language])
|
ocr_pages = self._ocr(images, ISO639[guessed_language])
|
||||||
|
|
||||||
self.log("info", "OCR completed.")
|
self.log("debug", "OCR completed.")
|
||||||
self._text = strip_excess_whitespace(" ".join(ocr_pages))
|
self._text = strip_excess_whitespace(" ".join(ocr_pages))
|
||||||
return self._text
|
return self._text
|
||||||
|
|
||||||
@ -130,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
Greyscale images are easier for Tesseract to OCR
|
Greyscale images are easier for Tesseract to OCR
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self.log("info", "Converting document {} into greyscale images...".format(self.document_path))
|
self.log("debug", "Converting document {} into greyscale images...".format(self.document_path))
|
||||||
|
|
||||||
# Convert PDF to multiple PNMs
|
# Convert PDF to multiple PNMs
|
||||||
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
|
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
|
||||||
@ -148,7 +148,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
if f.endswith(".pnm"):
|
if f.endswith(".pnm"):
|
||||||
pnms.append(os.path.join(self.tempdir, f))
|
pnms.append(os.path.join(self.tempdir, f))
|
||||||
|
|
||||||
self.log("info", "Running unpaper on {} pages...".format(len(pnms)))
|
self.log("debug", "Running unpaper on {} pages...".format(len(pnms)))
|
||||||
|
|
||||||
# Run unpaper in parallel on converted images
|
# Run unpaper in parallel on converted images
|
||||||
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||||
@ -161,11 +161,11 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
guess = langdetect.detect(text)
|
guess = langdetect.detect(text)
|
||||||
return guess
|
return guess
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log('debug', "Language detection failed with: {}".format(e))
|
self.log('warning', "Language detection failed with: {}".format(e))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _ocr(self, imgs, lang):
|
def _ocr(self, imgs, lang):
|
||||||
self.log("info", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
|
self.log("debug", "Performing OCR on {} page(s) with language {}".format(len(imgs), lang))
|
||||||
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||||
return r
|
return r
|
||||||
@ -180,7 +180,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
images_copy = list(images)
|
images_copy = list(images)
|
||||||
del images_copy[sample_page_index]
|
del images_copy[sample_page_index]
|
||||||
if images_copy:
|
if images_copy:
|
||||||
self.log('info', 'Continuing ocr with default language.')
|
self.log('debug', 'Continuing ocr with default language.')
|
||||||
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
|
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
|
||||||
ocr_pages.insert(sample_page_index, sample_page)
|
ocr_pages.insert(sample_page_index, sample_page)
|
||||||
return ocr_pages
|
return ocr_pages
|
||||||
|
Loading…
x
Reference in New Issue
Block a user