split handle_message function

This commit is contained in:
phail 2022-11-20 16:09:46 +01:00
parent 1fa735eb23
commit df101f5e7a

View File

@ -351,11 +351,15 @@ class MailAccountHandler(LoggingMixin):
return total_processed_files return total_processed_files
def handle_message(self, message, rule: MailRule) -> int: def handle_message(self, message, rule: MailRule) -> int:
processed_elements = 0
# Skip Message handling when only attachments are to be processed but
# message doesn't have any.
if ( if (
not message.attachments not message.attachments
and rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY and rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY
): ):
return 0 return processed_elements
self.log( self.log(
"debug", "debug",
@ -368,130 +372,162 @@ class MailAccountHandler(LoggingMixin):
tag_ids = [tag.id for tag in rule.assign_tags.all()] tag_ids = [tag.id for tag in rule.assign_tags.all()]
doc_type = rule.assign_document_type doc_type = rule.assign_document_type
processed_attachments = 0
if ( if (
rule.consumption_scope == MailRule.ConsumptionScope.EML_ONLY rule.consumption_scope == MailRule.ConsumptionScope.EML_ONLY
or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING
): ):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True) processed_elements += self.process_eml(
_, temp_filename = tempfile.mkstemp( message,
prefix="paperless-mail-", rule,
dir=settings.SCRATCH_DIR, correspondent,
suffix=".eml", tag_ids,
doc_type,
) )
with open(temp_filename, "wb") as f:
# Move "From"-header to beginning of file
# TODO: This ugly workaround is needed because the parser is
# chosen only by the mime_type detected via magic
# (see documents/consumer.py "mime_type = magic.from_file")
# Unfortunately magic sometimes fails to detect the mime
# type of .eml files correctly as message/rfc822 and instead
# detects text/plain.
# This also affects direct file consumption of .eml files
# which are not treated with this workaround.
from_element = None
for i, header in enumerate(message.obj._headers):
if header[0] == "From":
from_element = i
if from_element:
new_headers = [message.obj._headers.pop(from_element)]
new_headers += message.obj._headers
message.obj._headers = new_headers
f.write(message.obj.as_bytes())
self.log(
"info",
f"Rule {rule}: "
f"Consuming eml from mail "
f"{message.subject} from {message.from_}",
)
consume_file.delay(
path=temp_filename,
override_filename=pathvalidate.sanitize_filename(
message.subject + ".eml",
),
override_title=message.subject,
override_correspondent_id=correspondent.id if correspondent else None,
override_document_type_id=doc_type.id if doc_type else None,
override_tag_ids=tag_ids,
)
processed_attachments += 1
if ( if (
rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY
or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING
): ):
for att in message.attachments: processed_elements += self.process_attachments(
message,
rule,
correspondent,
tag_ids,
doc_type,
)
if ( return processed_elements
not att.content_disposition == "attachment"
def process_attachments(
    self,
    message: MailMessage,
    rule: MailRule,
    correspondent,
    tag_ids,
    doc_type,
) -> int:
    """
    Queue the eligible attachments of ``message`` for consumption.

    An attachment is skipped when:

    * the rule only accepts real attachments and the part's content
      disposition is not ``"attachment"``,
    * the rule has a filename filter and the (lower-cased) filename does
      not match it, or
    * the mime type guessed from the payload is not supported.

    Every remaining attachment is written to a temporary file inside
    ``settings.SCRATCH_DIR`` and handed to ``consume_file.delay`` together
    with the overrides derived from the rule.

    :param message: the mail whose attachments are inspected
    :param rule: the mail rule that is currently being applied
    :param correspondent: correspondent to assign, or a falsy value
    :param tag_ids: ids of the tags to assign
    :param doc_type: document type to assign, or a falsy value
    :return: the number of attachments that were queued for consumption
    """
    processed_attachments = 0

    for att in message.attachments:
        if (
            att.content_disposition != "attachment"
            and rule.attachment_type
            == MailRule.AttachmentProcessing.ATTACHMENTS_ONLY
        ):
            self.log(
                "debug",
                f"Rule {rule}: "
                f"Skipping attachment {att.filename} "
                f"with content disposition {att.content_disposition}",
            )
            continue

        if rule.filter_attachment_filename:
            # Force the filename and pattern to the lowercase
            # as this is system dependent otherwise
            if not fnmatch(
                att.filename.lower(),
                rule.filter_attachment_filename.lower(),
            ):
                continue

        title = self.get_title(message, att, rule)

        # don't trust the content type of the attachment. Could be
        # generic application/octet-stream.
        mime_type = magic.from_buffer(att.payload, mime=True)

        if not is_mime_type_supported(mime_type):
            self.log(
                "debug",
                f"Rule {rule}: "
                f"Skipping attachment {att.filename} "
                f"since guessed mime type {mime_type} is not supported "
                f"by paperless",
            )
            continue

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        fd, temp_filename = tempfile.mkstemp(
            prefix="paperless-mail-",
            dir=settings.SCRATCH_DIR,
        )
        # mkstemp hands back an already open OS-level descriptor. Write
        # through it so that it is closed again instead of being leaked
        # once per attachment.
        with os.fdopen(fd, "wb") as f:
            f.write(att.payload)

        self.log(
            "info",
            f"Rule {rule}: "
            f"Consuming attachment {att.filename} from mail "
            f"{message.subject} from {message.from_}",
        )

        consume_file.delay(
            path=temp_filename,
            override_filename=pathvalidate.sanitize_filename(
                att.filename,
            ),
            override_title=title,
            override_correspondent_id=correspondent.id
            if correspondent
            else None,
            override_document_type_id=doc_type.id if doc_type else None,
            override_tag_ids=tag_ids,
        )

        processed_attachments += 1

    # The caller adds this value to its own counter, so the count has to
    # be returned explicitly (falling off the end would return None).
    return processed_attachments
def process_eml(
    self,
    message: MailMessage,
    rule: MailRule,
    correspondent,
    tag_ids,
    doc_type,
) -> int:
    """
    Queue the complete mail as an .eml file for consumption.

    The raw message is written to a temporary ``.eml`` file inside
    ``settings.SCRATCH_DIR`` and handed to ``consume_file.delay``. The mail
    subject is used both as the title and (with ``.eml`` appended and
    sanitized) as the filename.

    Note that the header list of ``message.obj`` is reordered in place so
    that the "From" header comes first.

    :param message: the mail to store
    :param rule: the mail rule that is currently being applied
    :param correspondent: correspondent to assign, or a falsy value
    :param tag_ids: ids of the tags to assign
    :param doc_type: document type to assign, or a falsy value
    :return: the number of queued elements, which is always 1
    """
    # Move "From"-header to beginning of file
    # TODO: This ugly workaround is needed because the parser is
    #      chosen only by the mime_type detected via magic
    #      (see documents/consumer.py "mime_type = magic.from_file")
    #      Unfortunately magic sometimes fails to detect the mime
    #      type of .eml files correctly as message/rfc822 and instead
    #      detects text/plain.
    #      This also affects direct file consumption of .eml files
    #      which are not treated with this workaround.
    headers = message.obj._headers
    from_element = None
    for i, header in enumerate(headers):
        if header[0] == "From":
            from_element = i
    # Compare against None explicitly: index 0 is a valid position. Moving
    # a header that is already first leaves the order unchanged.
    if from_element is not None:
        new_headers = [headers.pop(from_element)]
        new_headers += headers
        message.obj._headers = new_headers

    os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
    fd, temp_filename = tempfile.mkstemp(
        prefix="paperless-mail-",
        dir=settings.SCRATCH_DIR,
        suffix=".eml",
    )
    # mkstemp hands back an already open OS-level descriptor. Write
    # through it so that it is closed again instead of being leaked.
    with os.fdopen(fd, "wb") as f:
        f.write(message.obj.as_bytes())

    self.log(
        "info",
        f"Rule {rule}: "
        f"Consuming eml from mail "
        f"{message.subject} from {message.from_}",
    )

    consume_file.delay(
        path=temp_filename,
        override_filename=pathvalidate.sanitize_filename(
            message.subject + ".eml",
        ),
        override_title=message.subject,
        override_correspondent_id=correspondent.id if correspondent else None,
        override_document_type_id=doc_type.id if doc_type else None,
        override_tag_ids=tag_ids,
    )

    return 1