split handle_message function

2026-01-26 22:49:01 -06:00 · 2022-11-20 16:09:46 +01:00
parent 1fa735eb23
commit df101f5e7a
1 changed files with 142 additions and 106 deletions
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -351,11 +351,15 @@ class MailAccountHandler(LoggingMixin):
        return total_processed_files

    def handle_message(self, message, rule: MailRule) -> int:
+        processed_elements = 0
+
+        # Skip Message handling when only attachments are to be processed but
+        # message doesn't have any.
        if (
            not message.attachments
            and rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY
        ):
-            return 0
+            return processed_elements

        self.log(
            "debug",
@@ -368,130 +372,162 @@ class MailAccountHandler(LoggingMixin):
        tag_ids = [tag.id for tag in rule.assign_tags.all()]
        doc_type = rule.assign_document_type

-        processed_attachments = 0
-
        if (
            rule.consumption_scope == MailRule.ConsumptionScope.EML_ONLY
            or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING
        ):
-            os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-            _, temp_filename = tempfile.mkstemp(
-                prefix="paperless-mail-",
-                dir=settings.SCRATCH_DIR,
-                suffix=".eml",
+            processed_elements += self.process_eml(
+                message,
+                rule,
+                correspondent,
+                tag_ids,
+                doc_type,
            )
-            with open(temp_filename, "wb") as f:
-                # Move "From"-header to beginning of file
-                # TODO: This ugly workaround is needed because the parser is
-                #   chosen only by the mime_type detected via magic
-                #   (see documents/consumer.py "mime_type = magic.from_file")
-                #   Unfortunately magic sometimes fails to detect the mime
-                #   type of .eml files correctly as message/rfc822 and instead
-                #   detects text/plain.
-                #   This also effects direct file consumption of .eml files
-                #   which are not treated with this workaround.
-                from_element = None
-                for i, header in enumerate(message.obj._headers):
-                    if header[0] == "From":
-                        from_element = i
-                if from_element:
-                    new_headers = [message.obj._headers.pop(from_element)]
-                    new_headers += message.obj._headers
-                    message.obj._headers = new_headers
-
-                f.write(message.obj.as_bytes())
-
-            self.log(
-                "info",
-                f"Rule {rule}: "
-                f"Consuming eml from mail "
-                f"{message.subject} from {message.from_}",
-            )
-
-            consume_file.delay(
-                path=temp_filename,
-                override_filename=pathvalidate.sanitize_filename(
-                    message.subject + ".eml",
-                ),
-                override_title=message.subject,
-                override_correspondent_id=correspondent.id if correspondent else None,
-                override_document_type_id=doc_type.id if doc_type else None,
-                override_tag_ids=tag_ids,
-            )
-            processed_attachments += 1

        if (
            rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY
            or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING
        ):
-            for att in message.attachments:
+            processed_elements += self.process_attachments(
+                message,
+                rule,
+                correspondent,
+                tag_ids,
+                doc_type,
+            )

-                if (
-                    not att.content_disposition == "attachment"
-                    and rule.attachment_type
-                    == MailRule.AttachmentProcessing.ATTACHMENTS_ONLY
+        return processed_elements
+
+    def process_attachments(
+        self,
+        message: MailMessage,
+        rule: MailRule,
+        correspondent,
+        tag_ids,
+        doc_type,
+    ):
+        processed_attachments = 0
+        for att in message.attachments:
+
+            if (
+                not att.content_disposition == "attachment"
+                and rule.attachment_type
+                == MailRule.AttachmentProcessing.ATTACHMENTS_ONLY
+            ):
+                self.log(
+                    "debug",
+                    f"Rule {rule}: "
+                    f"Skipping attachment {att.filename} "
+                    f"with content disposition {att.content_disposition}",
+                )
+                continue
+
+            if rule.filter_attachment_filename:
+                # Force the filename and pattern to the lowercase
+                # as this is system dependent otherwise
+                if not fnmatch(
+                    att.filename.lower(),
+                    rule.filter_attachment_filename.lower(),
                ):
-                    self.log(
-                        "debug",
-                        f"Rule {rule}: "
-                        f"Skipping attachment {att.filename} "
-                        f"with content disposition {att.content_disposition}",
-                    )
                    continue

-                if rule.filter_attachment_filename:
-                    # Force the filename and pattern to the lowercase
-                    # as this is system dependent otherwise
-                    if not fnmatch(
-                        att.filename.lower(),
-                        rule.filter_attachment_filename.lower(),
-                    ):
-                        continue
+            title = self.get_title(message, att, rule)

-                title = self.get_title(message, att, rule)
+            # don't trust the content type of the attachment. Could be
+            # generic application/octet-stream.
+            mime_type = magic.from_buffer(att.payload, mime=True)

-                # don't trust the content type of the attachment. Could be
-                # generic application/octet-stream.
-                mime_type = magic.from_buffer(att.payload, mime=True)
+            if is_mime_type_supported(mime_type):

-                if is_mime_type_supported(mime_type):
+                os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
+                _, temp_filename = tempfile.mkstemp(
+                    prefix="paperless-mail-",
+                    dir=settings.SCRATCH_DIR,
+                )
+                with open(temp_filename, "wb") as f:
+                    f.write(att.payload)

-                    os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
-                    _, temp_filename = tempfile.mkstemp(
-                        prefix="paperless-mail-",
-                        dir=settings.SCRATCH_DIR,
-                    )
-                    with open(temp_filename, "wb") as f:
-                        f.write(att.payload)
+                self.log(
+                    "info",
+                    f"Rule {rule}: "
+                    f"Consuming attachment {att.filename} from mail "
+                    f"{message.subject} from {message.from_}",
+                )

-                    self.log(
-                        "info",
-                        f"Rule {rule}: "
-                        f"Consuming attachment {att.filename} from mail "
-                        f"{message.subject} from {message.from_}",
-                    )
+                consume_file.delay(
+                    path=temp_filename,
+                    override_filename=pathvalidate.sanitize_filename(
+                        att.filename,
+                    ),
+                    override_title=title,
+                    override_correspondent_id=correspondent.id
+                    if correspondent
+                    else None,
+                    override_document_type_id=doc_type.id if doc_type else None,
+                    override_tag_ids=tag_ids,
+                )

-                    consume_file.delay(
-                        path=temp_filename,
-                        override_filename=pathvalidate.sanitize_filename(
-                            att.filename,
-                        ),
-                        override_title=title,
-                        override_correspondent_id=correspondent.id
-                        if correspondent
-                        else None,
-                        override_document_type_id=doc_type.id if doc_type else None,
-                        override_tag_ids=tag_ids,
-                    )
+                processed_attachments += 1
+            else:
+                self.log(
+                    "debug",
+                    f"Rule {rule}: "
+                    f"Skipping attachment {att.filename} "
+                    f"since guessed mime type {mime_type} is not supported "
+                    f"by paperless",
+                )

-                    processed_attachments += 1
-                else:
-                    self.log(
-                        "debug",
-                        f"Rule {rule}: "
-                        f"Skipping attachment {att.filename} "
-                        f"since guessed mime type {mime_type} is not supported "
-                        f"by paperless",
-                    )
+    def process_eml(
+        self,
+        message: MailMessage,
+        rule: MailRule,
+        correspondent,
+        tag_ids,
+        doc_type,
+    ):
+        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
+        _, temp_filename = tempfile.mkstemp(
+            prefix="paperless-mail-",
+            dir=settings.SCRATCH_DIR,
+            suffix=".eml",
+        )
+        with open(temp_filename, "wb") as f:
+            # Move "From"-header to beginning of file
+            # TODO: This ugly workaround is needed because the parser is
+            #   chosen only by the mime_type detected via magic
+            #   (see documents/consumer.py "mime_type = magic.from_file")
+            #   Unfortunately magic sometimes fails to detect the mime
+            #   type of .eml files correctly as message/rfc822 and instead
+            #   detects text/plain.
+            #   This also affects direct file consumption of .eml files
+            #   which are not treated with this workaround.
+            from_element = None
+            for i, header in enumerate(message.obj._headers):
+                if header[0] == "From":
+                    from_element = i
+            if from_element:
+                new_headers = [message.obj._headers.pop(from_element)]
+                new_headers += message.obj._headers
+                message.obj._headers = new_headers

-        return processed_attachments
+            f.write(message.obj.as_bytes())
+
+        self.log(
+            "info",
+            f"Rule {rule}: "
+            f"Consuming eml from mail "
+            f"{message.subject} from {message.from_}",
+        )
+
+        consume_file.delay(
+            path=temp_filename,
+            override_filename=pathvalidate.sanitize_filename(
+                message.subject + ".eml",
+            ),
+            override_title=message.subject,
+            override_correspondent_id=correspondent.id if correspondent else None,
+            override_document_type_id=doc_type.id if doc_type else None,
+            override_tag_ids=tag_ids,
+        )
+        processed_elements = 1
+        return processed_elements