From d132eba1431fd2c8a9dd57904cb019c26dbcc2e2 Mon Sep 17 00:00:00 2001
From: phail
Date: Sun, 20 Nov 2022 12:48:03 +0100
Subject: [PATCH] optimize regex

Fold the separate tab and space substitutions in strip_text into one
pass. The character class is limited to spaces and tabs on purpose:
"\s" also matches "\n", which would flatten every line break and leave
the following newline-collapsing substitution with nothing to match.
Output is unchanged, so the tests need no update.
---
Note: the post-image blob id on the index line below predates this
revision of the patch; regenerate the patch from a real commit if exact
ids matter (e.g. for a 3-way merge).

 src/paperless_mail/parsers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index c4ecaf861..902619fd7 100644
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -105,9 +105,10 @@ class MailDocumentParser(DocumentParser):
 
     def parse(self, document_path, mime_type, file_name=None):
         def strip_text(text: str):
-            text = re.sub("\t", " ", text)
-            text = re.sub(" +", " ", text)
-            text = re.sub("(\n *)+", "\n", text)
+            # Collapse runs of spaces/tabs only; "\s" would also swallow
+            # newlines and leave the newline pass below with nothing to match.
+            text = re.sub(r"[ \t]+", " ", text)
+            text = re.sub(r"(\n *)+", "\n", text)
             return text.strip()
 
         mail = self.get_parsed(document_path)