optimize regex

This commit is contained in:
phail 2022-11-20 12:48:03 +01:00
parent 073c3c8fed
commit d132eba143
2 changed files with 3 additions and 4 deletions
src/paperless_mail

@@ -105,9 +105,8 @@ class MailDocumentParser(DocumentParser):
def parse(self, document_path, mime_type, file_name=None):
def strip_text(text: str):
-text = re.sub("\t", " ", text)
-text = re.sub(" +", " ", text)
-text = re.sub("(\n *)+", "\n", text)
+text = re.sub(r"\s+", " ", text)
+text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
mail = self.get_parsed(document_path)

@@ -227,7 +227,7 @@ class TestParser(TestCase):
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_html_eml(self, n, mock_tika_parse: mock.MagicMock):
# Validate parsing returns the expected results
-text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return"
+text_expected = "Some Text and an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return"
mock_tika_parse.return_value = "tika return"
self.parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822")