mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
optimize regex
This commit is contained in:
parent
073c3c8fed
commit
d132eba143
src/paperless_mail
@ -105,9 +105,8 @@ class MailDocumentParser(DocumentParser):
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
def strip_text(text: str):
|
||||
text = re.sub("\t", " ", text)
|
||||
text = re.sub(" +", " ", text)
|
||||
text = re.sub("(\n *)+", "\n", text)
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
text = re.sub(r"(\n *)+", "\n", text)
|
||||
return text.strip()
|
||||
|
||||
mail = self.get_parsed(document_path)
|
||||
|
@ -227,7 +227,7 @@ class TestParser(TestCase):
|
||||
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
|
||||
def test_parse_html_eml(self, n, mock_tika_parse: mock.MagicMock):
|
||||
# Validate parsing returns the expected results
|
||||
text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return"
|
||||
text_expected = "Some Text and an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return"
|
||||
mock_tika_parse.return_value = "tika return"
|
||||
|
||||
self.parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822")
|
||||
|
Loading…
x
Reference in New Issue
Block a user