mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
add unittest for transform_inline_html
This commit is contained in:
parent
fda844f64c
commit
e384bd78c5
@ -296,22 +296,26 @@ class MailDocumentParser(DocumentParser):
|
||||
|
||||
return response.content
|
||||
|
||||
def transform_inline_html(self, orig_html, attachments):
|
||||
@staticmethod
|
||||
def transform_inline_html(html, attachments):
|
||||
def clean_html_script(text: str):
|
||||
text = text.replace("<script", "<div hidden ")
|
||||
text = text.replace("</script", "</div")
|
||||
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
|
||||
text = compiled_open.sub("<div hidden ", text)
|
||||
|
||||
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
|
||||
text = compiled_close.sub("</div", text)
|
||||
return text
|
||||
|
||||
orig_html = clean_html_script(orig_html)
|
||||
html_clean = clean_html_script(html)
|
||||
files = []
|
||||
|
||||
for a in attachments:
|
||||
name_cid = "cid:" + a.content_id
|
||||
name_clean = "".join(e for e in name_cid if e.isalnum())
|
||||
files.append((name_clean, BytesIO(a.payload)))
|
||||
orig_html = orig_html.replace(name_cid, name_clean)
|
||||
html_clean = html_clean.replace(name_cid, name_clean)
|
||||
|
||||
files.append(("index.html", StringIO(orig_html)))
|
||||
files.append(("index.html", StringIO(html_clean)))
|
||||
|
||||
return files
|
||||
|
||||
|
15
src/paperless_mail/tests/samples/sample.html
Normal file
15
src/paperless_mail/tests/samples/sample.html
Normal file
@ -0,0 +1,15 @@
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
|
||||
</head>
|
||||
<body>
|
||||
<p>Some Text</p>
|
||||
<p><img src="cid:part1.pNdUSz0s.D3NqVtPg@example.de" alt=""></p>
|
||||
<p>and an embedded image.<br>
|
||||
</p>
|
||||
<p id="changeme">Paragraph unchanged.</p>
|
||||
<scRipt>
|
||||
document.getElementById("changeme").innerHTML = "Paragraph changed via Java Script.";
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
BIN
src/paperless_mail/tests/samples/sample.png
Normal file
BIN
src/paperless_mail/tests/samples/sample.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 6.9 KiB |
@ -282,3 +282,27 @@ class TestParser(TestCase):
|
||||
# Check successful parsing
|
||||
parsed = parser.tika_parse(html)
|
||||
self.assertEqual(expected_text, parsed)
|
||||
|
||||
def test_transform_inline_html(self):
    """
    Check that transform_inline_html neutralises script tags (regardless of
    letter case) and rewrites the cid reference of an inline image to the
    name of the returned attachment file.
    """

    class MailAttachmentMock:
        """Attachment stand-in exposing only ``payload`` and ``content_id``."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    parser = MailDocumentParser(None)

    html_path = os.path.join(self.SAMPLE_FILES, "sample.html")
    png_path = os.path.join(self.SAMPLE_FILES, "sample.png")

    # The fixture declares charset UTF-8; read it that way on every platform.
    with open(html_path, encoding="utf-8") as html_file:
        html = html_file.read()
    with open(png_path, "rb") as png_file:
        png = png_file.read()

    attachments = [
        MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de"),
    ]
    result = parser.transform_inline_html(html, attachments)

    # The transformed HTML is always the last entry of the returned list.
    index_name, index_file = result[-1]
    resulting_html = index_file.read()

    self.assertEqual(index_name, "index.html")
    # The cleaned attachment name must be what the HTML now references.
    self.assertIn(result[0][0], resulting_html)
    self.assertNotIn("<script", resulting_html.lower())
|
||||
|
Loading…
x
Reference in New Issue
Block a user