From 567e89d1c7a8fea4f66b717d3ba4cb9680fba0da Mon Sep 17 00:00:00 2001 From: phail Date: Sat, 22 Oct 2022 02:25:23 +0200 Subject: [PATCH] test for broken eml, add test_generate_pdf --- src/paperless_mail/parsers.py | 17 ++--- src/paperless_mail/tests/samples/broken.eml | 1 + src/paperless_mail/tests/samples/html.eml | 20 ++++-- src/paperless_mail/tests/test_eml.py | 74 +++++++++++++-------- 4 files changed, 68 insertions(+), 44 deletions(-) create mode 100644 src/paperless_mail/tests/samples/broken.eml diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index ccfdfe3a3..654372666 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -35,6 +35,11 @@ class MailDocumentParser(DocumentParser): raise ParseError( f"Could not parse {document_path}: {err}", ) + if not self._parsed.from_values: + self._parsed = None + raise ParseError( + f"Could not parse {document_path}: Missing 'from'", + ) return self._parsed @@ -185,7 +190,8 @@ class MailDocumentParser(DocumentParser): return pdf_path - def mail_to_html(self, mail): + @staticmethod + def mail_to_html(mail): data = {} def clean_html(text: str): @@ -230,15 +236,6 @@ class MailDocumentParser(DocumentParser): if data["attachments"] != "": data["attachments_label"] = "Attachments" - if len(mail.attachments) >= 1: - att = [] - for a in mail.attachments: - if a.size >= 1024 * 600: - att.append(f"{a.filename} ({(a.size / 1024 / 1024):.2f} MiB)") - else: - att.append(f"{a.filename} ({(a.size / 1024):.2f} KiB)") - self.text += f"Attachments: {', '.join(att)}\n\n" - data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M")) data["content"] = clean_html(mail.text.strip()) diff --git a/src/paperless_mail/tests/samples/broken.eml b/src/paperless_mail/tests/samples/broken.eml new file mode 100644 index 000000000..3e03caf0a --- /dev/null +++ b/src/paperless_mail/tests/samples/broken.eml @@ -0,0 +1 @@ +This is not a valid eml. diff --git a/src/paperless_mail/tests/samples/html.eml b/src/paperless_mail/tests/samples/html.eml index d6ee7c350..09af2e1e9 100644 --- a/src/paperless_mail/tests/samples/html.eml +++ b/src/paperless_mail/tests/samples/html.eml @@ -47,12 +47,20 @@ Content-Transfer-Encoding: 7bit - -

Some Text

-

-

and an embedded image.
-

- + +

Some Text

+

+ Has to be rewritten to work.. + This image should not be shown. +

+ +

and an embedded image.
+

+

Paragraph unchanged.

+ + --------------fyEsKoz3fdzPxAaSslESHcHz Content-Type: image/png; name="IntM6gnXFm00FEV5.png" diff --git a/src/paperless_mail/tests/test_eml.py b/src/paperless_mail/tests/test_eml.py index da868ef56..d922167f1 100644 --- a/src/paperless_mail/tests/test_eml.py +++ b/src/paperless_mail/tests/test_eml.py @@ -20,7 +20,11 @@ class TestParser(TestCase): # Check if exception is raised when parsing fails. with pytest.raises(ParseError): - parser.get_parsed(os.path.join(os.path.join(self.SAMPLE_FILES, "na"))) + parser.get_parsed(os.path.join(self.SAMPLE_FILES, "na")) + + # Check if exception is raised when the mail is faulty. + with pytest.raises(ParseError): + parser.get_parsed(os.path.join(self.SAMPLE_FILES, "broken.eml")) # Parse Test file and check relevant content parsed1 = parser.get_parsed(os.path.join(self.SAMPLE_FILES, "simple_text.eml")) @@ -53,7 +57,8 @@ class TestParser(TestCase): sha256.update(data) return sha256.hexdigest() - def test_get_thumbnail(self): + @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output + def test_get_thumbnail(self, m): parser = MailDocumentParser(None) thumb = parser.get_thumbnail( os.path.join(self.SAMPLE_FILES, "simple_text.eml"), @@ -221,7 +226,7 @@ class TestParser(TestCase): # Validate parsing returns the expected results parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822") - text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name \n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB)\n\nHTML content: Some Text\nand an embedded image.Attachments: IntM6gnXFm00FEV5.png (6.89 KiB)\n\n" + text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name \n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB)\n\nHTML content: Some Text\nand an embedded image.\nParagraph unchanged." self.assertEqual(text_expected, parser.text) self.assertEqual( datetime.datetime( @@ -235,33 +240,10 @@ class TestParser(TestCase): ), parser.date, ) + + # Just check if file exists, the unittest for generate_pdf() goes deeper. self.assertTrue(os.path.isfile(parser.archive_path)) - converted = os.path.join(parser.tempdir, "converted.webp") - run_convert( - density=300, - scale="500x5000>", - alpha="remove", - strip=True, - trim=False, - auto_orient=True, - input_file=f"{parser.archive_path}", # Do net define an index to convert all pages. - output_file=converted, - logging_group=None, - ) - self.assertTrue(os.path.isfile(converted)) - thumb_hash = self.hashfile(converted) - - # The created pdf is not reproducible. But the converted image should always look the same. - expected_hash = ( - "174f9c81f9aeda63b64375fa2fe675fd542677c1ba7a32fc19e09ffc4d461e12" - ) - self.assertEqual( - thumb_hash, - expected_hash, - "PDF looks different.", - ) - @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output def test_tika_parse(self, m): html = '

Some Text

' @@ -285,6 +267,42 @@ class TestParser(TestCase): parsed = parser.tika_parse(html) self.assertEqual(expected_text, parsed) + @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output + def test_generate_pdf(self, m): + parser = MailDocumentParser(None) + + # Check if exception is raised when the mail can not be parsed. + with pytest.raises(ParseError): + parser.generate_pdf(os.path.join(self.SAMPLE_FILES, "broken.eml")) + + pdf_path = parser.generate_pdf(os.path.join(self.SAMPLE_FILES, "html.eml")) + self.assertTrue(os.path.isfile(pdf_path)) + + converted = os.path.join(parser.tempdir, "test_generate_pdf.webp") + run_convert( + density=300, + scale="500x5000>", + alpha="remove", + strip=True, + trim=False, + auto_orient=True, + input_file=f"{pdf_path}", # Do net define an index to convert all pages. + output_file=converted, + logging_group=None, + ) + self.assertTrue(os.path.isfile(converted)) + thumb_hash = self.hashfile(converted) + + # The created pdf is not reproducible. But the converted image should always look the same. + expected_hash = ( + "23468c4597d63bbefd38825e27c7f05ac666573fc35447d9ddf1784c9c31c6ea" + ) + self.assertEqual( + thumb_hash, + expected_hash, + f"PDF looks different. Check if {converted} looks weird.", + ) + def test_transform_inline_html(self): class MailAttachmentMock: def __init__(self, payload, content_id):