diff --git a/src/paperless_mail/tests/samples/html.eml.pdf b/src/paperless_mail/tests/samples/html.eml.pdf new file mode 100644 index 000000000..058988f66 Binary files /dev/null and b/src/paperless_mail/tests/samples/html.eml.pdf differ diff --git a/src/paperless_mail/tests/samples/html.eml.pdf.webp b/src/paperless_mail/tests/samples/html.eml.pdf.webp new file mode 100644 index 000000000..b4481efd9 Binary files /dev/null and b/src/paperless_mail/tests/samples/html.eml.pdf.webp differ diff --git a/src/paperless_mail/tests/samples/sample.html.pdf b/src/paperless_mail/tests/samples/sample.html.pdf new file mode 100644 index 000000000..24c2dbf87 Binary files /dev/null and b/src/paperless_mail/tests/samples/sample.html.pdf differ diff --git a/src/paperless_mail/tests/test_parsers.py b/src/paperless_mail/tests/test_parsers.py index 2bc0fbdad..3da54e364 100644 --- a/src/paperless_mail/tests/test_parsers.py +++ b/src/paperless_mail/tests/test_parsers.py @@ -6,7 +6,6 @@ import pytest from django.test import TestCase from documents.parsers import ParseError from paperless_mail.parsers import MailDocumentParser -from pdfminer.high_level import extract_text class TestParser(TestCase): @@ -217,13 +216,16 @@ class TestParser(TestCase): "message/rfc822", ) + @mock.patch("paperless_mail.parsers.MailDocumentParser.tika_parse") @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output - def test_parse_html_eml(self, m, n): + def test_parse_html_eml(self, m, n, mock_tika_parse: mock.MagicMock): # Validate parsing returns the expected results + text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name \n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (0.59 MiB)\n\nHTML content: tika return" + mock_tika_parse.return_value = "tika return" + self.parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822") - text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name \n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (0.59 MiB)\n\nHTML content: Some Text\nand an embedded image.\nParagraph unchanged." self.assertEqual(text_expected, self.parser.text) self.assertEqual( datetime.datetime( @@ -265,27 +267,30 @@ class TestParser(TestCase): # Just check if file exists, the unittest for generate_pdf() goes deeper. self.assertTrue(os.path.isfile(self.parser.archive_path)) + @mock.patch("paperless_mail.parsers.parser.from_buffer") @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output - def test_tika_parse(self, m): + def test_tika_parse(self, m, mock_from_buffer: mock.MagicMock): html = '

Some Text

' expected_text = "Some Text" - - tika_server_original = self.parser.tika_server - - # Check if exception is raised when Tika cannot be reached. - with pytest.raises(ParseError): - self.parser.tika_server = "" - self.parser.tika_parse(html) + mock_from_buffer.return_value = {"content": expected_text} # Check unsuccessful parsing - self.parser.tika_server = tika_server_original - + mock_from_buffer.return_value = {"content": None} parsed = self.parser.tika_parse(None) self.assertEqual("", parsed) # Check successful parsing + mock_from_buffer.return_value = {"content": expected_text} parsed = self.parser.tika_parse(html) self.assertEqual(expected_text, parsed.strip()) + mock_from_buffer.assert_called_with(html, self.parser.tika_server) + + # Check ParseError + def my_side_effect(): + raise Exception("Test") + + mock_from_buffer.side_effect = my_side_effect + self.assertRaises(ParseError, self.parser.tika_parse, html) @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail") @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html") @@ -373,25 +378,31 @@ class TestParser(TestCase): retval = self.parser.generate_pdf_from_mail(mail) self.assertEqual(b"Content", retval) - mock_generate_pdf_from_mail.assert_called_once_with( - self.parser.get_parsed(None), - ) - mock_generate_pdf_from_html.assert_called_once_with( - self.parser.get_parsed(None).html, - self.parser.get_parsed(None).attachments, - ) + mock_mail_to_html.assert_called_once_with(mail) self.assertEqual( - self.parser.gotenberg_server + "/forms/pdfengines/merge", + self.parser.gotenberg_server + "/forms/chromium/convert/html", mock_post.call_args.args[0], ) self.assertEqual({}, mock_post.call_args.kwargs["headers"]) self.assertEqual( - b"Mail Return", - mock_post.call_args.kwargs["files"]["1_mail.pdf"][1].read(), + { + "marginTop": "0.1", + "marginBottom": "0.1", + "marginLeft": "0.1", + "marginRight": "0.1", + "paperWidth": "8.27", + "paperHeight": "11.7", + "scale": "1.0", + }, + mock_post.call_args.kwargs["data"], ) self.assertEqual( - b"HTML Return", - mock_post.call_args.kwargs["files"]["2_html.pdf"][1].read(), + "Testresponse", + mock_post.call_args.kwargs["files"]["html"][1], + ) + self.assertEqual( + "output.css", + mock_post.call_args.kwargs["files"]["css"][0], ) mock_response.raise_for_status.assert_called_once() diff --git a/src/paperless_mail/tests/test_parsers_live.py b/src/paperless_mail/tests/test_parsers_live.py index a0fa1f54d..653388300 100644 --- a/src/paperless_mail/tests/test_parsers_live.py +++ b/src/paperless_mail/tests/test_parsers_live.py @@ -59,6 +59,10 @@ class TestParserLive(TestCase): f"Created Thumbnail {thumb} differs from expected file {expected}", ) + @pytest.mark.skipif( + "TIKA_LIVE" not in os.environ, + reason="No tika server", + ) @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output def test_tika_parse(self, m): html = '

Some Text

' @@ -108,6 +112,28 @@ class TestParserLive(TestCase): ) self.assertEqual(expected, extracted) + @pytest.mark.skipif( + "GOTENBERG_LIVE" not in os.environ, + reason="No gotenberg server", + ) + @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output + def test_generate_pdf_from_mail_no_convert(self, m): + mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml")) + + pdf_path = os.path.join(self.parser.tempdir, "html.eml.pdf") + + with open(pdf_path, "wb") as file: + file.write(self.parser.generate_pdf_from_mail(mail)) + file.close() + + extracted = extract_text(pdf_path) + expected = extract_text(os.path.join(self.SAMPLE_FILES, "html.eml.pdf")) + self.assertEqual(expected, extracted) + + @pytest.mark.skipif( + "GOTENBERG_LIVE" not in os.environ, + reason="No gotenberg server", + ) # Only run if convert is available @pytest.mark.skipif( "PAPERLESS_TEST_SKIP_CONVERT" in os.environ, @@ -115,10 +141,9 @@ class TestParserLive(TestCase): ) @mock.patch("documents.loggers.LoggingMixin.log") # Disable log output def test_generate_pdf_from_mail(self, m): - # TODO mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml")) - pdf_path = os.path.join(self.parser.tempdir, "test_generate_pdf_from_mail.pdf") + pdf_path = os.path.join(self.parser.tempdir, "html.eml.pdf") with open(pdf_path, "wb") as file: file.write(self.parser.generate_pdf_from_mail(mail)) @@ -126,7 +151,7 @@ class TestParserLive(TestCase): converted = os.path.join( self.parser.tempdir, - "test_generate_pdf_from_mail.webp", + "html.eml.pdf.webp", ) run_convert( density=300, @@ -143,8 +168,8 @@ class TestParserLive(TestCase): thumb_hash = self.hashfile(converted) # The created pdf is not reproducible. But the converted image should always look the same. - expected_hash = ( - "8734a3f0a567979343824e468cd737bf29c02086bbfd8773e94feb986968ad32" + expected_hash = self.hashfile( + os.path.join(self.SAMPLE_FILES, "html.eml.pdf.webp"), ) self.assertEqual( thumb_hash, @@ -174,14 +199,14 @@ class TestParserLive(TestCase): ] result = self.parser.generate_pdf_from_html(html, attachments) - pdf_path = os.path.join(self.parser.tempdir, "test_generate_pdf_from_html.pdf") + pdf_path = os.path.join(self.parser.tempdir, "sample.html.pdf") with open(pdf_path, "wb") as file: file.write(result) file.close() extracted = extract_text(pdf_path) - expected = "Some Text\n\n This image should not be shown.\n\nand an embedded image.\n\nParagraph unchanged.\n\n\x0c" + expected = extract_text(os.path.join(self.SAMPLE_FILES, "sample.html.pdf")) self.assertEqual(expected, extracted) @pytest.mark.skipif(