mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Downgrade pdf validation to text only
This commit is contained in:
		| @@ -8,8 +8,8 @@ from urllib.request import urlopen | |||||||
| import pytest | import pytest | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from documents.parsers import ParseError | from documents.parsers import ParseError | ||||||
| from documents.parsers import run_convert |  | ||||||
| from paperless_mail.parsers import MailDocumentParser | from paperless_mail.parsers import MailDocumentParser | ||||||
|  | from pdfminer.high_level import extract_text | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestParser(TestCase): | class TestParser(TestCase): | ||||||
| @@ -311,30 +311,9 @@ class TestParser(TestCase): | |||||||
|         pdf_path = parser.generate_pdf(os.path.join(self.SAMPLE_FILES, "html.eml")) |         pdf_path = parser.generate_pdf(os.path.join(self.SAMPLE_FILES, "html.eml")) | ||||||
|         self.assertTrue(os.path.isfile(pdf_path)) |         self.assertTrue(os.path.isfile(pdf_path)) | ||||||
|  |  | ||||||
|         converted = os.path.join(parser.tempdir, "test_generate_pdf.webp") |         extracted = extract_text(pdf_path) | ||||||
|         run_convert( |         expected = "From Name <someone@example.de>\n\n2022-10-15 09:23\n\nSubject HTML Message\n\nTo someone@example.de\n\nAttachments IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (0.59 MiB)\n\nSome Text \n\nand an embedded image.\n\n\x0cSome Text\n\n  This image should not be shown.\n\nand an embedded image.\n\nParagraph unchanged.\n\n\x0c" | ||||||
|             density=300, |         self.assertEqual(expected, extracted) | ||||||
|             scale="500x5000>", |  | ||||||
|             alpha="remove", |  | ||||||
|             strip=True, |  | ||||||
|             trim=False, |  | ||||||
|             auto_orient=True, |  | ||||||
|             input_file=f"{pdf_path}",  # Do net define an index to convert all pages. |  | ||||||
|             output_file=converted, |  | ||||||
|             logging_group=None, |  | ||||||
|         ) |  | ||||||
|         self.assertTrue(os.path.isfile(converted)) |  | ||||||
|         thumb_hash = self.hashfile(converted) |  | ||||||
|  |  | ||||||
|         # The created pdf is not reproducible. But the converted image should always look the same. |  | ||||||
|         expected_hash = ( |  | ||||||
|             "4f338619575a21c5227de003a14216b07ba00a372ca5f132745e974a1f990e09" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual( |  | ||||||
|             thumb_hash, |  | ||||||
|             expected_hash, |  | ||||||
|             f"PDF looks different. Check if {converted} looks weird.", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def test_mail_to_html(self): |     def test_mail_to_html(self): | ||||||
|         parser = MailDocumentParser(None) |         parser = MailDocumentParser(None) | ||||||
| @@ -357,30 +336,9 @@ class TestParser(TestCase): | |||||||
|             file.write(parser.generate_pdf_from_mail(mail)) |             file.write(parser.generate_pdf_from_mail(mail)) | ||||||
|             file.close() |             file.close() | ||||||
|  |  | ||||||
|         converted = os.path.join(parser.tempdir, "test_generate_pdf_from_mail.webp") |         extracted = extract_text(pdf_path) | ||||||
|         run_convert( |         expected = "From Name <someone@example.de>\n\n2022-10-15 09:23\n\nSubject HTML Message\n\nTo someone@example.de\n\nAttachments IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (0.59 MiB)\n\nSome Text \n\nand an embedded image.\n\n\x0c" | ||||||
|             density=300, |         self.assertEqual(expected, extracted) | ||||||
|             scale="500x5000>", |  | ||||||
|             alpha="remove", |  | ||||||
|             strip=True, |  | ||||||
|             trim=False, |  | ||||||
|             auto_orient=True, |  | ||||||
|             input_file=f"{pdf_path}",  # Do net define an index to convert all pages. |  | ||||||
|             output_file=converted, |  | ||||||
|             logging_group=None, |  | ||||||
|         ) |  | ||||||
|         self.assertTrue(os.path.isfile(converted)) |  | ||||||
|         thumb_hash = self.hashfile(converted) |  | ||||||
|  |  | ||||||
|         # The created pdf is not reproducible. But the converted image should always look the same. |  | ||||||
|         expected_hash = ( |  | ||||||
|             "8734a3f0a567979343824e468cd737bf29c02086bbfd8773e94feb986968ad32" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual( |  | ||||||
|             thumb_hash, |  | ||||||
|             expected_hash, |  | ||||||
|             f"PDF looks different. Check if {converted} looks weird.", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def test_transform_inline_html(self): |     def test_transform_inline_html(self): | ||||||
|         class MailAttachmentMock: |         class MailAttachmentMock: | ||||||
| @@ -432,31 +390,9 @@ class TestParser(TestCase): | |||||||
|             file.write(result) |             file.write(result) | ||||||
|             file.close() |             file.close() | ||||||
|  |  | ||||||
|         converted = os.path.join(parser.tempdir, "test_generate_pdf_from_html.webp") |         extracted = extract_text(pdf_path) | ||||||
|         run_convert( |         expected = "Some Text\n\n  This image should not be shown.\n\nand an embedded image.\n\nParagraph unchanged.\n\n\x0c" | ||||||
|             density=300, |         self.assertEqual(expected, extracted) | ||||||
|             scale="500x5000>", |  | ||||||
|             alpha="remove", |  | ||||||
|             strip=True, |  | ||||||
|             trim=False, |  | ||||||
|             auto_orient=True, |  | ||||||
|             input_file=f"{pdf_path}",  # Do net define an index to convert all pages. |  | ||||||
|             output_file=converted, |  | ||||||
|             logging_group=None, |  | ||||||
|         ) |  | ||||||
|         self.assertTrue(os.path.isfile(converted)) |  | ||||||
|         thumb_hash = self.hashfile(converted) |  | ||||||
|  |  | ||||||
|         # The created pdf is not reproducible. But the converted image should always look the same. |  | ||||||
|         expected_hash = ( |  | ||||||
|             "267d61f0ab8f128a037002a424b2cb4bfe18a81e17f0b70f15d241688ed47d1a" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual( |  | ||||||
|             thumb_hash, |  | ||||||
|             expected_hash, |  | ||||||
|             f"PDF looks different. Check if {converted} looks weird. " |  | ||||||
|             f"If Rick Astley is shown, Gotenberg loads from web which is bad for Mail content.", |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     def test_is_online_image_still_available(self): |     def test_is_online_image_still_available(self): | ||||||
|         """ |         """ | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 phail
					phail