mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	optimize regex
This commit is contained in:
		| @@ -105,9 +105,8 @@ class MailDocumentParser(DocumentParser): | ||||
|  | ||||
|     def parse(self, document_path, mime_type, file_name=None): | ||||
|         def strip_text(text: str): | ||||
|             text = re.sub("\t", " ", text) | ||||
|             text = re.sub(" +", " ", text) | ||||
|             text = re.sub("(\n *)+", "\n", text) | ||||
|             text = re.sub(r"\s+", " ", text) | ||||
|             text = re.sub(r"(\n *)+", "\n", text) | ||||
|             return text.strip() | ||||
|  | ||||
|         mail = self.get_parsed(document_path) | ||||
|   | ||||
| @@ -227,7 +227,7 @@ class TestParser(TestCase): | ||||
|     @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf") | ||||
|     def test_parse_html_eml(self, n, mock_tika_parse: mock.MagicMock): | ||||
|         # Validate parsing returns the expected results | ||||
|         text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return" | ||||
|         text_expected = "Some Text and an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return" | ||||
|         mock_tika_parse.return_value = "tika return" | ||||
|  | ||||
|         self.parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822") | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 phail
					phail