mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Adding more test coverage, in particular around Tika and its parser
This commit is contained in:
		| @@ -161,7 +161,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         except Exception: | ||||
|             # TODO catch all for various issues with PDFminer.six. | ||||
|             #  If PDFminer fails, fall back to OCR. | ||||
|             #  If pdftotext fails, fall back to OCR. | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 "Error while getting text from PDF document with " "pdfminer.six", | ||||
|   | ||||
| @@ -364,7 +364,7 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|         self.assertNotIn("page 3", parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||
|     def test_multi_page_analog_pages_force(self): | ||||
| @@ -386,8 +386,8 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) | ||||
|         self.assertFalse("page 2" in parser.get_text().lower()) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|         self.assertNotIn("page 2", parser.get_text().lower()) | ||||
|         self.assertNotIn("page 3", parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_withtext(self): | ||||
| @@ -660,6 +660,15 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertNotIn("deskew", params) | ||||
|  | ||||
|         with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertIn("max_image_mpixels", params) | ||||
|             self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4) | ||||
|  | ||||
|         with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertNotIn("max_image_mpixels", params) | ||||
|  | ||||
|     def test_rtl_language_detection(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H