mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Adds a test to cover this edge case
This commit is contained in:
		
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/single-page-mixed.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/single-page-mixed.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -37,6 +37,9 @@ class FakeImageFile(ContextManager): | ||||
|  | ||||
|  | ||||
| class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def assertContainsStrings(self, content, strings): | ||||
|         # Asserts that all strings appear in content, in the given order. | ||||
|         indices = [] | ||||
| @@ -47,14 +50,18 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|                 self.fail(f"'{s}' is not in '{content}'") | ||||
|         self.assertListEqual(indices, sorted(indices)) | ||||
|  | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ("simple    newline\n   testing string", "simple newline\ntesting string"), | ||||
|         ("utf-8   строка с пробелами в конце  ", "utf-8 строка с пробелами в конце"), | ||||
|     ] | ||||
|  | ||||
|     def test_post_process_text(self): | ||||
|         for source, result in self.text_cases: | ||||
|  | ||||
|         text_cases = [ | ||||
|             ("simple     string", "simple string"), | ||||
|             ("simple    newline\n   testing string", "simple newline\ntesting string"), | ||||
|             ( | ||||
|                 "utf-8   строка с пробелами в конце  ", | ||||
|                 "utf-8 строка с пробелами в конце", | ||||
|             ), | ||||
|         ] | ||||
|  | ||||
|         for source, result in text_cases: | ||||
|             actual_result = post_process_text(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
| @@ -66,8 +73,6 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|                 ), | ||||
|             ) | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def test_get_text_from_pdf(self): | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         text = parser.extract_text( | ||||
| @@ -461,6 +466,45 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertIn("[OCR skipped on page(s) 4-6]", sidecar) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     def test_single_page_mixed(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File with some text contained in images and some in text layer | ||||
|             - Text and images are mixed on the same page | ||||
|             - OCR mode set to redo | ||||
|         WHEN: | ||||
|             - Document is parsed | ||||
|         THEN: | ||||
|             - Text from images is extracted | ||||
|             - Full content of the file is parsed (not just the image text) | ||||
|             - An archive file is created with the OCRd text and the original text | ||||
|         """ | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"), | ||||
|             "application/pdf", | ||||
|         ) | ||||
|         self.assertIsNotNone(parser.archive_path) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), | ||||
|             [ | ||||
|                 "this is some normal text, present on page 1 of the document.", | ||||
|                 "this is some text, but in an image, also on page 1.", | ||||
|                 "this is further text on page 1.", | ||||
|             ], | ||||
|         ) | ||||
|  | ||||
|         with open(os.path.join(parser.tempdir, "sidecar.txt")) as f: | ||||
|             sidecar = f.read().lower() | ||||
|  | ||||
|         self.assertIn("this is some text, but in an image, also on page 1.", sidecar) | ||||
|         self.assertNotIn( | ||||
|             "this is some normal text, present on page 1 of the document.", | ||||
|             sidecar, | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_multi_page_mixed_no_archive(self): | ||||
|         """ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Trenton H
					Trenton H