mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	testing the tesseract parser
This commit is contained in:
		
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 7.7 KiB | 
							
								
								
									
										221
									
								
								src/paperless_tesseract/tests/test_parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										221
									
								
								src/paperless_tesseract/tests/test_parser.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,221 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| import uuid | ||||
| from typing import ContextManager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
| from pyocr.error import TesseractError | ||||
|  | ||||
| from documents.parsers import ParseError, run_convert | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | ||||
|  | ||||
# Log of every (file name, language) pair passed to
# FakeTesseract.image_to_string, so tests can check which OCR languages
# were requested and in which order.
image_to_string_calls = []


class FakeTesseract:
    """Fake OCR tool handed out by FakePyOcr.get_available_tools().

    All methods are static, so the class object itself is used as the tool.
    """

    @staticmethod
    def can_detect_orientation():
        """Report that orientation detection is supported."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Fail unconditionally with a TesseractError."""
        raise TesseractError("arbitrary status", "message")

    @staticmethod
    def get_available_languages():
        """Return the languages this fake claims to have installed."""
        return ["eng", "deu"]

    @staticmethod
    def image_to_string(file_handle, lang):
        """Record the call, then return the handle's content as the OCR text."""
        call = (file_handle.name, lang)
        image_to_string_calls.append(call)
        return file_handle.read()
|  | ||||
|  | ||||
class FakePyOcr:
    """Replacement for the ``pyocr`` module that only offers tool discovery."""

    @staticmethod
    def get_available_tools():
        """Return the available OCR tools: a single entry, FakeTesseract."""
        tools = [FakeTesseract]
        return tools
|  | ||||
|  | ||||
def fake_convert(input_file, output_file, **kwargs):
    """Split a text file into one output file per line.

    ``output_file`` is a %-style pattern that is formatted with the
    zero-based line index to obtain each target path. Every target receives
    its line with surrounding whitespace stripped. Extra keyword arguments
    are accepted and ignored.
    """
    with open(input_file) as source:
        pages = [raw.strip() for raw in source]

    for page_number, page_text in enumerate(pages):
        with open(output_file % page_number, "w") as target:
            target.write(page_text)
|  | ||||
|  | ||||
def fake_unpaper(pnm):
    """Copy ``pnm`` to ``pnm + ".unpaper.pnm"`` and return the new path.

    The content is copied unchanged; no clean-up of the image is performed.
    """
    cleaned_path = pnm + ".unpaper.pnm"
    shutil.copy(src=pnm, dst=cleaned_path)
    return cleaned_path
|  | ||||
|  | ||||
class FakeImageFile(ContextManager):
    """Context manager that yields a file's base name instead of an image."""

    def __init__(self, fname):
        # Path this fake image was "opened" from.
        self.fname = fname

    def __enter__(self):
        """Return only the final path component of ``fname``."""
        _, base_name = os.path.split(self.fname)
        return base_name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Release nothing and let any exception propagate."""
        return None


# Function-style alias for the class above.
fake_image = FakeImageFile
|  | ||||
|  | ||||
@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
@mock.patch("paperless_tesseract.parsers.Image.open", open)
class TestRasterisedDocumentParser(TestCase):
    """Tests for RasterisedDocumentParser.get_text() with all tools faked.

    pyocr, run_convert, run_unpaper and Image.open are patched in the parsers
    module, so a "document" is a plain text file with one line per page and
    the "OCR result" of a page is simply that line.
    """

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        # Empty the shared call log in place so each test starts from scratch.
        image_to_string_calls.clear()

        overrides = override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch)
        overrides.enable()
        # enable() is not undone automatically; without this cleanup the
        # overridden settings would leak into every test that runs afterwards.
        self.addCleanup(overrides.disable)

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def get_input_file(self, pages):
        """Create a fake document inside the scratch dir and return its path.

        The file contains ``pages`` lines of the form ``line <n>``.
        """
        fd, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
        # mkstemp hands back an open descriptor; wrap it so it gets closed
        # instead of opening the path a second time and leaking the first one.
        with os.fdopen(fd, "w") as f:
            f.writelines(f"line {p}\n" for p in range(pages))
        return fname

    def assert_ocr_languages(self, expected):
        """Assert the languages passed to the fake OCR tool, in call order."""
        self.assertListEqual([lang for _, lang in image_to_string_calls], expected)

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_simple_language_match(self):
        parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0")

        self.assert_ocr_languages(["eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_2_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1")

        self.assert_ocr_languages(["eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_3_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assert_ocr_languages(["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
    def test_parse_text_lang_detect_failed(self):
        # Language detection yields nothing: every page is read as "eng".
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assert_ocr_languages(["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
    def test_parse_text_lang_not_installed(self):
        # "it" is detected, but the fake tool only offers "eng" and "deu".
        parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2 line 3")

        self.assert_ocr_languages(["eng", "eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_text_lang_mismatch(self):
        # Detected language differs from OCR_LANGUAGE: one call with "eng"
        # is expected, followed by one "deu" call per page.
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assert_ocr_languages(["eng", "deu", "deu", "deu"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_empty_doc(self):
        parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
        with self.assertRaises(ParseError) as raised:
            parser.get_text()
        self.assertEqual("Empty document, nothing to do.", str(raised.exception))
|  | ||||
|  | ||||
class TestAuxilliaryFunctions(TestCase):
    """Tests for the module-level helpers of paperless_tesseract.parsers.

    Unlike TestRasterisedDocumentParser, most of these run against the sample
    files in the ``samples`` directory next to this module without faking the
    OCR tool.
    """

    # Directory holding simple.pdf and simple.png.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        overrides = override_settings(SCRATCH_DIR=self.scratch)
        overrides.enable()
        # enable() is not undone automatically; without this cleanup the
        # overridden setting would leak into every test that runs afterwards.
        self.addCleanup(overrides.disable)

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def test_get_text_from_pdf(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))

        self.assertEqual(text.strip(), "This is a test document.")

    def test_get_text_from_pdf_error(self):
        # A PNG is not a PDF: the helper is expected to return no text
        # rather than raise.
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))

        self.assertEqual(text.strip(), "")

    def test_image_to_string(self):
        text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))

        self.assertEqual(text, "This is a test document.")

    def test_image_to_string_language_unavailable(self):
        with self.assertRaises(OCRError) as raised:
            image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
        self.assertIn("Failed loading language", str(raised.exception))

    @override_settings(OCR_ALWAYS=False)
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
    def test_is_ocred(self, get_greyscale_mock, get_text_from_pdf_mock):
        # Patch decorators apply bottom-up, so the _get_greyscale mock is
        # passed first and the get_text_from_pdf mock second.
        parser = RasterisedDocumentParser("", uuid.uuid4())
        get_text_from_pdf_mock.return_value = "lots of text " * 18
        parser.get_text()
        # The embedded text is used and no image conversion is attempted.
        self.assertEqual(get_text_from_pdf_mock.call_count, 2)
        self.assertEqual(get_greyscale_mock.call_count, 0)

    def test_thumbnail(self):
        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        # Smoke test only: passes as long as get_thumbnail() does not raise.

    @mock.patch("paperless_tesseract.parsers.run_convert")
    def test_thumbnail_fallback(self, m):

        def call_convert(input_file, output_file, **kwargs):
            # Fail for PDF input and delegate everything else to the real
            # run_convert imported at the top of this module.
            if ".pdf" in input_file:
                raise ParseError("Does not compute.")
            run_convert(input_file=input_file, output_file=output_file, **kwargs)

        m.side_effect = call_convert

        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        # Smoke test only: passes as long as get_thumbnail() does not raise.
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler