mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	removed obsolete tests.
This commit is contained in:
		
										
											Binary file not shown.
										
									
								
							| Before Width: | Height: | Size: 32 KiB | 
| @@ -1,34 +1,9 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..parsers import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise OtherTesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         return "This is test text" | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
| from ..parsers import strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class TestOCR(TestCase): | ||||
| @@ -45,9 +20,6 @@ class TestOCR(TestCase): | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) | ||||
|  | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
| @@ -60,17 +32,3 @@ class TestOCR(TestCase): | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") | ||||
|     @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
|     def test_image_to_string_with_text_free_page(self): | ||||
|         """ | ||||
|         This test is sort of silly, since it's really just reproducing an odd | ||||
|         exception thrown by pyocr when it encounters a page with no text. | ||||
|         Actually running this test against an installation of Tesseract results | ||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I | ||||
|         don't care to dig.  Regardless, if you run the consumer normally, | ||||
|         text-free pages are now handled correctly so long as we work around | ||||
|         this weird exception. | ||||
|         """ | ||||
|         image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) | ||||
|   | ||||
| @@ -6,41 +6,13 @@ from typing import ContextManager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
| from pyocr.error import TesseractError | ||||
|  | ||||
| from documents.parsers import ParseError, run_convert | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf | ||||
|  | ||||
| image_to_string_calls = [] | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise TesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_languages(): | ||||
|         return ['eng', 'deu'] | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         image_to_string_calls.append((file_handle.name, lang)) | ||||
|         return file_handle.read() | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| def fake_convert(input_file, output_file, **kwargs): | ||||
|     with open(input_file) as f: | ||||
|         lines = f.readlines() | ||||
| @@ -50,12 +22,6 @@ def fake_convert(input_file, output_file, **kwargs): | ||||
|             f2.write(line.strip()) | ||||
|  | ||||
|  | ||||
| def fake_unpaper(pnm): | ||||
|     output = pnm + ".unpaper.pnm" | ||||
|     shutil.copy(pnm, output) | ||||
|     return output | ||||
|  | ||||
|  | ||||
| class FakeImageFile(ContextManager): | ||||
|     def __init__(self, fname): | ||||
|         self.fname = fname | ||||
| @@ -67,92 +33,6 @@ class FakeImageFile(ContextManager): | ||||
|         return os.path.basename(self.fname) | ||||
|  | ||||
|  | ||||
| fake_image = FakeImageFile | ||||
|  | ||||
|  | ||||
| @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
| @mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) | ||||
| @mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) | ||||
| @mock.patch("paperless_tesseract.parsers.Image.open", open) | ||||
| class TestRasterisedDocumentParser(TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|  | ||||
|         global image_to_string_calls | ||||
|  | ||||
|         image_to_string_calls = [] | ||||
|  | ||||
|         override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch) | ||||
|  | ||||
|     def get_input_file(self, pages): | ||||
|         _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) | ||||
|         with open(fname, "w") as f: | ||||
|             f.writelines([f"line {p}\n" for p in range(pages)]) | ||||
|         return fname | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_simple_language_match(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_2_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_3_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) | ||||
|     def test_parse_text_lang_detect_failed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") | ||||
|     def test_parse_text_lang_not_installed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2 line 3") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_text_lang_mismatch(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_empty_doc(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) | ||||
|         try: | ||||
|             parser.get_text() | ||||
|         except ParseError as e: | ||||
|             self.assertEqual("Empty document, nothing to do.", str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|  | ||||
| class TestAuxilliaryFunctions(TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
| @@ -173,32 +53,7 @@ class TestAuxilliaryFunctions(TestCase): | ||||
|     def test_get_text_from_pdf_error(self): | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) | ||||
|  | ||||
|         self.assertEqual(text.strip(), "") | ||||
|  | ||||
|     def test_image_to_string(self): | ||||
|         text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) | ||||
|  | ||||
|         self.assertEqual(text, "This is a test document.") | ||||
|  | ||||
|     def test_image_to_string_language_unavailable(self): | ||||
|         try: | ||||
|             image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) | ||||
|         except OCRError as e: | ||||
|             self.assertTrue("Failed loading language" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     @override_settings(OCR_ALWAYS=False) | ||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") | ||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") | ||||
|     def test_is_ocred(self, m2, m): | ||||
|         parser = RasterisedDocumentParser("", uuid.uuid4()) | ||||
|         m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " | ||||
|         parser.get_text() | ||||
|         self.assertEqual(m.call_count, 2) | ||||
|         self.assertEqual(m2.call_count, 0) | ||||
|         self.assertIsNone(text) | ||||
|  | ||||
|     def test_thumbnail(self): | ||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jonas Winkler
					Jonas Winkler