diff --git a/src/paperless_tesseract/tests/samples/no-text.png b/src/paperless_tesseract/tests/samples/no-text.png deleted file mode 100644 index e78b22bfb..000000000 Binary files a/src/paperless_tesseract/tests/samples/no-text.png and /dev/null differ diff --git a/src/paperless_tesseract/tests/test_ocr.py b/src/paperless_tesseract/tests/test_ocr.py index e0d5726ba..7124fbed6 100644 --- a/src/paperless_tesseract/tests/test_ocr.py +++ b/src/paperless_tesseract/tests/test_ocr.py @@ -1,34 +1,9 @@ import os from unittest import mock, skipIf -import pyocr from django.test import TestCase -from pyocr.libtesseract.tesseract_raw import \ - TesseractError as OtherTesseractError -from ..parsers import image_to_string, strip_excess_whitespace - - -class FakeTesseract(object): - - @staticmethod - def can_detect_orientation(): - return True - - @staticmethod - def detect_orientation(file_handle, lang): - raise OtherTesseractError("arbitrary status", "message") - - @staticmethod - def image_to_string(file_handle, lang): - return "This is test text" - - -class FakePyOcr(object): - - @staticmethod - def get_available_tools(): - return [FakeTesseract] +from ..parsers import strip_excess_whitespace class TestOCR(TestCase): @@ -45,9 +20,6 @@ class TestOCR(TestCase): ) ] - SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") - TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) - def test_strip_excess_whitespace(self): for source, result in self.text_cases: actual_result = strip_excess_whitespace(source) @@ -60,17 +32,3 @@ class TestOCR(TestCase): actual_result ) ) - - @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") - @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) - def test_image_to_string_with_text_free_page(self): - """ - This test is sort of silly, since it's really just reproducing an odd - exception thrown by pyocr when it encounters a page with no text. - Actually running this test against an installation of Tesseract results - in a segmentation fault rooted somewhere deep inside pyocr where I - don't care to dig. Regardless, if you run the consumer normally, - text-free pages are now handled correctly so long as we work around - this weird exception. - """ - image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 6d4323fc2..bc37b0b84 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -6,41 +6,13 @@ from typing import ContextManager from unittest import mock from django.test import TestCase, override_settings -from pyocr.error import TesseractError from documents.parsers import ParseError, run_convert -from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError +from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf image_to_string_calls = [] -class FakeTesseract(object): - - @staticmethod - def can_detect_orientation(): - return True - - @staticmethod - def detect_orientation(file_handle, lang): - raise TesseractError("arbitrary status", "message") - - @staticmethod - def get_available_languages(): - return ['eng', 'deu'] - - @staticmethod - def image_to_string(file_handle, lang): - image_to_string_calls.append((file_handle.name, lang)) - return file_handle.read() - - -class FakePyOcr(object): - - @staticmethod - def get_available_tools(): - return [FakeTesseract] - - def fake_convert(input_file, output_file, **kwargs): with open(input_file) as f: lines = f.readlines() @@ -50,12 +22,6 @@ def fake_convert(input_file, output_file, **kwargs): f2.write(line.strip()) -def fake_unpaper(pnm): - output = pnm + ".unpaper.pnm" - shutil.copy(pnm, output) - return output - - class FakeImageFile(ContextManager): def __init__(self, fname): self.fname = fname @@ -67,92 +33,6 @@ class FakeImageFile(ContextManager): return os.path.basename(self.fname) -fake_image = FakeImageFile - - -@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) -@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) -@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) -@mock.patch("paperless_tesseract.parsers.Image.open", open) -class TestRasterisedDocumentParser(TestCase): - - def setUp(self): - self.scratch = tempfile.mkdtemp() - - global image_to_string_calls - - image_to_string_calls = [] - - override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() - - def tearDown(self): - shutil.rmtree(self.scratch) - - def get_input_file(self, pages): - _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) - with open(fname, "w") as f: - f.writelines([f"line {p}\n" for p in range(pages)]) - return fname - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_simple_language_match(self): - parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_2_pages(self): - parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_3_pages(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) - def test_parse_text_lang_detect_failed(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") - def test_parse_text_lang_not_installed(self): - parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2 line 3") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") - def test_parse_text_lang_mismatch(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") - def test_parse_empty_doc(self): - parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) - try: - parser.get_text() - except ParseError as e: - self.assertEqual("Empty document, nothing to do.", str(e)) - else: - self.fail("Should raise exception") - - class TestAuxilliaryFunctions(TestCase): def setUp(self): @@ -173,32 +53,7 @@ class TestAuxilliaryFunctions(TestCase): def test_get_text_from_pdf_error(self): text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) - self.assertEqual(text.strip(), "") - - def test_image_to_string(self): - text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) - - self.assertEqual(text, "This is a test document.") - - def test_image_to_string_language_unavailable(self): - try: - image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) - except OCRError as e: - self.assertTrue("Failed loading language" in str(e)) - else: - self.fail("Should raise exception") - - @override_settings(OCR_ALWAYS=False) - @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") - @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") - def test_is_ocred(self, m2, m): - parser = RasterisedDocumentParser("", uuid.uuid4()) - m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ - "lots of text lots of text lots of text lots of text lots of text lots of text " \ - "lots of text lots of text lots of text lots of text lots of text lots of text " - parser.get_text() - self.assertEqual(m.call_count, 2) - self.assertEqual(m2.call_count, 0) + self.assertIsNone(text) def test_thumbnail(self): parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())