# Source: src/paperless_tesseract/tests/test_parser.py (added as a new file).
# The same change also adds two binary fixtures that are not reproduced here:
#   src/paperless_tesseract/tests/samples/simple.pdf
#   src/paperless_tesseract/tests/samples/simple.png
import os
import shutil
import tempfile
import uuid
from typing import ContextManager
from unittest import mock

from django.test import TestCase, override_settings
from pyocr.error import TesseractError

from documents.parsers import ParseError, run_convert
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError

# Log of every FakeTesseract.image_to_string() call as (file name, language).
image_to_string_calls = []


class FakeTesseract:
    """Test double exposing the OCR tool methods the tests rely on."""

    @staticmethod
    def can_detect_orientation():
        """Report that orientation detection is supported."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always fail, so callers have to cope with a TesseractError."""
        raise TesseractError("arbitrary status", "message")

    @staticmethod
    def get_available_languages():
        """Return the fixed set of languages this fake claims to have."""
        return ["eng", "deu"]

    @staticmethod
    def image_to_string(file_handle, lang):
        """Record the call and hand back the raw content of ``file_handle``."""
        call = (file_handle.name, lang)
        image_to_string_calls.append(call)
        return file_handle.read()


class FakePyOcr:
    """Test double for the ``pyocr`` module that only offers FakeTesseract."""

    @staticmethod
    def get_available_tools():
        """Return the list of available OCR tools (just the fake one)."""
        return [FakeTesseract]


def fake_convert(input_file, output_file, **kwargs):
    """Write each line of ``input_file`` to its own numbered output file.

    ``output_file`` is a %-style pattern that receives the zero-based line
    index; every line is stripped of surrounding whitespace before it is
    written. Extra keyword arguments are accepted and ignored.
    """
    with open(input_file) as source:
        pages = list(source)

    for page_number, page_text in enumerate(pages):
        page_path = output_file % page_number
        with open(page_path, "w") as target:
            target.write(page_text.strip())


def fake_unpaper(pnm):
    """Copy ``pnm`` to ``<pnm>.unpaper.pnm`` and return the new path."""
    cleaned = pnm + ".unpaper.pnm"
    shutil.copy(src=pnm, dst=cleaned)
    return cleaned


class FakeImageFile(ContextManager):
    """Context manager that yields the base name of the file it wraps."""

    def __init__(self, fname):
        self.fname = fname

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; exceptions are not suppressed.
        pass

    def __enter__(self):
        return os.path.basename(self.fname)


fake_image = FakeImageFile


@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
@mock.patch("paperless_tesseract.parsers.Image.open", open)
class TestRasterisedDocumentParser(TestCase):
    """Exercise RasterisedDocumentParser.get_text() against faked tooling.

    The class-level patches swap pyocr, run_convert, run_unpaper and
    Image.open for the fakes defined in this module, so a "document" is a
    plain text file in which each line stands for one page.
    """

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        # Empty the shared call log in place so every test starts clean.
        image_to_string_calls.clear()

        settings_override = override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch)
        settings_override.enable()
        # Undo the override after the test so it cannot leak into other tests.
        self.addCleanup(settings_override.disable)

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def get_input_file(self, pages):
        """Create a scratch ".pdf" holding one "line <n>" row per page.

        Returns the path of the created file.
        """
        fd, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
        # Wrap the descriptor returned by mkstemp so it is closed with the file.
        with os.fdopen(fd, "w") as f:
            f.writelines([f"line {p}\n" for p in range(pages)])
        return fname

    @staticmethod
    def _ocr_languages():
        """Return the language of every recorded image_to_string call, in order."""
        return [lang for _, lang in image_to_string_calls]

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_simple_language_match(self):
        parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0")

        self.assertListEqual(self._ocr_languages(), ["eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_2_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1")

        self.assertListEqual(self._ocr_languages(), ["eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_3_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual(self._ocr_languages(), ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
    def test_parse_text_lang_detect_failed(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual(self._ocr_languages(), ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
    def test_parse_text_lang_not_installed(self):
        # The fake tool only offers "eng" and "deu", so Italian is unavailable.
        parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2 line 3")

        self.assertListEqual(self._ocr_languages(), ["eng", "eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_text_lang_mismatch(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        # Four calls for three pages: presumably one pass in the default
        # language for detection, then all pages again in the detected one.
        self.assertListEqual(self._ocr_languages(), ["eng", "deu", "deu", "deu"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_empty_doc(self):
        parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
        with self.assertRaises(ParseError) as raised:
            parser.get_text()
        self.assertEqual("Empty document, nothing to do.", str(raised.exception))


class TestAuxilliaryFunctions(TestCase):
    """Tests for the parser module's helper functions, using the sample files."""

    # Directory that holds the simple.pdf / simple.png fixtures.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        settings_override = override_settings(SCRATCH_DIR=self.scratch)
        settings_override.enable()
        # Undo the override after the test so it cannot leak into other tests.
        self.addCleanup(settings_override.disable)

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def test_get_text_from_pdf(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))

        self.assertEqual(text.strip(), "This is a test document.")

    def test_get_text_from_pdf_error(self):
        # A PNG is not a PDF; the helper is expected to yield no text.
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))

        self.assertEqual(text.strip(), "")

    def test_image_to_string(self):
        text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))

        self.assertEqual(text, "This is a test document.")

    def test_image_to_string_language_unavailable(self):
        with self.assertRaises(OCRError) as raised:
            image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
        self.assertIn("Failed loading language", str(raised.exception))

    @override_settings(OCR_ALWAYS=False)
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
    def test_is_ocred(self, get_greyscale_mock, get_text_from_pdf_mock):
        # Patches are applied bottom-up, so the _get_greyscale mock comes first.
        parser = RasterisedDocumentParser("", uuid.uuid4())
        get_text_from_pdf_mock.return_value = \
            "lots of text lots of text lots of text lots of text lots of text lots of text " \
            "lots of text lots of text lots of text lots of text lots of text lots of text " \
            "lots of text lots of text lots of text lots of text lots of text lots of text "
        parser.get_text()
        self.assertEqual(get_text_from_pdf_mock.call_count, 2)
        self.assertEqual(get_greyscale_mock.call_count, 0)

    def test_thumbnail(self):
        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        # Smoke test: the result is not inspected, the call just must not raise.
        parser.get_thumbnail()

    @mock.patch("paperless_tesseract.parsers.run_convert")
    def test_thumbnail_fallback(self, convert_mock):

        def call_convert(input_file, output_file, **kwargs):
            # Fail for PDF input, delegate to the real run_convert otherwise.
            if ".pdf" in input_file:
                raise ParseError("Does not compute.")
            else:
                run_convert(input_file=input_file, output_file=output_file, **kwargs)

        convert_mock.side_effect = call_convert

        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        # Smoke test: the result is not inspected, the call just must not raise.
        parser.get_thumbnail()