Fix for #154

* Added a test with a faked pyocr and tesseract * Added a catch for pyocr's *other* TesseractError
2026-02-11 23:59:31 -06:00 · 2016-11-27 15:06:45 +00:00
parent b88e0fd902
commit 18495ce9da
3 changed files with 65 additions and 21 deletions
--- a/src/documents/tests/samples/no-text.png
+++ b/src/documents/tests/samples/no-text.png
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1,7 +1,13 @@
-from django.test import TestCase
+import os
+from unittest import mock, skipIf
+
+import pyocr
+from django.test import TestCase
+from pyocr.libtesseract.tesseract_raw import \
+    TesseractError as OtherTesseractError

-from ..consumer import strip_excess_whitespace
 from ..models import FileInfo
+from ..consumer import image_to_string, strip_excess_whitespace


 class TestAttributes(TestCase):
@@ -304,6 +310,28 @@ class TestFieldPermutations(TestCase):
                            template.format(**spec), **spec)


+class FakeTesseract(object):
+
+    @staticmethod
+    def can_detect_orientation():
+        return True
+
+    @staticmethod
+    def detect_orientation(file_handle, lang):
+        raise OtherTesseractError("arbitrary status", "message")
+
+    @staticmethod
+    def image_to_string(file_handle, lang):
+        return "This is test text"
+
+
+class FakePyOcr(object):
+
+    @staticmethod
+    def get_available_tools():
+        return [FakeTesseract]
+
+
 class TestOCR(TestCase):

    text_cases = [
@@ -317,6 +345,9 @@ class TestOCR(TestCase):
            "utf-8 строка с пробелами в конце"
        )
    ]
+    
+    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
+    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
@@ -330,3 +361,18 @@ class TestOCR(TestCase):
                    actual_result
                )
            )
+
+    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
+    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
+    @mock.patch("documents.consumer.pyocr", FakePyOcr)
+    def test_image_to_string_with_text_free_page(self):
+        """
+        This test is sort of silly, since it's really just reproducing an odd
+        exception thrown by pyocr when it encounters a page with no text.
+        Actually running this test against an installation of Tesseract results
+        in a segmentation fault rooted somewhere deep inside pyocr where I
+        don't care to dig.  Regardless, if you run the consumer normally,
+        text-free pages are now handled correctly so long as we work around
+        this weird exception.
+        """
+        image_to_string(["text.png", "en"])