* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError
This commit is contained in:
Daniel Quinn
2016-11-27 15:06:45 +00:00
parent b88e0fd902
commit 18495ce9da
3 changed files with 65 additions and 21 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

@@ -1,7 +1,13 @@
from django.test import TestCase
import os
from unittest import mock, skipIf
import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from ..consumer import strip_excess_whitespace
from ..models import FileInfo
from ..consumer import image_to_string, strip_excess_whitespace
class TestAttributes(TestCase):
@@ -304,6 +310,28 @@ class TestFieldPermutations(TestCase):
template.format(**spec), **spec)
class FakeTesseract(object):
@staticmethod
def can_detect_orientation():
return True
@staticmethod
def detect_orientation(file_handle, lang):
raise OtherTesseractError("arbitrary status", "message")
@staticmethod
def image_to_string(file_handle, lang):
return "This is test text"
class FakePyOcr(object):
@staticmethod
def get_available_tools():
return [FakeTesseract]
class TestOCR(TestCase):
text_cases = [
@@ -317,6 +345,9 @@ class TestOCR(TestCase):
"utf-8 строка с пробелами в конце"
)
]
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
def test_strip_excess_whitespace(self):
for source, result in self.text_cases:
@@ -330,3 +361,18 @@ class TestOCR(TestCase):
actual_result
)
)
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
@mock.patch("documents.consumer.pyocr", FakePyOcr)
def test_image_to_string_with_text_free_page(self):
"""
This test is sort of silly, since it's really just reproducing an odd
exception thrown by pyocr when it encounters a page with no text.
Actually running this test against an installation of Tesseract results
in a segmentation fault rooted somewhere deep inside pyocr where I
don't care to dig. Regardless, if you run the consumer normally,
text-free pages are now handled correctly so long as we work around
this weird exception.
"""
image_to_string(["text.png", "en"])