Merge pull request #165 from danielquinn/fix/154

Fix for #154
This commit is contained in:
Daniel Quinn 2016-11-27 15:12:32 +00:00 committed by GitHub
commit 50896f48db
3 changed files with 65 additions and 21 deletions

View File

@ -1,33 +1,31 @@
import datetime
import hashlib
import logging
import tempfile
import uuid
from multiprocessing.pool import Pool
import itertools
import langdetect
import os
import re
import uuid
import shutil
import hashlib
import logging
import datetime
import tempfile
import itertools
import subprocess
from multiprocessing.pool import Pool
import pyocr
import shutil
import langdetect
from PIL import Image
from django.conf import settings
from django.utils import timezone
from pyocr.tesseract import TesseractError
from paperless.db import GnuPG
from pyocr.tesseract import TesseractError
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from .models import Tag, Document, FileInfo
from .languages import ISO639
from .signals import (
document_consumption_started, document_consumption_finished)
document_consumption_started,
document_consumption_finished
)
from .languages import ISO639
class OCRError(Exception):
@ -381,7 +379,7 @@ def image_to_string(args):
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except TesseractError:
except (TesseractError, OtherTesseractError):
pass
return ocr.image_to_string(f, lang=lang)

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -1,7 +1,13 @@
from django.test import TestCase
import os
from unittest import mock, skipIf
import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from ..consumer import strip_excess_whitespace
from ..models import FileInfo
from ..consumer import image_to_string, strip_excess_whitespace
class TestAttributes(TestCase):
@ -304,6 +310,28 @@ class TestFieldPermutations(TestCase):
template.format(**spec), **spec)
class FakeTesseract(object):
    """
    Test double for a pyocr OCR tool.

    It claims to support orientation detection, but every attempt to detect
    orientation raises ``OtherTesseractError``. Text extraction always
    succeeds and yields a fixed string.
    """

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return a canned string; both arguments are ignored."""
        return "This is test text"

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always fail by raising ``OtherTesseractError``."""
        failure = OtherTesseractError("arbitrary status", "message")
        raise failure

    @staticmethod
    def can_detect_orientation():
        """Report that orientation detection is available."""
        return True
class FakePyOcr(object):
    """
    Test double exposing a ``get_available_tools`` lookup whose only
    available tool is ``FakeTesseract``.
    """

    @staticmethod
    def get_available_tools():
        """Return a one-element list containing the ``FakeTesseract`` class."""
        tools = [FakeTesseract]
        return tools
class TestOCR(TestCase):
text_cases = [
@ -318,6 +346,9 @@ class TestOCR(TestCase):
)
]
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
def test_strip_excess_whitespace(self):
for source, result in self.text_cases:
actual_result = strip_excess_whitespace(source)
@ -330,3 +361,18 @@ class TestOCR(TestCase):
actual_result
)
)
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
@mock.patch("documents.consumer.pyocr", FakePyOcr)
def test_image_to_string_with_text_free_page(self):
    """
    Check that image_to_string() survives a failing orientation detection.

    This test is sort of silly, since it's really just reproducing an odd
    exception thrown by pyocr when it encounters a page with no text.
    Actually running this test against an installation of Tesseract results
    in a segmentation fault rooted somewhere deep inside pyocr where I
    don't care to dig. Regardless, if you run the consumer normally,
    text-free pages are now handled correctly so long as we work around
    this weird exception.

    With pyocr patched out for FakePyOcr, orientation detection raises and
    text extraction returns a fixed string.
    """
    result = image_to_string(["text.png", "en"])

    # Reaching this line already proves the orientation error was swallowed;
    # additionally pin that the OCR text is still returned afterwards rather
    # than being lost along with the failed rotation.
    self.assertEqual(result, "This is test text")