Merge pull request #197 from danielquinn/pluggable-consumers

Pluggable consumers
This commit is contained in:
Daniel Quinn 2017-03-25 15:20:48 +00:00 committed by GitHub
commit b7cb708053
21 changed files with 454 additions and 293 deletions

View File

@ -8,7 +8,9 @@ matrix:
env: TOXENV=py34
- python: 3.5
env: TOXENV=py35
- python: 3.5
- python: 3.6
env: TOXENV=py36
- python: 3.6
env: TOXENV=pep8
install:

View File

@ -4,6 +4,14 @@ Changelog
* 0.3.6
* Fix for `#200`_ (!!) where the API wasn't configured to allow updating the
correspondent or the tags for a document.
* The ``content`` field is now optional, to allow for the edge case of a
purely graphical document.
* You can no longer add documents via the admin. This never worked in the
first place, so all I've done here is remove the link to the broken form.
* The consumer code has been heavily refactored to support a pluggable
interface. Install a paperless consumer via pip and tell paperless about
it with an environment variable, and you're good to go. Proper
documentation is on its way.
* 0.3.5
* A serious facelift for the documents listing page wherein we drop the

View File

@ -67,6 +67,7 @@ class DocumentAdmin(CommonAdmin):
def created_(self, obj):
return obj.created.date().strftime("%Y-%m-%d")
created_.short_description = "Created"
def thumbnail(self, obj):
png_img = self._html_tag(

View File

@ -1,35 +1,21 @@
import datetime
import hashlib
import logging
import os
import re
import uuid
import shutil
import hashlib
import logging
import datetime
import tempfile
import itertools
import subprocess
from multiprocessing.pool import Pool
import pyocr
import langdetect
from PIL import Image
from django.conf import settings
from django.utils import timezone
from paperless.db import GnuPG
from pyocr.tesseract import TesseractError
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from .models import Tag, Document, FileInfo
from .models import Document, FileInfo, Tag
from .parsers import ParseError
from .signals import (
document_consumption_started,
document_consumption_finished
document_consumer_declaration,
document_consumption_finished,
document_consumption_started
)
from .languages import ISO639
class OCRError(Exception):
pass
class ConsumerError(Exception):
@ -47,13 +33,7 @@ class Consumer(object):
"""
SCRATCH = settings.SCRATCH_DIR
CONVERT = settings.CONVERT_BINARY
UNPAPER = settings.UNPAPER_BINARY
CONSUME = settings.CONSUMPTION_DIR
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
def __init__(self):
@ -78,6 +58,16 @@ class Consumer(object):
raise ConsumerError(
"Consumption directory {} does not exist".format(self.CONSUME))
self.parsers = []
for response in document_consumer_declaration.send(self):
self.parsers.append(response[1])
if not self.parsers:
raise ConsumerError(
"No parsers could be found, not even the default. "
"This is a problem."
)
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
@ -109,6 +99,13 @@ class Consumer(object):
self._ignore.append(doc)
continue
parser_class = self._get_parser_class(doc)
if not parser_class:
self.log(
"info", "No parsers could be found for {}".format(doc))
self._ignore.append(doc)
continue
self.logging_group = uuid.uuid4()
self.log("info", "Consuming {}".format(doc))
@ -119,25 +116,26 @@ class Consumer(object):
logging_group=self.logging_group
)
tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
imgs = self._get_greyscale(tempdir, doc)
thumbnail = self._get_thumbnail(tempdir, doc)
parsed_document = parser_class(doc)
thumbnail = parsed_document.get_thumbnail()
try:
document = self._store(self._get_ocr(imgs), doc, thumbnail)
except OCRError as e:
document = self._store(
parsed_document.get_text(),
doc,
thumbnail
)
except ParseError as e:
self._ignore.append(doc)
self.log("error", "OCR FAILURE for {}: {}".format(doc, e))
self._cleanup_tempdir(tempdir)
self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
parsed_document.cleanup()
continue
else:
self._cleanup_tempdir(tempdir)
parsed_document.cleanup()
self._cleanup_doc(doc)
self.log(
@ -151,142 +149,20 @@ class Consumer(object):
logging_group=self.logging_group
)
def _get_greyscale(self, tempdir, doc):
def _get_parser_class(self, doc):
"""
Greyscale images are easier for Tesseract to OCR
Determine the appropriate parser class based on the file
"""
self.log("info", "Generating greyscale image from {}".format(doc))
options = []
for parser in self.parsers:
result = parser(doc)
if result:
options.append(result)
# Convert PDF to multiple PNMs
pnm = os.path.join(tempdir, "convert-%04d.pnm")
run_convert(
self.CONVERT,
"-density", str(self.DENSITY),
"-depth", "8",
"-type", "grayscale",
doc, pnm,
)
# Get a list of converted images
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".pnm"):
pnms.append(os.path.join(tempdir, f))
# Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
# Return list of converted images, processed with unpaper
pnms = []
for f in os.listdir(tempdir):
if f.endswith(".unpaper.pnm"):
pnms.append(os.path.join(tempdir, f))
return sorted(filter(lambda __: os.path.isfile(__), pnms))
def _get_thumbnail(self, tempdir, doc):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
self.log("info", "Generating the thumbnail")
run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
doc, os.path.join(tempdir, "convert-%04d.png")
)
return os.path.join(tempdir, "convert-0000.png")
def _guess_language(self, text):
try:
guess = langdetect.detect(text)
self.log("debug", "Language detected: {}".format(guess))
return guess
except Exception as e:
self.log("warning", "Language detection error: {}".format(e))
def _get_ocr(self, imgs):
"""
Attempts to do the best job possible OCR'ing the document based on
simple language detection trial & error.
"""
if not imgs:
raise OCRError("No images found")
self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1
middle = int(len(imgs) / 2)
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed!")
if settings.FORGIVING_OCR:
self.log(
"warning",
"As FORGIVING_OCR is enabled, we're going to make the "
"best with what we have."
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError("Language detection failed")
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
try:
return self._ocr(imgs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
self.log(
"warning",
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
)
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError(
"The guessed language is not available in this instance of "
"Tesseract."
)
def _assemble_ocr_sections(self, imgs, middle, text):
"""
Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing.
"""
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text
def _ocr(self, imgs, lang):
"""
Performs a single OCR attempt.
"""
if not imgs:
return ""
self.log("info", "Parsing for {}".format(lang))
with Pool(processes=self.THREADS) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r)
# Strip out excess white space to allow matching to go smoother
return strip_excess_whitespace(r)
# Return the parser with the highest weight.
return sorted(
options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
def _store(self, text, doc, thumbnail):
@ -332,10 +208,6 @@ class Consumer(object):
return document
def _cleanup_tempdir(self, d):
self.log("debug", "Deleting directory {}".format(d))
shutil.rmtree(d)
def _cleanup_doc(self, doc):
self.log("debug", "Deleting document {}".format(doc))
os.unlink(doc)
@ -361,41 +233,3 @@ class Consumer(object):
with open(doc, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
return Document.objects.filter(checksum=checksum).exists()
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(
"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
return no_trailing_whitespace
def image_to_string(args):
img, lang = args
ocr = pyocr.get_available_tools()[0]
with Image.open(os.path.join(Consumer.SCRATCH, img)) as f:
if ocr.can_detect_orientation():
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except (TesseractError, OtherTesseractError):
pass
return ocr.image_to_string(f, lang=lang)
def run_unpaper(args):
unpaper, pnm = args
subprocess.Popen(
(unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
def run_convert(*args):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
if settings.CONVERT_TMPDIR:
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
subprocess.Popen(args, env=environment).wait()

View File

@ -158,13 +158,22 @@ class Document(models.Model):
correspondent = models.ForeignKey(
Correspondent, blank=True, null=True, related_name="documents")
title = models.CharField(max_length=128, blank=True, db_index=True)
content = models.TextField(db_index=True)
content = models.TextField(
db_index=True,
blank=True,
help_text="The raw, text-only data of the document. This field is "
"primarily used for searching."
)
file_type = models.CharField(
max_length=4,
editable=False,
choices=tuple([(t, t.upper()) for t in TYPES])
)
tags = models.ManyToManyField(
Tag, related_name="documents", blank=True)

45
src/documents/parsers.py Normal file
View File

@ -0,0 +1,45 @@
import logging
import shutil
import tempfile
from django.conf import settings
class ParseError(Exception):
pass
class DocumentParser(object):
"""
Subclass this to make your own parser. Have a look at
`paperless_tesseract.parsers` for inspiration.
"""
SCRATCH = settings.SCRATCH_DIR
def __init__(self, path):
self.document_path = path
self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
self.logger = logging.getLogger(__name__)
self.logging_group = None
def get_thumbnail(self):
"""
Returns the path to a file we can use as a thumbnail for this document.
"""
raise NotImplementedError()
def get_text(self):
"""
Returns the text from the document and only the text.
"""
raise NotImplementedError()
def log(self, level, message):
getattr(self.logger, level)(message, extra={
"group": self.logging_group
})
def cleanup(self):
self.log("debug", "Deleting directory {}".format(self.tempdir))
shutil.rmtree(self.tempdir)

View File

@ -2,3 +2,4 @@ from django.dispatch import Signal
document_consumption_started = Signal(providing_args=["filename"])
document_consumption_finished = Signal(providing_args=["document"])
document_consumer_declaration = Signal(providing_args=[])

View File

@ -1,6 +1,5 @@
import logging
import os
from subprocess import Popen
from django.conf import settings

View File

@ -158,7 +158,7 @@
<script>
// We nee to re-build the select-all functionality as the old logic pointed
// We need to re-build the select-all functionality as the old logic pointed
// to a table and we're using divs now.
django.jQuery("#action-toggle").on("change", function(){
django.jQuery(".grid .box .result .checkbox input")

View File

@ -1,13 +1,6 @@
import os
from unittest import mock, skipIf
import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from ..models import FileInfo
from ..consumer import image_to_string, strip_excess_whitespace
class TestAttributes(TestCase):
@ -308,71 +301,3 @@ class TestFieldPermutations(TestCase):
}
self._test_guessed_attributes(
template.format(**spec), **spec)
class FakeTesseract(object):
@staticmethod
def can_detect_orientation():
return True
@staticmethod
def detect_orientation(file_handle, lang):
raise OtherTesseractError("arbitrary status", "message")
@staticmethod
def image_to_string(file_handle, lang):
return "This is test text"
class FakePyOcr(object):
@staticmethod
def get_available_tools():
return [FakeTesseract]
class TestOCR(TestCase):
text_cases = [
("simple string", "simple string"),
(
"simple newline\n testing string",
"simple newline\ntesting string"
),
(
"utf-8 строка с пробелами в конце ",
"utf-8 строка с пробелами в конце"
)
]
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
def test_strip_excess_whitespace(self):
for source, result in self.text_cases:
actual_result = strip_excess_whitespace(source)
self.assertEqual(
result,
actual_result,
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
source,
result,
actual_result
)
)
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
@mock.patch("documents.consumer.pyocr", FakePyOcr)
def test_image_to_string_with_text_free_page(self):
"""
This test is sort of silly, since it's really just reproducing an odd
exception thrown by pyocr when it encounters a page with no text.
Actually running this test against an installation of Tesseract results
in a segmentation fault rooted somewhere deep inside pyocr where I
don't care to dig. Regardless, if you run the consumer normally,
text-free pages are now handled correctly so long as we work around
this weird exception.
"""
image_to_string(["no-text.png", "en"])

View File

@ -61,6 +61,7 @@ INSTALLED_APPS = [
"django_extensions",
"documents.apps.DocumentsConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
"flat_responsive",
"django.contrib.admin",
@ -70,6 +71,9 @@ INSTALLED_APPS = [
]
if os.getenv("PAPERLESS_INSTALLED_APPS"):
INSTALLED_APPS += os.getenv("PAPERLESS_INSTALLED_APPS").split(",")
MIDDLEWARE_CLASSES = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',

View File

@ -1 +1 @@
__version__ = (0, 3, 5)
__version__ = (0, 3, 6)

View File

View File

@ -0,0 +1,16 @@
from django.apps import AppConfig
class PaperlessTesseractConfig(AppConfig):
name = "paperless_tesseract"
def ready(self):
from documents.signals import document_consumer_declaration
from .signals import ConsumerDeclaration
document_consumer_declaration.connect(ConsumerDeclaration.handle)
AppConfig.ready(self)

View File

@ -0,0 +1,214 @@
import itertools
import os
import re
import subprocess
from multiprocessing.pool import Pool
import langdetect
import pyocr
from django.conf import settings
from documents.parsers import DocumentParser, ParseError
from PIL import Image
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from pyocr.tesseract import TesseractError
from .languages import ISO639
class OCRError(Exception):
pass
class RasterisedDocumentParser(DocumentParser):
"""
This parser uses Tesseract to try and get some text out of a rasterised
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
"""
CONVERT = settings.CONVERT_BINARY
DENSITY = settings.CONVERT_DENSITY if settings.CONVERT_DENSITY else 300
THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None
UNPAPER = settings.UNPAPER_BINARY
DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE
def get_thumbnail(self):
"""
The thumbnail of a PDF is just a 500px wide image of the first page.
"""
run_convert(
self.CONVERT,
"-scale", "500x5000",
"-alpha", "remove",
self.document_path, os.path.join(self.tempdir, "convert-%04d.png")
)
return os.path.join(self.tempdir, "convert-0000.png")
def get_text(self):
images = self._get_greyscale()
try:
return self._get_ocr(images)
except OCRError as e:
raise ParseError(e)
def _get_greyscale(self):
"""
Greyscale images are easier for Tesseract to OCR
"""
# Convert PDF to multiple PNMs
pnm = os.path.join(self.tempdir, "convert-%04d.pnm")
run_convert(
self.CONVERT,
"-density", str(self.DENSITY),
"-depth", "8",
"-type", "grayscale",
self.document_path, pnm,
)
# Get a list of converted images
pnms = []
for f in os.listdir(self.tempdir):
if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f))
# Run unpaper in parallel on converted images
with Pool(processes=self.THREADS) as pool:
pool.map(run_unpaper, itertools.product([self.UNPAPER], pnms))
# Return list of converted images, processed with unpaper
pnms = []
for f in os.listdir(self.tempdir):
if f.endswith(".unpaper.pnm"):
pnms.append(os.path.join(self.tempdir, f))
return sorted(filter(lambda __: os.path.isfile(__), pnms))
def _guess_language(self, text):
try:
guess = langdetect.detect(text)
self.log("debug", "Language detected: {}".format(guess))
return guess
except Exception as e:
self.log("warning", "Language detection error: {}".format(e))
def _get_ocr(self, imgs):
"""
Attempts to do the best job possible OCR'ing the document based on
simple language detection trial & error.
"""
if not imgs:
raise OCRError("No images found")
self.log("info", "OCRing the document")
# Since the division gets rounded down by int, this calculation works
# for every edge-case, i.e. 1
middle = int(len(imgs) / 2)
raw_text = self._ocr([imgs[middle]], self.DEFAULT_OCR_LANGUAGE)
guessed_language = self._guess_language(raw_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed!")
if settings.FORGIVING_OCR:
self.log(
"warning",
"As FORGIVING_OCR is enabled, we're going to make the "
"best with what we have."
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError("Language detection failed")
if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
try:
return self._ocr(imgs, ISO639[guessed_language])
except pyocr.pyocr.tesseract.TesseractError:
if settings.FORGIVING_OCR:
self.log(
"warning",
"OCR for {} failed, but we're going to stick with what "
"we've got since FORGIVING_OCR is enabled.".format(
guessed_language
)
)
raw_text = self._assemble_ocr_sections(imgs, middle, raw_text)
return raw_text
raise OCRError(
"The guessed language is not available in this instance of "
"Tesseract."
)
def _ocr(self, imgs, lang):
"""
Performs a single OCR attempt.
"""
if not imgs:
return ""
self.log("info", "Parsing for {}".format(lang))
with Pool(processes=self.THREADS) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
r = " ".join(r)
# Strip out excess white space to allow matching to go smoother
return strip_excess_whitespace(r)
def _assemble_ocr_sections(self, imgs, middle, text):
"""
Given a `middle` value and the text that middle page represents, we OCR
the remainder of the document and return the whole thing.
"""
text = self._ocr(imgs[:middle], self.DEFAULT_OCR_LANGUAGE) + text
text += self._ocr(imgs[middle + 1:], self.DEFAULT_OCR_LANGUAGE)
return text
def run_convert(*args):
environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT:
environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
if settings.CONVERT_TMPDIR:
environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR
subprocess.Popen(args, env=environment).wait()
def run_unpaper(args):
unpaper, pnm = args
subprocess.Popen(
(unpaper, pnm, pnm.replace(".pnm", ".unpaper.pnm"))).wait()
def strip_excess_whitespace(text):
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(
"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
no_trailing_whitespace = re.sub("([^\S\n\r]+)$", '', no_leading_whitespace)
return no_trailing_whitespace
def image_to_string(args):
img, lang = args
ocr = pyocr.get_available_tools()[0]
with Image.open(os.path.join(RasterisedDocumentParser.SCRATCH, img)) as f:
if ocr.can_detect_orientation():
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except (TesseractError, OtherTesseractError):
pass
return ocr.image_to_string(f, lang=lang)

View File

@ -0,0 +1,23 @@
import re
from .parsers import RasterisedDocumentParser
class ConsumerDeclaration(object):
MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$")
@classmethod
def handle(cls, sender, **kwargs):
return cls.test
@classmethod
def test(cls, doc):
if cls.MATCHING_FILES.match(doc):
return {
"parser": RasterisedDocumentParser,
"weight": 0
}
return None

View File

Before

Width:  |  Height:  |  Size: 32 KiB

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -0,0 +1,80 @@
import os
from unittest import mock, skipIf
import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
from ..parsers import image_to_string, strip_excess_whitespace
class FakeTesseract(object):
@staticmethod
def can_detect_orientation():
return True
@staticmethod
def detect_orientation(file_handle, lang):
raise OtherTesseractError("arbitrary status", "message")
@staticmethod
def image_to_string(file_handle, lang):
return "This is test text"
class FakePyOcr(object):
@staticmethod
def get_available_tools():
return [FakeTesseract]
class TestOCR(TestCase):
text_cases = [
("simple string", "simple string"),
(
"simple newline\n testing string",
"simple newline\ntesting string"
),
(
"utf-8 строка с пробелами в конце ",
"utf-8 строка с пробелами в конце"
)
]
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
def test_strip_excess_whitespace(self):
for source, result in self.text_cases:
actual_result = strip_excess_whitespace(source)
self.assertEqual(
result,
actual_result,
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
source,
result,
actual_result
)
)
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
@mock.patch(
"paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
SAMPLE_FILES
)
@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
def test_image_to_string_with_text_free_page(self):
"""
This test is sort of silly, since it's really just reproducing an odd
exception thrown by pyocr when it encounters a page with no text.
Actually running this test against an installation of Tesseract results
in a segmentation fault rooted somewhere deep inside pyocr where I
don't care to dig. Regardless, if you run the consumer normally,
text-free pages are now handled correctly so long as we work around
this weird exception.
"""
image_to_string(["no-text.png", "en"])

View File

@ -5,7 +5,7 @@
[tox]
skipsdist = True
envlist = py34, py35, pep8
envlist = py34, py35, py36, pep8
[testenv]
commands = {envpython} manage.py test