diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 147b45edd..02397c118 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,33 +1,31 @@ -import datetime -import hashlib -import logging -import tempfile -import uuid - -from multiprocessing.pool import Pool - -import itertools - -import langdetect import os import re +import uuid +import shutil +import hashlib +import logging +import datetime +import tempfile +import itertools import subprocess +from multiprocessing.pool import Pool import pyocr -import shutil - +import langdetect from PIL import Image - from django.conf import settings from django.utils import timezone -from pyocr.tesseract import TesseractError - from paperless.db import GnuPG +from pyocr.tesseract import TesseractError +from pyocr.libtesseract.tesseract_raw import \ + TesseractError as OtherTesseractError from .models import Tag, Document, FileInfo -from .languages import ISO639 from .signals import ( - document_consumption_started, document_consumption_finished) + document_consumption_started, + document_consumption_finished +) +from .languages import ISO639 class OCRError(Exception): @@ -381,7 +379,7 @@ def image_to_string(args): try: orientation = ocr.detect_orientation(f, lang=lang) f = f.rotate(orientation["angle"], expand=1) - except TesseractError: + except (TesseractError, OtherTesseractError): pass return ocr.image_to_string(f, lang=lang) diff --git a/src/documents/tests/samples/no-text.png b/src/documents/tests/samples/no-text.png new file mode 100644 index 000000000..e78b22bfb Binary files /dev/null and b/src/documents/tests/samples/no-text.png differ diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 18bbab50f..f4c7039a9 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,7 +1,13 @@ -from django.test import TestCase +import os +from unittest import mock, skipIf + +import pyocr +from django.test 
import TestCase +from pyocr.libtesseract.tesseract_raw import \ + TesseractError as OtherTesseractError -from ..consumer import strip_excess_whitespace from ..models import FileInfo +from ..consumer import image_to_string, strip_excess_whitespace class TestAttributes(TestCase): @@ -304,6 +310,28 @@ class TestFieldPermutations(TestCase): template.format(**spec), **spec) +class FakeTesseract(object): + + @staticmethod + def can_detect_orientation(): + return True + + @staticmethod + def detect_orientation(file_handle, lang): + raise OtherTesseractError("arbitrary status", "message") + + @staticmethod + def image_to_string(file_handle, lang): + return "This is test text" + + +class FakePyOcr(object): + + @staticmethod + def get_available_tools(): + return [FakeTesseract] + + class TestOCR(TestCase): text_cases = [ @@ -317,6 +345,9 @@ class TestOCR(TestCase): "utf-8 строка с пробелами в конце" ) ] + + SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") + TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) def test_strip_excess_whitespace(self): for source, result in self.text_cases: @@ -330,3 +361,18 @@ class TestOCR(TestCase): actual_result ) ) + + @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") + @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES) + @mock.patch("documents.consumer.pyocr", FakePyOcr) + def test_image_to_string_with_text_free_page(self): + """ + This test is sort of silly, since it's really just reproducing an odd + exception thrown by pyocr when it encounters a page with no text. + Actually running this test against an installation of Tesseract results + in a segmentation fault rooted somewhere deep inside pyocr where I + don't care to dig. Regardless, if you run the consumer normally, + text-free pages are now handled correctly so long as we work around + this weird exception. + """ + image_to_string(["no-text.png", "en"])