Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError
2026-02-09 23:49:29 -06:00 · 2016-11-27 15:06:45 +00:00
parent b88e0fd902
commit 18495ce9da
3 changed files with 65 additions and 21 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -1,33 +1,31 @@
 import datetime
 import hashlib
 import logging
 import tempfile
 import uuid
 from multiprocessing.pool import Pool
 import itertools
 import langdetect
 import os
 import re
 import uuid
 import shutil
 import hashlib
 import logging
 import datetime
 import tempfile
 import itertools
 import subprocess
 from multiprocessing.pool import Pool
 import pyocr
-import shutil
+import langdetect
 from PIL import Image
 from django.conf import settings
 from django.utils import timezone
 from pyocr.tesseract import TesseractError
 from paperless.db import GnuPG
 from pyocr.tesseract import TesseractError
 from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError
 from .models import Tag, Document, FileInfo
 from .languages import ISO639
 from .signals import (
-    document_consumption_started, document_consumption_finished)
+    document_consumption_started,
    document_consumption_finished
 )
 from .languages import ISO639
 class OCRError(Exception):
@@ -381,7 +379,7 @@ def image_to_string(args):
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
-            except TesseractError:
+            except (TesseractError, OtherTesseractError):
                pass
        return ocr.image_to_string(f, lang=lang)
--- a/src/documents/tests/samples/no-text.png
+++ b/src/documents/tests/samples/no-text.png
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -1,7 +1,13 @@
-from django.test import TestCase
+import os
 from unittest import mock, skipIf
 import pyocr
 from django.test import TestCase
 from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError
 from ..consumer import strip_excess_whitespace
 from ..models import FileInfo
 from ..consumer import image_to_string, strip_excess_whitespace
 class TestAttributes(TestCase):
@@ -304,6 +310,28 @@ class TestFieldPermutations(TestCase):
                            template.format(**spec), **spec)
 class FakeTesseract(object):
    """
    Minimal stand-in for an OCR tool object, used so the tests never have to
    call a real Tesseract binary.

    It claims orientation detection is available, always fails when asked to
    detect orientation, and hands back a fixed string as the OCR result.
    """

    @staticmethod
    def image_to_string(file_handle, lang):
        """Ignore both arguments and return a canned OCR result."""
        canned_text = "This is test text"
        return canned_text

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always raise OtherTesseractError, whatever the arguments are."""
        failure = OtherTesseractError("arbitrary status", "message")
        raise failure

    @staticmethod
    def can_detect_orientation():
        """Report orientation detection as supported."""
        return True
 class FakePyOcr(object):
    """
    Replacement for the ``pyocr`` module in tests: the only tool it ever
    advertises is FakeTesseract.
    """

    @staticmethod
    def get_available_tools():
        """Return a one-element list holding the FakeTesseract class."""
        available = [FakeTesseract]
        return available
 class TestOCR(TestCase):
    text_cases = [
@@ -317,6 +345,9 @@ class TestOCR(TestCase):
            "utf-8 строка с пробелами в конце"
        )
    ]
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
@@ -330,3 +361,18 @@ class TestOCR(TestCase):
                    actual_result
                )
            )
    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
    @mock.patch("documents.consumer.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        Check that image_to_string() survives pyocr's *other* TesseractError.

        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.

        With ``pyocr`` swapped for FakePyOcr, orientation detection always
        raises OtherTesseractError, so the test passes as long as that
        exception does not escape image_to_string().
        """
        # Use the text-free sample from the samples directory; SCRATCH is
        # patched to SAMPLE_FILES, presumably so the consumer resolves the
        # bare file name there (verify against image_to_string()).
        image_to_string(["no-text.png", "en"])