mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
commit
50896f48db
@ -1,33 +1,31 @@
|
||||
import datetime
|
||||
import hashlib
|
||||
import logging
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import itertools
|
||||
|
||||
import langdetect
|
||||
import os
|
||||
import re
|
||||
import uuid
|
||||
import shutil
|
||||
import hashlib
|
||||
import logging
|
||||
import datetime
|
||||
import tempfile
|
||||
import itertools
|
||||
import subprocess
|
||||
from multiprocessing.pool import Pool
|
||||
|
||||
import pyocr
|
||||
import shutil
|
||||
|
||||
import langdetect
|
||||
from PIL import Image
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from pyocr.tesseract import TesseractError
|
||||
|
||||
from paperless.db import GnuPG
|
||||
from pyocr.tesseract import TesseractError
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
|
||||
from .models import Tag, Document, FileInfo
|
||||
from .languages import ISO639
|
||||
from .signals import (
|
||||
document_consumption_started, document_consumption_finished)
|
||||
document_consumption_started,
|
||||
document_consumption_finished
|
||||
)
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
class OCRError(Exception):
|
||||
@ -381,7 +379,7 @@ def image_to_string(args):
|
||||
try:
|
||||
orientation = ocr.detect_orientation(f, lang=lang)
|
||||
f = f.rotate(orientation["angle"], expand=1)
|
||||
except TesseractError:
|
||||
except (TesseractError, OtherTesseractError):
|
||||
pass
|
||||
return ocr.image_to_string(f, lang=lang)
|
||||
|
||||
|
BIN
src/documents/tests/samples/no-text.png
Normal file
BIN
src/documents/tests/samples/no-text.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 32 KiB |
@ -1,7 +1,13 @@
|
||||
from django.test import TestCase
|
||||
import os
|
||||
from unittest import mock, skipIf
|
||||
|
||||
import pyocr
|
||||
from django.test import TestCase
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
|
||||
from ..consumer import strip_excess_whitespace
|
||||
from ..models import FileInfo
|
||||
from ..consumer import image_to_string, strip_excess_whitespace
|
||||
|
||||
|
||||
class TestAttributes(TestCase):
|
||||
@ -304,6 +310,28 @@ class TestFieldPermutations(TestCase):
|
||||
template.format(**spec), **spec)
|
||||
|
||||
|
||||
class FakeTesseract(object):
|
||||
|
||||
@staticmethod
|
||||
def can_detect_orientation():
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def detect_orientation(file_handle, lang):
|
||||
raise OtherTesseractError("arbitrary status", "message")
|
||||
|
||||
@staticmethod
|
||||
def image_to_string(file_handle, lang):
|
||||
return "This is test text"
|
||||
|
||||
|
||||
class FakePyOcr(object):
|
||||
|
||||
@staticmethod
|
||||
def get_available_tools():
|
||||
return [FakeTesseract]
|
||||
|
||||
|
||||
class TestOCR(TestCase):
|
||||
|
||||
text_cases = [
|
||||
@ -318,6 +346,9 @@ class TestOCR(TestCase):
|
||||
)
|
||||
]
|
||||
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
|
||||
|
||||
def test_strip_excess_whitespace(self):
|
||||
for source, result in self.text_cases:
|
||||
actual_result = strip_excess_whitespace(source)
|
||||
@ -330,3 +361,18 @@ class TestOCR(TestCase):
|
||||
actual_result
|
||||
)
|
||||
)
|
||||
|
||||
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
|
||||
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
|
||||
@mock.patch("documents.consumer.pyocr", FakePyOcr)
|
||||
def test_image_to_string_with_text_free_page(self):
|
||||
"""
|
||||
This test is sort of silly, since it's really just reproducing an odd
|
||||
exception thrown by pyocr when it encounters a page with no text.
|
||||
Actually running this test against an installation of Tesseract results
|
||||
in a segmentation fault rooted somewhere deep inside pyocr where I
|
||||
don't care to dig. Regardless, if you run the consumer normally,
|
||||
text-free pages are now handled correctly so long as we work around
|
||||
this weird exception.
|
||||
"""
|
||||
image_to_string(["text.png", "en"])
|
||||
|
Loading…
x
Reference in New Issue
Block a user