mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00

I've broken out the OCR-specific code from the consumers and dumped it all into its own app, `paperless_tesseract`. This new app should serve as a sample of how to create one's own consumer for different file types.

Documentation for how to do this isn't ready yet, but for the impatient:

* Create a new app
  * containing a `parsers.py` for your parser modelled after `paperless_tesseract.parsers.RasterisedDocumentParser`
  * containing a `signals.py` with a handler modelled after `paperless_tesseract.signals.ConsumerDeclaration`
  * connect the signal handler to `documents.signals.document_consumer_declaration` in `your_app.apps`
* Install the app into Paperless by declaring `PAPERLESS_INSTALLED_APPS=your_app`. Additional apps should be separated with commas.
* Restart the consumer
81 lines
2.4 KiB
Python
81 lines
2.4 KiB
Python
import os
|
||
from unittest import mock, skipIf
|
||
|
||
import pyocr
|
||
from django.test import TestCase
|
||
from pyocr.libtesseract.tesseract_raw import \
|
||
TesseractError as OtherTesseractError
|
||
|
||
from ..parsers import image_to_string, strip_excess_whitespace
|
||
|
||
|
||
class FakeTesseract:
    """Test double standing in for a pyocr OCR tool.

    Orientation detection is advertised as available but always fails with
    ``OtherTesseractError``; text extraction always succeeds and yields a
    fixed string.
    """

    @staticmethod
    def can_detect_orientation():
        """Report that orientation detection is supported."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always fail by raising ``OtherTesseractError``; arguments unused."""
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return the same canned text regardless of the arguments."""
        return "This is test text"
|
||
|
||
|
||
class FakePyOcr:
    """Test double for the ``pyocr`` module exposing only ``FakeTesseract``."""

    @staticmethod
    def get_available_tools():
        """Return the available OCR tools: a list holding ``FakeTesseract``."""
        tools = [FakeTesseract]
        return tools
|
||
|
||
|
||
class TestOCR(TestCase):
    """Tests for ``strip_excess_whitespace`` and ``image_to_string``."""

    # (input, expected output) pairs fed to strip_excess_whitespace().
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # "samples" directory located next to this test module.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # True when pyocr reports at least one available tool.
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """Check strip_excess_whitespace() against every pair in text_cases."""
        for source, result in self.text_cases:
            # One sub-test per case so a failure does not hide later cases.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    "strip_excess_whitespace({}) != '{}', but '{}'".format(
                        source,
                        result,
                        actual_result
                    )
                )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH",
        SAMPLE_FILES
    )
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        # No assertion: the test passes as long as no exception propagates.
        image_to_string(["no-text.png", "en"])
|