mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and dumped it all into its own app, `paperless_tesseract`. This new app should serve as a sample of how to create one's own consumer for different file types. Documentation for how to do this isn't ready yet, but for the impatient: * Create a new app * containing a `parsers.py` for your parser modelled after `paperless_tesseract.parsers.RasterisedDocumentParser` * containing a `signals.py` with a handler moddelled after `paperless_tesseract.signals.ConsumerDeclaration` * connect the signal handler to `documents.signals.document_consumer_declaration` in `your_app.apps` * Install the app into Paperless by declaring `PAPERLESS_INSTALLED_APPS=your_app`. Additional apps should be separated with commas. * Restart the consumer
This commit is contained in:
Binary file not shown.
Before Width: | Height: | Size: 32 KiB |
@@ -1,13 +1,6 @@
|
||||
import os
|
||||
from unittest import mock, skipIf
|
||||
|
||||
import pyocr
|
||||
from django.test import TestCase
|
||||
from pyocr.libtesseract.tesseract_raw import \
|
||||
TesseractError as OtherTesseractError
|
||||
|
||||
from ..models import FileInfo
|
||||
from ..consumer import image_to_string, strip_excess_whitespace
|
||||
|
||||
|
||||
class TestAttributes(TestCase):
|
||||
@@ -310,69 +303,3 @@ class TestFieldPermutations(TestCase):
|
||||
template.format(**spec), **spec)
|
||||
|
||||
|
||||
class FakeTesseract(object):
|
||||
|
||||
@staticmethod
|
||||
def can_detect_orientation():
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def detect_orientation(file_handle, lang):
|
||||
raise OtherTesseractError("arbitrary status", "message")
|
||||
|
||||
@staticmethod
|
||||
def image_to_string(file_handle, lang):
|
||||
return "This is test text"
|
||||
|
||||
|
||||
class FakePyOcr(object):
|
||||
|
||||
@staticmethod
|
||||
def get_available_tools():
|
||||
return [FakeTesseract]
|
||||
|
||||
|
||||
class TestOCR(TestCase):
|
||||
|
||||
text_cases = [
|
||||
("simple string", "simple string"),
|
||||
(
|
||||
"simple newline\n testing string",
|
||||
"simple newline\ntesting string"
|
||||
),
|
||||
(
|
||||
"utf-8 строка с пробелами в конце ",
|
||||
"utf-8 строка с пробелами в конце"
|
||||
)
|
||||
]
|
||||
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
|
||||
TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
|
||||
|
||||
def test_strip_excess_whitespace(self):
|
||||
for source, result in self.text_cases:
|
||||
actual_result = strip_excess_whitespace(source)
|
||||
self.assertEqual(
|
||||
result,
|
||||
actual_result,
|
||||
"strip_exceess_whitespace({}) != '{}', but '{}'".format(
|
||||
source,
|
||||
result,
|
||||
actual_result
|
||||
)
|
||||
)
|
||||
|
||||
@skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
|
||||
@mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
|
||||
@mock.patch("documents.consumer.pyocr", FakePyOcr)
|
||||
def test_image_to_string_with_text_free_page(self):
|
||||
"""
|
||||
This test is sort of silly, since it's really just reproducing an odd
|
||||
exception thrown by pyocr when it encounters a page with no text.
|
||||
Actually running this test against an installation of Tesseract results
|
||||
in a segmentation fault rooted somewhere deep inside pyocr where I
|
||||
don't care to dig. Regardless, if you run the consumer normally,
|
||||
text-free pages are now handled correctly so long as we work around
|
||||
this weird exception.
|
||||
"""
|
||||
image_to_string(["no-text.png", "en"])
|
||||
|
Reference in New Issue
Block a user