mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-23 12:58:18 -05:00
Ok, restart implementing this with just azure
[ci skip]
This commit is contained in:
parent
62f46b706e
commit
c796b0ed0d
@ -1266,3 +1266,11 @@ OUTLOOK_OAUTH_ENABLED = bool(
|
||||
and OUTLOOK_OAUTH_CLIENT_ID
|
||||
and OUTLOOK_OAUTH_CLIENT_SECRET,
|
||||
)
|
||||
|
||||
###############################################################################
|
||||
# Remote Parser #
|
||||
###############################################################################
|
||||
|
||||
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
|
||||
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
|
||||
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
|
||||
|
4
src/paperless_remote/__init__.py
Normal file
4
src/paperless_remote/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# this is here so that django finds the checks.
|
||||
from paperless_remote.checks import check_remote_parser_configured
|
||||
|
||||
__all__ = ["check_remote_parser_configured"]
|
14
src/paperless_remote/apps.py
Normal file
14
src/paperless_remote/apps.py
Normal file
@ -0,0 +1,14 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_remote.signals import remote_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessRemoteParserConfig(AppConfig):
|
||||
name = "paperless_remote"
|
||||
|
||||
def ready(self):
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
document_consumer_declaration.connect(remote_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
15
src/paperless_remote/checks.py
Normal file
15
src/paperless_remote/checks.py
Normal file
@ -0,0 +1,15 @@
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import register
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs):
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
|
||||
return [
|
||||
Error(
|
||||
"Azure AI remote parser requires endpoint to be configured.",
|
||||
),
|
||||
]
|
||||
|
||||
return []
|
74
src/paperless_remote/parsers.py
Normal file
74
src/paperless_remote/parsers.py
Normal file
@ -0,0 +1,74 @@
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class RemoteEngineConfig:
|
||||
def __init__(
|
||||
self,
|
||||
engine: str,
|
||||
api_key: str | None = None,
|
||||
endpoint: str | None = None,
|
||||
):
|
||||
self.engine = engine
|
||||
self.api_key = api_key
|
||||
self.endpoint = endpoint
|
||||
|
||||
def engine_is_valid(self):
|
||||
valid = self.engine in ["azureai"] and self.api_key is not None
|
||||
if self.engine == "azureai":
|
||||
valid = valid and self.endpoint is not None
|
||||
return valid
|
||||
|
||||
|
||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
"""
|
||||
This parser uses a remote ocr engine to parse documents
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.remote"
|
||||
|
||||
def get_settings(self) -> RemoteEngineConfig:
|
||||
"""
|
||||
This parser uses the OCR configuration settings to parse documents
|
||||
"""
|
||||
return RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
|
||||
def supported_mime_types(self):
|
||||
if self.settings.engine_is_valid():
|
||||
return [
|
||||
"application/pdf",
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
"image/webp",
|
||||
]
|
||||
else:
|
||||
return []
|
||||
|
||||
def azure_ai_vision_parse(
|
||||
self,
|
||||
file: Path,
|
||||
) -> str | None:
|
||||
"""
|
||||
This method uses the Azure AI Vision API to parse documents
|
||||
"""
|
||||
# TODO: Implement the Azure AI Vision API parsing logic
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||
if not self.settings.engine_is_valid():
|
||||
self.log.warning(
|
||||
"No valid remote parser engine is configured, content will be empty.",
|
||||
)
|
||||
self.text = ""
|
||||
return
|
||||
elif self.settings.engine == "azureai":
|
||||
self.text = self.azure_ai_vision_parse(document_path)
|
18
src/paperless_remote/signals.py
Normal file
18
src/paperless_remote/signals.py
Normal file
@ -0,0 +1,18 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
|
||||
return RemoteDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def get_supported_mime_types():
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
|
||||
return RemoteDocumentParser(None).supported_mime_types()
|
||||
|
||||
|
||||
def remote_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 5,
|
||||
"mime_types": get_supported_mime_types(),
|
||||
}
|
0
src/paperless_remote/tests/__init__.py
Normal file
0
src/paperless_remote/tests/__init__.py
Normal file
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
Binary file not shown.
29
src/paperless_remote/tests/test_checks.py
Normal file
29
src/paperless_remote/tests/test_checks.py
Normal file
@ -0,0 +1,29 @@
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless_remote import check_remote_parser_configured
|
||||
|
||||
|
||||
class TestChecks(TestCase):
|
||||
@override_settings(REMOTE_OCR_ENGINE=None)
|
||||
def test_no_engine(self):
|
||||
msgs = check_remote_parser_configured(None)
|
||||
self.assertEqual(len(msgs), 0)
|
||||
|
||||
@override_settings(REMOTE_OCR_ENGINE="azureai")
|
||||
@override_settings(REMOTE_OCR_API_KEY="somekey")
|
||||
@override_settings(REMOTE_OCR_ENDPOINT=None)
|
||||
def test_azure_no_endpoint(self):
|
||||
msgs = check_remote_parser_configured(None)
|
||||
self.assertEqual(len(msgs), 1)
|
||||
self.assertTrue(
|
||||
msgs[0].msg.startswith(
|
||||
"Azure AI Vision remote parser requires endpoint to be configured.",
|
||||
),
|
||||
)
|
||||
|
||||
@override_settings(REMOTE_OCR_ENGINE="something")
|
||||
@override_settings(REMOTE_OCR_API_KEY="somekey")
|
||||
def test_valid_configuration(self):
|
||||
msgs = check_remote_parser_configured(None)
|
||||
self.assertEqual(len(msgs), 0)
|
91
src/paperless_remote/tests/test_parser.py
Normal file
91
src/paperless_remote/tests/test_parser.py
Normal file
@ -0,0 +1,91 @@
|
||||
import sys
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
||||
|
||||
def assertContainsStrings(self, content, strings):
|
||||
# Asserts that all strings appear in content, in the given order.
|
||||
indices = []
|
||||
for s in strings:
|
||||
if s in content:
|
||||
indices.append(content.index(s))
|
||||
else:
|
||||
self.fail(f"'{s}' is not in '{content}'")
|
||||
self.assertListEqual(indices, sorted(indices))
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.version_info > (3, 10),
|
||||
reason="Fails on 3.11 only on CI, for some reason",
|
||||
) # TODO: investigate
|
||||
@mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
|
||||
def test_get_text_with_azure(self, mock_azure_client):
|
||||
result = mock.Mock()
|
||||
result.content = "This is a test document."
|
||||
result.pages = [
|
||||
mock.Mock(
|
||||
width=100,
|
||||
height=100,
|
||||
words=[
|
||||
mock.Mock(
|
||||
content="This",
|
||||
polygon=[
|
||||
mock.Mock(x=0, y=0),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="is",
|
||||
polygon=[
|
||||
mock.Mock(x=10, y=10),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="a",
|
||||
polygon=[
|
||||
mock.Mock(x=20, y=20),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="test",
|
||||
polygon=[
|
||||
mock.Mock(x=30, y=30),
|
||||
],
|
||||
),
|
||||
mock.Mock(
|
||||
content="document.",
|
||||
polygon=[
|
||||
mock.Mock(x=40, y=40),
|
||||
],
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = result
|
||||
|
||||
with override_settings(
|
||||
REMOTE_OCR_ENGINE="azureaivision",
|
||||
REMOTE_OCR_API_KEY="somekey",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
|
||||
):
|
||||
parser = RemoteDocumentParser(uuid.uuid4())
|
||||
parser.parse(
|
||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.text.strip(),
|
||||
["This is a test document."],
|
||||
)
|
Loading…
x
Reference in New Issue
Block a user