diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 361854a93..259b60097 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -1266,3 +1266,13 @@ OUTLOOK_OAUTH_ENABLED = bool(
     and OUTLOOK_OAUTH_CLIENT_ID
     and OUTLOOK_OAUTH_CLIENT_SECRET,
 )
+
+###############################################################################
+# Remote Parser                                                               #
+###############################################################################
+
+# Engine selection and credentials for the remote OCR parser. Each value is
+# None when its environment variable is unset.
+REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
+REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
+REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
diff --git a/src/paperless_remote/__init__.py b/src/paperless_remote/__init__.py
new file mode 100644
index 000000000..5380ea5ac
--- /dev/null
+++ b/src/paperless_remote/__init__.py
@@ -0,0 +1,4 @@
+# this is here so that django finds the checks.
+from paperless_remote.checks import check_remote_parser_configured
+
+__all__ = ["check_remote_parser_configured"]
diff --git a/src/paperless_remote/apps.py b/src/paperless_remote/apps.py
new file mode 100644
index 000000000..8cd3199f9
--- /dev/null
+++ b/src/paperless_remote/apps.py
@@ -0,0 +1,17 @@
+from django.apps import AppConfig
+
+from paperless_remote.signals import remote_consumer_declaration
+
+
+class PaperlessRemoteParserConfig(AppConfig):
+    """App config that registers the remote parser with the consumer."""
+
+    name = "paperless_remote"
+
+    def ready(self):
+        """Connect the parser declaration to the consumer declaration signal."""
+        from documents.signals import document_consumer_declaration
+
+        document_consumer_declaration.connect(remote_consumer_declaration)
+
+        AppConfig.ready(self)
diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py
new file mode 100644
index 000000000..ce72ebcc8
--- /dev/null
+++ b/src/paperless_remote/checks.py
@@ -0,0 +1,16 @@
+from django.conf import settings
+from django.core.checks import Error
+from django.core.checks import register
+
+
+@register()
+def check_remote_parser_configured(app_configs, **kwargs):
+    """Return an Error if the Azure AI engine is selected without an endpoint."""
+    if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
+        return [
+            Error(
+                "Azure AI remote parser requires endpoint to be configured.",
+            ),
+        ]
+
+    return []
diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py
new file mode 100644
index 000000000..03b53793c
--- /dev/null
+++ b/src/paperless_remote/parsers.py
@@ -0,0 +1,87 @@
+from pathlib import Path
+
+from django.conf import settings
+
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class RemoteEngineConfig:
+    """Engine name and credentials for a remote OCR service."""
+
+    def __init__(
+        self,
+        engine: str | None,
+        api_key: str | None = None,
+        endpoint: str | None = None,
+    ):
+        self.engine = engine
+        self.api_key = api_key
+        self.endpoint = endpoint
+
+    def engine_is_valid(self) -> bool:
+        """Return True when a supported engine is fully configured."""
+        # Unset and empty values both count as missing, matching the
+        # truthiness test used by the system check.
+        valid = self.engine in ["azureai"] and bool(self.api_key)
+        if self.engine == "azureai":
+            valid = valid and bool(self.endpoint)
+        return valid
+
+
+class RemoteDocumentParser(RasterisedDocumentParser):
+    """
+    This parser uses a remote ocr engine to parse documents
+    """
+
+    logging_name = "paperless.parsing.remote"
+
+    def get_settings(self) -> RemoteEngineConfig:
+        """
+        Build the remote engine configuration from the Django settings
+        """
+        return RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+        )
+
+    def supported_mime_types(self):
+        """
+        Return the handled mime types, or an empty list when no valid
+        engine is configured
+        """
+        if self.settings.engine_is_valid():
+            return [
+                "application/pdf",
+                "image/png",
+                "image/jpeg",
+                "image/tiff",
+                "image/bmp",
+                "image/gif",
+                "image/webp",
+            ]
+        else:
+            return []
+
+    def azure_ai_vision_parse(
+        self,
+        file: Path,
+    ) -> str | None:
+        """
+        Placeholder for Azure AI Vision parsing, currently returns None
+        """
+        # TODO: Implement the Azure AI Vision API parsing logic
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        """
+        Set self.text from the configured remote engine, or to an empty
+        string when no valid engine is configured
+        """
+        if not self.settings.engine_is_valid():
+            self.log.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self.text = ""
+            return
+        elif self.settings.engine == "azureai":
+            self.text = self.azure_ai_vision_parse(document_path)
diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py
new file mode 100644
index 000000000..81955a479
--- /dev/null
+++ b/src/paperless_remote/signals.py
@@ -0,0 +1,21 @@
+def get_parser(*args, **kwargs):
+    """Create a RemoteDocumentParser, importing it lazily."""
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(*args, **kwargs)
+
+
+def get_supported_mime_types():
+    """Return the mime types a throwaway parser instance reports."""
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(None).supported_mime_types()
+
+
+def remote_consumer_declaration(sender, **kwargs):
+    """Describe the remote parser: factory, weight and supported mime types."""
+    return {
+        "parser": get_parser,
+        "weight": 5,
+        "mime_types": get_supported_mime_types(),
+    }
diff --git a/src/paperless_remote/tests/__init__.py b/src/paperless_remote/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/paperless_remote/tests/samples/simple-digital.pdf b/src/paperless_remote/tests/samples/simple-digital.pdf
new file mode 100644
index 000000000..e450de482
Binary files /dev/null and b/src/paperless_remote/tests/samples/simple-digital.pdf differ
diff --git a/src/paperless_remote/tests/test_checks.py b/src/paperless_remote/tests/test_checks.py
new file mode 100644
index 000000000..b153df224
--- /dev/null
+++ b/src/paperless_remote/tests/test_checks.py
@@ -0,0 +1,30 @@
+from django.test import TestCase
+from django.test import override_settings
+
+from paperless_remote import check_remote_parser_configured
+
+
+class TestChecks(TestCase):
+    @override_settings(REMOTE_OCR_ENGINE=None)
+    def test_no_engine(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
+
+    @override_settings(REMOTE_OCR_ENGINE="azureai")
+    @override_settings(REMOTE_OCR_API_KEY="somekey")
+    @override_settings(REMOTE_OCR_ENDPOINT=None)
+    def test_azure_no_endpoint(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "Azure AI remote parser requires endpoint to be configured.",
+            ),
+        )
+
+    @override_settings(REMOTE_OCR_ENGINE="something")
+    @override_settings(REMOTE_OCR_API_KEY="somekey")
+    def test_valid_configuration(self):
+        # Engines other than "azureai" are not subject to the endpoint check.
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py
new file mode 100644
index 000000000..160796fe0
--- /dev/null
+++ b/src/paperless_remote/tests/test_parser.py
@@ -0,0 +1,94 @@
+import sys
+import uuid
+from pathlib import Path
+from unittest import mock
+
+import pytest
+from django.test import TestCase
+from django.test import override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless_remote.parsers import RemoteDocumentParser
+
+
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+
+    def assertContainsStrings(self, content, strings):
+        # Asserts that all strings appear in content, in the given order.
+        indices = []
+        for s in strings:
+            if s in content:
+                indices.append(content.index(s))
+            else:
+                self.fail(f"'{s}' is not in '{content}'")
+        self.assertListEqual(indices, sorted(indices))
+
+    # NOTE(review): sys.version_info > (3, 10) is also true on every 3.10.x
+    # release, so this currently skips on 3.10 as well as 3.11+.
+    @pytest.mark.skipif(
+        sys.version_info > (3, 10),
+        reason="Fails on 3.11 only on CI, for some reason",
+    )  # TODO: investigate
+    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
+    def test_get_text_with_azure(self, mock_azure_client):
+        result = mock.Mock()
+        result.content = "This is a test document."
+        result.pages = [
+            mock.Mock(
+                width=100,
+                height=100,
+                words=[
+                    mock.Mock(
+                        content="This",
+                        polygon=[
+                            mock.Mock(x=0, y=0),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="is",
+                        polygon=[
+                            mock.Mock(x=10, y=10),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="a",
+                        polygon=[
+                            mock.Mock(x=20, y=20),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="test",
+                        polygon=[
+                            mock.Mock(x=30, y=30),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="document.",
+                        polygon=[
+                            mock.Mock(x=40, y=40),
+                        ],
+                    ),
+                ],
+            ),
+        ]
+
+        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = result
+
+        # "azureai" is the only engine name the parser accepts.
+        with override_settings(
+            REMOTE_OCR_ENGINE="azureai",
+            REMOTE_OCR_API_KEY="somekey",
+            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
+        ):
+            parser = RemoteDocumentParser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+            self.assertContainsStrings(
+                parser.text.strip(),
+                ["This is a test document."],
+            )