diff --git a/Pipfile b/Pipfile
index b5656439f..91e4abcc9 100644
--- a/Pipfile
+++ b/Pipfile
@@ -4,6 +4,7 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
+azure-ai-formrecognizer = "*"
 dateparser = "~=1.2"
 # WARNING: django does not use semver.
 # Only patch versions are guaranteed to not introduce breaking changes.
@@ -35,6 +36,7 @@ langdetect = "*"
 mysqlclient = "*"
 nltk = "*"
 ocrmypdf = "~=15.4"
+openai = "*"
 pathvalidate = "*"
 pdf2image = "*"
 psycopg2 = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index 2f81583c6..248c70bd6 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "7bc15a3bbd521f85a8cdcc85be8adf7c942acb53c6d461199d7f8b1ef63ac651"
+            "sha256": "3e824b6b9710b60ae118d2823d1f6e7a07040b2c00b2293155603d644a9d2607"
         },
         "pipfile-spec": 6,
         "requires": {},
@@ -46,6 +46,30 @@
             "markers": "python_version >= '3.7'",
             "version": "==4.0.3"
         },
+        "azure-ai-formrecognizer": {
+            "hashes": [
+                "sha256:064803e0885bbe0429d1d282fc400123a5fc7f3baebb7f6ce30456450c08085e",
+                "sha256:3ea6ab27536e05f7a52953c8884f9488b4015bfe8904c87a4b5a8961b0a73792"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.7'",
+            "version": "==3.3.2"
+        },
+        "azure-common": {
+            "hashes": [
+                "sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3",
+                "sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad"
+            ],
+            "version": "==1.1.28"
+        },
+        "azure-core": {
+            "hashes": [
+                "sha256:3dae7962aad109610e68c9a7abb31d79720e1d982ddf61363038d175a5025e89",
+                "sha256:6f3a7883ef184722f6bd997262eddaf80cfe7e5b3e0caaaf8db1695695893d35"
+            ],
+            "markers": "python_version >= '3.7'",
+            "version": "==1.30.0"
+        },
         "billiard": {
             "hashes": [
                 "sha256:07aa978b308f334ff8282bd4a746e681b3513db5c9a514cbdd810cbbdc19714d",
@@ -866,6 +890,13 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==0.3.5"
         },
+        "isodate": {
+            "hashes": [
+                "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96",
+                "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"
+            ],
+            "version": "==0.6.1"
+        },
         "joblib": {
             "hashes": [
                 "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1",
@@ -1067,6 +1098,14 @@
             "markers": "python_version >= '3.8'",
             "version": "==1.0.8"
         },
+        "msrest": {
+            "hashes": [
+                "sha256:21120a810e1233e5e6cc7fe40b474eeb4ec6f757a15d7cf86702c369f9567c32",
+                "sha256:6e7661f46f3afd88b75667b7187a92829924446c7ea1d169be8c4bb7eeb788b9"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==0.7.1"
+        },
         "mysqlclient": {
             "hashes": [
                 "sha256:329e4eec086a2336fe3541f1ce095d87a6f169d1cc8ba7b04ac68bcb234c9711",
@@ -1151,6 +1190,15 @@
             "markers": "python_version >= '3.9'",
             "version": "==15.4.4"
         },
+        "openai": {
+            "hashes": [
+                "sha256:99c5d257d09ea6533d689d1cc77caa0ac679fa21efef8893d8b0832a86877f1b",
+                "sha256:a54002c814e05222e413664f651b5916714e4700d041d5cf5724d3ae1a3e3481"
+            ],
+            "index": "pypi",
+            "markers": "python_full_version >= '3.7.1'",
+            "version": "==1.12.0"
+        },
         "packaging": {
             "hashes": [
                 "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5",
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 77adb6bbf..98121cb15 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -297,6 +297,7 @@ INSTALLED_APPS = [
     "paperless_tesseract.apps.PaperlessTesseractConfig",
     "paperless_text.apps.PaperlessTextConfig",
     "paperless_mail.apps.PaperlessMailConfig",
+    "paperless_remote.apps.PaperlessRemoteParserConfig",
     "django.contrib.admin",
     "rest_framework",
     "rest_framework.authtoken",
@@ -1149,3 +1150,11 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] "
 if DEBUG:  # pragma: no cover
     EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
     EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
+
+###############################################################################
+# Remote Parser                                                               #
+###############################################################################
+
+REMOTE_PARSER_ENGINE = os.getenv("PAPERLESS_REMOTE_PARSER_ENGINE")
+REMOTE_PARSER_API_KEY = os.getenv("PAPERLESS_REMOTE_PARSER_API_KEY")
+REMOTE_PARSER_ENDPOINT = os.getenv("PAPERLESS_REMOTE_PARSER_ENDPOINT")
diff --git a/src/paperless_remote/__init__.py b/src/paperless_remote/__init__.py
new file mode 100644
index 000000000..5380ea5ac
--- /dev/null
+++ b/src/paperless_remote/__init__.py
@@ -0,0 +1,4 @@
+# this is here so that django finds the checks.
+from paperless_remote.checks import check_remote_parser_configured
+
+__all__ = ["check_remote_parser_configured"]
diff --git a/src/paperless_remote/apps.py b/src/paperless_remote/apps.py
new file mode 100644
index 000000000..8cd3199f9
--- /dev/null
+++ b/src/paperless_remote/apps.py
@@ -0,0 +1,14 @@
+from django.apps import AppConfig
+
+from paperless_remote.signals import remote_consumer_declaration
+
+
+class PaperlessRemoteParserConfig(AppConfig):
+    name = "paperless_remote"
+
+    def ready(self):
+        from documents.signals import document_consumer_declaration
+
+        document_consumer_declaration.connect(remote_consumer_declaration)
+
+        AppConfig.ready(self)
diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py
new file mode 100644
index 000000000..39ba4d305
--- /dev/null
+++ b/src/paperless_remote/checks.py
@@ -0,0 +1,25 @@
+from django.conf import settings
+from django.core.checks import Error
+from django.core.checks import register
+
+
+@register()
+def check_remote_parser_configured(app_configs, **kwargs):
+    if settings.REMOTE_PARSER_ENGINE and not settings.REMOTE_PARSER_API_KEY:
+        return [
+            Error(
+                "No remote engine API key is configured.",
+            ),
+        ]
+
+    if (
+        settings.REMOTE_PARSER_ENGINE == "azureaivision"
+        and not settings.REMOTE_PARSER_ENDPOINT
+    ):
+        return [
+            Error(
+                "Azure remote parser requires endpoint to be configured.",
+            ),
+        ]
+
+    return []
diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py
new file mode 100644
index 000000000..852d15d1e
--- /dev/null
+++ b/src/paperless_remote/parsers.py
@@ -0,0 +1,178 @@
+from pathlib import Path
+from typing import Optional
+
+from django.conf import settings
+
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class RemoteEngineConfig:
+    """
+    Holds the remote OCR engine name and the credentials used to reach it
+    """
+
+    def __init__(self, engine: str, api_key: str, endpoint: Optional[str] = None):
+        self.engine = engine
+        self.api_key = api_key
+        self.endpoint = endpoint
+
+    def engine_is_valid(self):
+        """
+        Returns True when the engine is a known one and has an API key and,
+        for Azure, also an endpoint
+        """
+        valid = self.engine in ["chatgpt", "azureaivision"] and self.api_key is not None
+        if self.engine == "azureaivision":
+            valid = valid and self.endpoint is not None
+        return valid
+
+
+class RemoteDocumentParser(RasterisedDocumentParser):
+    """
+    This parser uses a remote ocr engine to parse documents
+    """
+
+    logging_name = "paperless.parsing.remote"
+
+    def get_settings(self) -> RemoteEngineConfig:
+        """
+        This parser uses the remote engine configuration settings to parse documents
+        """
+        return RemoteEngineConfig(
+            engine=settings.REMOTE_PARSER_ENGINE,
+            api_key=settings.REMOTE_PARSER_API_KEY,
+            endpoint=settings.REMOTE_PARSER_ENDPOINT,
+        )
+
+    def supported_mime_types(self):
+        """
+        Returns the mime types this parser accepts, which is none at all
+        unless a valid remote engine is configured
+        """
+        if self.settings.engine_is_valid():
+            return [
+                "application/pdf",
+                "image/png",
+                "image/jpeg",
+                "image/tiff",
+                "image/bmp",
+                "image/gif",
+                "image/webp",
+            ]
+        else:
+            return []
+
+    def chatgpt_parse(
+        self,
+        file: Path,
+    ) -> Optional[str]:
+        """
+        Uploads the file to OpenAI, asks an assistant for its text and
+        returns that text, or None if the run did not complete. The text
+        is also stored in self.text. The upload is always deleted again.
+
+        NOTE: experimental, not verified against the live API
+        """
+        import time
+
+        from openai import OpenAI
+
+        assistant_name = "Paperless-ngx Document Parser"
+        client = OpenAI(
+            api_key=self.settings.api_key,
+        )
+        # Reuse the assistant of an earlier run. The None default also
+        # covers an account that has no assistants at all.
+        assistants = client.beta.assistants.list()
+        assistant = next(
+            (a for a in assistants.data if a.name == assistant_name),
+            None,
+        )
+        if assistant is None:
+            assistant = client.beta.assistants.create(
+                model="gpt-3.5-turbo",
+                tools=[{"type": "code_interpreter"}],
+                name=assistant_name,
+            )
+
+        gpt_file = client.files.create(file=file, purpose="assistants")
+        try:
+            client.files.wait_for_processing(gpt_file.id)
+            client.beta.assistants.update(
+                assistant_id=assistant.id,
+                file_ids=[gpt_file.id],
+            )
+            thread = client.beta.threads.create()
+            client.beta.threads.messages.create(
+                thread_id=thread.id,
+                role="user",
+                content="Output the text of the file",
+            )
+            run = client.beta.threads.runs.create(
+                thread_id=thread.id,
+                assistant_id=assistant.id,
+            )
+            # Runs are asynchronous, the reply only exists once it is done
+            while run.status in ("queued", "in_progress"):
+                time.sleep(1)
+                run = client.beta.threads.runs.retrieve(
+                    thread_id=thread.id,
+                    run_id=run.id,
+                )
+            if run.status != "completed":
+                self.log.warning(
+                    f"ChatGPT run ended with status '{run.status}', "
+                    "content will be empty.",
+                )
+                return None
+            response = client.beta.threads.messages.list(
+                thread_id=thread.id,
+            )
+            # Messages are listed newest first, the reply is the first one
+            self.text = response.data[0].content[0].text.value
+            return self.text
+        finally:
+            # Do not leave the uploaded document behind, even on errors
+            client.files.delete(gpt_file.id)
+
+    def azure_ai_vision_parse(
+        self,
+        file: Path,
+    ) -> Optional[str]:
+        """
+        Sends the file to Azure's "prebuilt-layout" model and returns the
+        content of the analysis result
+        """
+        from azure.ai.formrecognizer import DocumentAnalysisClient
+        from azure.core.credentials import AzureKeyCredential
+
+        credential = AzureKeyCredential(self.settings.api_key)
+        document_analysis_client = DocumentAnalysisClient(
+            endpoint=self.settings.endpoint,
+            credential=credential,
+        )
+
+        with open(file, "rb") as f:
+            poller = document_analysis_client.begin_analyze_document(
+                "prebuilt-layout",
+                document=f,
+            )
+            result = poller.result()
+
+        return result.content
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        """
+        Sets self.text to whatever the configured remote engine returns for
+        the document, or to an empty string without a valid engine
+        """
+        if not self.settings.engine_is_valid():
+            self.log.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self.text = ""
+            return
+        elif self.settings.engine == "chatgpt":
+            self.text = self.chatgpt_parse(document_path)
+        elif self.settings.engine == "azureaivision":
+            self.text = self.azure_ai_vision_parse(document_path)
diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py
new file mode 100644
index 000000000..81955a479
--- /dev/null
+++ b/src/paperless_remote/signals.py
@@ -0,0 +1,18 @@
+def get_parser(*args, **kwargs):
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(*args, **kwargs)
+
+
+def get_supported_mime_types():
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(None).supported_mime_types()
+
+
+def remote_consumer_declaration(sender, **kwargs):
+    return {
+        "parser": get_parser,
+        "weight": 5,
+        "mime_types": get_supported_mime_types(),
+    }
diff --git a/src/paperless_remote/tests/__init__.py b/src/paperless_remote/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/paperless_remote/tests/samples/simple-digital.pdf b/src/paperless_remote/tests/samples/simple-digital.pdf
new file mode 100644
index 000000000..e450de482
Binary files /dev/null and b/src/paperless_remote/tests/samples/simple-digital.pdf differ
diff --git a/src/paperless_remote/tests/test_checks.py b/src/paperless_remote/tests/test_checks.py
new file mode 100644
index 000000000..3810f1dcd
--- /dev/null
+++ b/src/paperless_remote/tests/test_checks.py
@@ -0,0 +1,40 @@
+from django.test import TestCase
+from django.test import override_settings
+
+from paperless_remote import check_remote_parser_configured
+
+
+class TestChecks(TestCase):
+    @override_settings(REMOTE_PARSER_ENGINE=None)
+    def test_no_engine(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
+
+    @override_settings(REMOTE_PARSER_ENGINE="something")
+    @override_settings(REMOTE_PARSER_API_KEY=None)
+    def test_no_api_key(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "No remote engine API key is configured.",
+            ),
+        )
+
+    @override_settings(REMOTE_PARSER_ENGINE="azureaivision")
+    @override_settings(REMOTE_PARSER_API_KEY="somekey")
+    @override_settings(REMOTE_PARSER_ENDPOINT=None)
+    def test_azure_no_endpoint(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "Azure remote parser requires endpoint to be configured.",
+            ),
+        )
+
+    @override_settings(REMOTE_PARSER_ENGINE="something")
+    @override_settings(REMOTE_PARSER_API_KEY="somekey")
+    def test_valid_configuration(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py
new file mode 100644
index 000000000..3706b20e3
--- /dev/null
+++ b/src/paperless_remote/tests/test_parser.py
@@ -0,0 +1,48 @@
+import uuid
+from pathlib import Path
+from unittest import mock
+
+from django.test import TestCase
+from django.test import override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless_remote.parsers import RemoteDocumentParser
+
+
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+
+    def assertContainsStrings(self, content, strings):
+        # Asserts that all strings appear in content, in the given order.
+        indices = []
+        for s in strings:
+            if s in content:
+                indices.append(content.index(s))
+            else:
+                self.fail(f"'{s}' is not in '{content}'")
+        self.assertListEqual(indices, sorted(indices))
+
+    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document")
+    def test_get_text_with_azure(self, mock_begin_analyze_document):
+        result = mock.Mock()
+        result.content = "This is a test document."
+        mock_begin_analyze_document.return_value.result.return_value = result
+
+        with override_settings(
+            REMOTE_PARSER_ENGINE="azureaivision",
+            REMOTE_PARSER_API_KEY="somekey",
+            REMOTE_PARSER_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
+        ):
+            parser = RemoteDocumentParser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+        mock_begin_analyze_document.assert_called_once()
+
+        self.assertContainsStrings(
+            parser.text.strip(),
+            ["This is a test document."],
+        )