mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Oh wow this works for azure, not chatgpt
This commit is contained in:
parent
3aeb45bf34
commit
eacafbcb36
2
Pipfile
2
Pipfile
@ -4,6 +4,7 @@ verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
azure-ai-formrecognizer = "*"
|
||||
dateparser = "~=1.2"
|
||||
# WARNING: django does not use semver.
|
||||
# Only patch versions are guaranteed to not introduce breaking changes.
|
||||
@ -35,6 +36,7 @@ langdetect = "*"
|
||||
mysqlclient = "*"
|
||||
nltk = "*"
|
||||
ocrmypdf = "~=15.4"
|
||||
openai = "*"
|
||||
pathvalidate = "*"
|
||||
pdf2image = "*"
|
||||
psycopg2 = "*"
|
||||
|
50
Pipfile.lock
generated
50
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "7bc15a3bbd521f85a8cdcc85be8adf7c942acb53c6d461199d7f8b1ef63ac651"
|
||||
"sha256": "3e824b6b9710b60ae118d2823d1f6e7a07040b2c00b2293155603d644a9d2607"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {},
|
||||
@ -46,6 +46,30 @@
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==4.0.3"
|
||||
},
|
||||
"azure-ai-formrecognizer": {
|
||||
"hashes": [
|
||||
"sha256:064803e0885bbe0429d1d282fc400123a5fc7f3baebb7f6ce30456450c08085e",
|
||||
"sha256:3ea6ab27536e05f7a52953c8884f9488b4015bfe8904c87a4b5a8961b0a73792"
|
||||
],
|
||||
"index": "pypi",
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==3.3.2"
|
||||
},
|
||||
"azure-common": {
|
||||
"hashes": [
|
||||
"sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3",
|
||||
"sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad"
|
||||
],
|
||||
"version": "==1.1.28"
|
||||
},
|
||||
"azure-core": {
|
||||
"hashes": [
|
||||
"sha256:3dae7962aad109610e68c9a7abb31d79720e1d982ddf61363038d175a5025e89",
|
||||
"sha256:6f3a7883ef184722f6bd997262eddaf80cfe7e5b3e0caaaf8db1695695893d35"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.30.0"
|
||||
},
|
||||
"billiard": {
|
||||
"hashes": [
|
||||
"sha256:07aa978b308f334ff8282bd4a746e681b3513db5c9a514cbdd810cbbdc19714d",
|
||||
@ -866,6 +890,13 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.3.5"
|
||||
},
|
||||
"isodate": {
|
||||
"hashes": [
|
||||
"sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96",
|
||||
"sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"joblib": {
|
||||
"hashes": [
|
||||
"sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1",
|
||||
@ -1067,6 +1098,14 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==1.0.8"
|
||||
},
|
||||
"msrest": {
|
||||
"hashes": [
|
||||
"sha256:21120a810e1233e5e6cc7fe40b474eeb4ec6f757a15d7cf86702c369f9567c32",
|
||||
"sha256:6e7661f46f3afd88b75667b7187a92829924446c7ea1d169be8c4bb7eeb788b9"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==0.7.1"
|
||||
},
|
||||
"mysqlclient": {
|
||||
"hashes": [
|
||||
"sha256:329e4eec086a2336fe3541f1ce095d87a6f169d1cc8ba7b04ac68bcb234c9711",
|
||||
@ -1151,6 +1190,15 @@
|
||||
"markers": "python_version >= '3.9'",
|
||||
"version": "==15.4.4"
|
||||
},
|
||||
"openai": {
|
||||
"hashes": [
|
||||
"sha256:99c5d257d09ea6533d689d1cc77caa0ac679fa21efef8893d8b0832a86877f1b",
|
||||
"sha256:a54002c814e05222e413664f651b5916714e4700d041d5cf5724d3ae1a3e3481"
|
||||
],
|
||||
"index": "pypi",
|
||||
"markers": "python_full_version >= '3.7.1'",
|
||||
"version": "==1.12.0"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5",
|
||||
|
@ -297,6 +297,7 @@ INSTALLED_APPS = [
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
||||
"django.contrib.admin",
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
@ -1149,3 +1150,11 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] "
|
||||
if DEBUG: # pragma: no cover
|
||||
EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
|
||||
EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
|
||||
|
||||
###############################################################################
|
||||
# Remote Parser #
|
||||
###############################################################################
|
||||
|
||||
# Name of the remote engine to use. os.getenv() yields None when the variable
# is unset; the remote parser recognises "chatgpt" and "azureaivision".
REMOTE_PARSER_ENGINE = os.getenv("PAPERLESS_REMOTE_PARSER_ENGINE")
|
||||
# API key for the selected engine. The paperless_remote system check reports
# an error when an engine is set without a key.
REMOTE_PARSER_API_KEY = os.getenv("PAPERLESS_REMOTE_PARSER_API_KEY")
|
||||
# Service endpoint. The paperless_remote system check requires it when the
# engine is "azureaivision".
REMOTE_PARSER_ENDPOINT = os.getenv("PAPERLESS_REMOTE_PARSER_ENDPOINT")
|
||||
|
4
src/paperless_remote/__init__.py
Normal file
4
src/paperless_remote/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# this is here so that django finds the checks.
|
||||
from paperless_remote.checks import check_remote_parser_configured
|
||||
|
||||
__all__ = ["check_remote_parser_configured"]
|
14
src/paperless_remote/apps.py
Normal file
14
src/paperless_remote/apps.py
Normal file
@ -0,0 +1,14 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_remote.signals import remote_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessRemoteParserConfig(AppConfig):
    """
    Django app configuration for the paperless_remote app
    """

    name = "paperless_remote"

    def ready(self):
        """
        Connect the remote parser declaration to the consumer declaration
        signal, then run the default AppConfig.ready()
        """
        # Deferred import: documents.signals is only loaded once the app
        # registry calls ready().
        from documents import signals as document_signals

        document_signals.document_consumer_declaration.connect(
            remote_consumer_declaration,
        )
        super().ready()
|
25
src/paperless_remote/checks.py
Normal file
25
src/paperless_remote/checks.py
Normal file
@ -0,0 +1,25 @@
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import register
|
||||
|
||||
|
||||
@register()
def check_remote_parser_configured(app_configs, **kwargs):
    """
    System check for the remote parser settings

    Returns a one-element list holding an Error when an engine is set
    without an API key, or when the "azureaivision" engine is set without an
    endpoint. Returns an empty list otherwise. Only the first problem found
    is reported.
    """
    engine = settings.REMOTE_PARSER_ENGINE

    # Every engine needs an API key.
    if engine and not settings.REMOTE_PARSER_API_KEY:
        return [Error("No remote engine API key is configured.")]

    # The Azure engine additionally needs an endpoint.
    if engine == "azureaivision" and not settings.REMOTE_PARSER_ENDPOINT:
        return [Error("Azure remote parser requires endpoint to be configured.")]

    return []
|
126
src/paperless_remote/parsers.py
Normal file
126
src/paperless_remote/parsers.py
Normal file
@ -0,0 +1,126 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class RemoteEngineConfig:
    """
    Connection settings for a remote OCR engine

    engine is the engine identifier ("chatgpt" and "azureaivision" are the
    recognised values), api_key is the credential for that engine and
    endpoint is the service URL, which only "azureaivision" requires.
    """

    def __init__(self, engine: str, api_key: str, endpoint: Optional[str] = None):
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

    def engine_is_valid(self) -> bool:
        """
        Return True when the engine is recognised and fully configured

        Empty strings count as missing, the same way None does: an
        environment variable that is set but empty yields "" rather than
        None, and such a value cannot authenticate against any service.
        """
        if self.engine not in ("chatgpt", "azureaivision"):
            return False
        if not self.api_key:
            return False
        if self.engine == "azureaivision":
            # Only the Azure engine needs an endpoint.
            return bool(self.endpoint)
        return True
|
||||
|
||||
|
||||
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote OCR engine to parse documents
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the remote engine configuration from the REMOTE_PARSER_*
        Django settings
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_PARSER_ENGINE,
            api_key=settings.REMOTE_PARSER_API_KEY,
            endpoint=settings.REMOTE_PARSER_ENDPOINT,
        )

    def supported_mime_types(self):
        """
        Return the mime types this parser handles, or an empty list when no
        valid remote engine is configured
        """
        if not self.settings.engine_is_valid():
            return []
        return [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ]

    def chatgpt_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Upload the file to OpenAI and ask an assistant to output its text

        Returns the text of the newest message in the thread once the run has
        completed, or None when the run ends in any other state. The uploaded
        file is deleted again in every case.
        """
        import time

        from openai import OpenAI

        assistant_name = "Paperless-ngx Document Parser"
        client = OpenAI(
            api_key=self.settings.api_key,
        )

        # Reuse the assistant from an earlier run if there is one. Falling
        # back to None (instead of relying on the loop variable) keeps this
        # correct for an empty list and for a list without a match.
        assistant = next(
            (
                candidate
                for candidate in client.beta.assistants.list().data
                if candidate.name == assistant_name
            ),
            None,
        )
        if assistant is None:
            assistant = client.beta.assistants.create(
                model="gpt-3.5-turbo",
                tools=[{"type": "code_interpreter"}],
                name=assistant_name,
            )

        gpt_file = client.files.create(file=file, purpose="assistants")
        try:
            client.files.wait_for_processing(gpt_file.id)
            # NOTE(review): the openai 1.x beta assistants API names this
            # parameter `file_ids` - confirm against the pinned version.
            client.beta.assistants.update(
                assistant_id=assistant.id,
                file_ids=[gpt_file.id],
            )
            thread = client.beta.threads.create()
            client.beta.threads.messages.create(
                thread_id=thread.id,
                role="user",
                content="Output the text of the file",
            )
            run = client.beta.threads.runs.create(
                thread_id=thread.id,
                assistant_id=assistant.id,
            )
            # The run is asynchronous: until it reaches a terminal state the
            # thread only holds the prompt sent above.
            while run.status in ("queued", "in_progress", "cancelling"):
                time.sleep(1)
                run = client.beta.threads.runs.retrieve(
                    run_id=run.id,
                    thread_id=thread.id,
                )
            if run.status != "completed":
                self.log.warning(
                    f"Remote parser run ended with status {run.status}.",
                )
                return None

            response = client.beta.threads.messages.list(
                thread_id=thread.id,
            )
            # Return the text so that parse() can store it; setting self.text
            # here would be overwritten by the caller's assignment.
            return response.data[0].content[0].text.value
        finally:
            # Never leave the uploaded document behind, even on failure.
            client.files.delete(gpt_file.id)

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Send the file to Azure for analysis with the "prebuilt-layout" model
        and return the content of the result
        """
        from azure.ai.formrecognizer import DocumentAnalysisClient
        from azure.core.credentials import AzureKeyCredential

        credential = AzureKeyCredential(self.settings.api_key)
        document_analysis_client = DocumentAnalysisClient(
            endpoint=self.settings.endpoint,
            credential=credential,
        )

        with open(file, "rb") as f:
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-layout",
                document=f,
            )
            # Wait for the result while the file handle is still open.
            result = poller.result()

        return result.content

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Extract the document text with the configured remote engine and store
        it in self.text

        When no valid engine is configured a warning is logged and the text
        is set to an empty string.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return

        if self.settings.engine == "chatgpt":
            self.text = self.chatgpt_parse(document_path)
        elif self.settings.engine == "azureaivision":
            self.text = self.azure_ai_vision_parse(document_path)
|
18
src/paperless_remote/signals.py
Normal file
18
src/paperless_remote/signals.py
Normal file
@ -0,0 +1,18 @@
|
||||
def get_parser(*args, **kwargs):
    """
    Create a RemoteDocumentParser, forwarding all arguments to it
    """
    # Deferred import: the parser module is only loaded when a parser is
    # actually requested.
    from paperless_remote import parsers

    return parsers.RemoteDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def get_supported_mime_types():
    """
    Return the mime types reported by a RemoteDocumentParser instance
    """
    from paperless_remote import parsers

    # supported_mime_types() is an instance method, so a throwaway parser is
    # built with None as its only argument.
    probe = parsers.RemoteDocumentParser(None)
    return probe.supported_mime_types()
|
||||
|
||||
|
||||
def remote_consumer_declaration(sender, **kwargs):
    """
    Signal receiver that describes the remote parser: its factory, its
    weight and the mime types it currently supports
    """
    declaration = {"parser": get_parser, "weight": 5}
    # Evaluated on every call, so the list follows the current settings.
    declaration["mime_types"] = get_supported_mime_types()
    return declaration
|
0
src/paperless_remote/tests/__init__.py
Normal file
0
src/paperless_remote/tests/__init__.py
Normal file
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
Binary file not shown.
40
src/paperless_remote/tests/test_checks.py
Normal file
40
src/paperless_remote/tests/test_checks.py
Normal file
@ -0,0 +1,40 @@
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless_remote import check_remote_parser_configured
|
||||
|
||||
|
||||
class TestChecks(TestCase):
    """
    Tests for the remote parser system check
    """

    @override_settings(REMOTE_PARSER_ENGINE=None)
    def test_no_engine(self):
        """No engine configured: nothing to complain about"""
        self.assertEqual(len(check_remote_parser_configured(None)), 0)

    @override_settings(
        REMOTE_PARSER_ENGINE="something",
        REMOTE_PARSER_API_KEY=None,
    )
    def test_no_api_key(self):
        """An engine without an API key yields exactly one error"""
        errors = check_remote_parser_configured(None)
        self.assertEqual(len(errors), 1)
        expected = "No remote engine API key is configured."
        self.assertTrue(errors[0].msg.startswith(expected))

    @override_settings(
        REMOTE_PARSER_ENGINE="azureaivision",
        REMOTE_PARSER_API_KEY="somekey",
        REMOTE_PARSER_ENDPOINT=None,
    )
    def test_azure_no_endpoint(self):
        """The Azure engine without an endpoint yields exactly one error"""
        errors = check_remote_parser_configured(None)
        self.assertEqual(len(errors), 1)
        expected = "Azure remote parser requires endpoint to be configured."
        self.assertTrue(errors[0].msg.startswith(expected))

    @override_settings(
        REMOTE_PARSER_ENGINE="something",
        REMOTE_PARSER_API_KEY="somekey",
    )
    def test_valid_configuration(self):
        """An engine together with an API key passes the check"""
        self.assertEqual(len(check_remote_parser_configured(None)), 0)
|
48
src/paperless_remote/tests/test_parser.py
Normal file
48
src/paperless_remote/tests/test_parser.py
Normal file
@ -0,0 +1,48 @@
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """
    Tests for RemoteDocumentParser
    """

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        """
        Assert that every string occurs in content, in the given order
        """
        positions = []
        for needle in strings:
            if needle not in content:
                self.fail(f"'{needle}' is not in '{content}'")
            positions.append(content.index(needle))
        # First occurrences must already be in ascending order.
        self.assertListEqual(positions, sorted(positions))

    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document")
    def test_get_text_with_azure(self, mock_begin_analyze_document):
        """
        The Azure engine stores the content of the analysis result as text
        """
        # The patched call returns a poller whose result() carries the text.
        analysis = mock.Mock(content="This is a test document.")
        poller = mock_begin_analyze_document.return_value
        poller.result.return_value = analysis

        azure_settings = {
            "REMOTE_PARSER_ENGINE": "azureaivision",
            "REMOTE_PARSER_API_KEY": "somekey",
            "REMOTE_PARSER_ENDPOINT": "https://endpoint.cognitiveservices.azure.com/",
        }
        with override_settings(**azure_settings):
            parser = RemoteDocumentParser(uuid.uuid4())
            sample = self.SAMPLE_FILES / "simple-digital.pdf"
            parser.parse(sample, "application/pdf")

        mock_begin_analyze_document.assert_called_once()

        self.assertContainsStrings(
            parser.text.strip(),
            ["This is a test document."],
        )
|
Loading…
x
Reference in New Issue
Block a user