From eacafbcb364bb9d2487efcbd100a5a0e4d7a75f9 Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Mon, 26 Feb 2024 15:45:54 -0800 Subject: [PATCH] Oh wow this works for azure, not chatgpt --- Pipfile | 2 + Pipfile.lock | 50 ++++++- src/paperless/settings.py | 9 ++ src/paperless_remote/__init__.py | 4 + src/paperless_remote/apps.py | 14 ++ src/paperless_remote/checks.py | 25 ++++ src/paperless_remote/parsers.py | 126 ++++++++++++++++++ src/paperless_remote/signals.py | 18 +++ src/paperless_remote/tests/__init__.py | 0 .../tests/samples/simple-digital.pdf | Bin 0 -> 22926 bytes src/paperless_remote/tests/test_checks.py | 40 ++++++ src/paperless_remote/tests/test_parser.py | 48 +++++++ 12 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 src/paperless_remote/__init__.py create mode 100644 src/paperless_remote/apps.py create mode 100644 src/paperless_remote/checks.py create mode 100644 src/paperless_remote/parsers.py create mode 100644 src/paperless_remote/signals.py create mode 100644 src/paperless_remote/tests/__init__.py create mode 100644 src/paperless_remote/tests/samples/simple-digital.pdf create mode 100644 src/paperless_remote/tests/test_checks.py create mode 100644 src/paperless_remote/tests/test_parser.py diff --git a/Pipfile b/Pipfile index b5656439f..91e4abcc9 100644 --- a/Pipfile +++ b/Pipfile @@ -4,6 +4,7 @@ verify_ssl = true name = "pypi" [packages] +azure-ai-formrecognizer = "*" dateparser = "~=1.2" # WARNING: django does not use semver. # Only patch versions are guaranteed to not introduce breaking changes. @@ -35,6 +36,7 @@ langdetect = "*" mysqlclient = "*" nltk = "*" ocrmypdf = "~=15.4" +openai = "*" pathvalidate = "*" pdf2image = "*" psycopg2 = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 2f81583c6..248c70bd6 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "7bc15a3bbd521f85a8cdcc85be8adf7c942acb53c6d461199d7f8b1ef63ac651" + "sha256": "3e824b6b9710b60ae118d2823d1f6e7a07040b2c00b2293155603d644a9d2607" }, "pipfile-spec": 6, "requires": {}, @@ -46,6 +46,30 @@ "markers": "python_version >= '3.7'", "version": "==4.0.3" }, + "azure-ai-formrecognizer": { + "hashes": [ + "sha256:064803e0885bbe0429d1d282fc400123a5fc7f3baebb7f6ce30456450c08085e", + "sha256:3ea6ab27536e05f7a52953c8884f9488b4015bfe8904c87a4b5a8961b0a73792" + ], + "index": "pypi", + "markers": "python_version >= '3.7'", + "version": "==3.3.2" + }, + "azure-common": { + "hashes": [ + "sha256:4ac0cd3214e36b6a1b6a442686722a5d8cc449603aa833f3f0f40bda836704a3", + "sha256:5c12d3dcf4ec20599ca6b0d3e09e86e146353d443e7fcc050c9a19c1f9df20ad" + ], + "version": "==1.1.28" + }, + "azure-core": { + "hashes": [ + "sha256:3dae7962aad109610e68c9a7abb31d79720e1d982ddf61363038d175a5025e89", + "sha256:6f3a7883ef184722f6bd997262eddaf80cfe7e5b3e0caaaf8db1695695893d35" + ], + "markers": "python_version >= '3.7'", + "version": "==1.30.0" + }, "billiard": { "hashes": [ "sha256:07aa978b308f334ff8282bd4a746e681b3513db5c9a514cbdd810cbbdc19714d", @@ -866,6 +890,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==0.3.5" }, + "isodate": { + "hashes": [ + "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96", + "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9" + ], + "version": "==0.6.1" + }, "joblib": { "hashes": [ "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1", @@ -1067,6 +1098,14 @@ "markers": "python_version >= '3.8'", "version": "==1.0.8" }, + "msrest": { + "hashes": [ + "sha256:21120a810e1233e5e6cc7fe40b474eeb4ec6f757a15d7cf86702c369f9567c32", + "sha256:6e7661f46f3afd88b75667b7187a92829924446c7ea1d169be8c4bb7eeb788b9" + ], + "markers": "python_version >= '3.6'", + "version": "==0.7.1" + }, "mysqlclient": { "hashes": [ "sha256:329e4eec086a2336fe3541f1ce095d87a6f169d1cc8ba7b04ac68bcb234c9711", @@ -1151,6 +1190,15 @@ "markers": "python_version >= '3.9'", "version": "==15.4.4" }, + "openai": { + "hashes": [ + "sha256:99c5d257d09ea6533d689d1cc77caa0ac679fa21efef8893d8b0832a86877f1b", + "sha256:a54002c814e05222e413664f651b5916714e4700d041d5cf5724d3ae1a3e3481" + ], + "index": "pypi", + "markers": "python_full_version >= '3.7.1'", + "version": "==1.12.0" + }, "packaging": { "hashes": [ "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5", diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 77adb6bbf..98121cb15 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -297,6 +297,7 @@ INSTALLED_APPS = [ "paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_text.apps.PaperlessTextConfig", "paperless_mail.apps.PaperlessMailConfig", + "paperless_remote.apps.PaperlessRemoteParserConfig", "django.contrib.admin", "rest_framework", "rest_framework.authtoken", @@ -1149,3 +1150,11 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] " if DEBUG: # pragma: no cover EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend" EMAIL_FILE_PATH = BASE_DIR / "sent_emails" + +############################################################################### +# Remote Parser # +############################################################################### + +REMOTE_PARSER_ENGINE = os.getenv("PAPERLESS_REMOTE_PARSER_ENGINE") +REMOTE_PARSER_API_KEY = os.getenv("PAPERLESS_REMOTE_PARSER_API_KEY") +REMOTE_PARSER_ENDPOINT = os.getenv("PAPERLESS_REMOTE_PARSER_ENDPOINT") diff --git a/src/paperless_remote/__init__.py b/src/paperless_remote/__init__.py new file mode 100644 index 000000000..5380ea5ac --- /dev/null +++ b/src/paperless_remote/__init__.py @@ -0,0 +1,4 @@ +# this is here so that django finds the checks. +from paperless_remote.checks import check_remote_parser_configured + +__all__ = ["check_remote_parser_configured"] diff --git a/src/paperless_remote/apps.py b/src/paperless_remote/apps.py new file mode 100644 index 000000000..8cd3199f9 --- /dev/null +++ b/src/paperless_remote/apps.py @@ -0,0 +1,14 @@ +from django.apps import AppConfig + +from paperless_remote.signals import remote_consumer_declaration + + +class PaperlessRemoteParserConfig(AppConfig): + name = "paperless_remote" + + def ready(self): + from documents.signals import document_consumer_declaration + + document_consumer_declaration.connect(remote_consumer_declaration) + + AppConfig.ready(self) diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py new file mode 100644 index 000000000..39ba4d305 --- /dev/null +++ b/src/paperless_remote/checks.py @@ -0,0 +1,25 @@ +from django.conf import settings +from django.core.checks import Error +from django.core.checks import register + + +@register() +def check_remote_parser_configured(app_configs, **kwargs): + if settings.REMOTE_PARSER_ENGINE and not settings.REMOTE_PARSER_API_KEY: + return [ + Error( + "No remote engine API key is configured.", + ), + ] + + if ( + settings.REMOTE_PARSER_ENGINE == "azureaivision" + and not settings.REMOTE_PARSER_ENDPOINT + ): + return [ + Error( + "Azure remote parser requires endpoint to be configured.", + ), + ] + + return [] diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py new file mode 100644 index 000000000..852d15d1e --- /dev/null +++ b/src/paperless_remote/parsers.py @@ -0,0 +1,126 @@ +from pathlib import Path +from typing import Optional + +from django.conf import settings + +from paperless_tesseract.parsers import RasterisedDocumentParser + + +class RemoteEngineConfig: + def __init__(self, engine: str, api_key: str, endpoint: Optional[str] = None): + self.engine = engine + self.api_key = api_key + self.endpoint = endpoint + + def engine_is_valid(self): + valid = self.engine in ["chatgpt", "azureaivision"] and self.api_key is not None + if self.engine == "azureaivision": + valid = valid and self.endpoint is not None + return valid + + +class RemoteDocumentParser(RasterisedDocumentParser): + """ + This parser uses a remote ocr engine to parse documents + """ + + logging_name = "paperless.parsing.remote" + + def get_settings(self) -> RemoteEngineConfig: + """ + This parser uses the OCR configuration settings to parse documents + """ + return RemoteEngineConfig( + engine=settings.REMOTE_PARSER_ENGINE, + api_key=settings.REMOTE_PARSER_API_KEY, + endpoint=settings.REMOTE_PARSER_ENDPOINT, + ) + + def supported_mime_types(self): + if self.settings.engine_is_valid(): + return [ + "application/pdf", + "image/png", + "image/jpeg", + "image/tiff", + "image/bmp", + "image/gif", + "image/webp", + ] + else: + return [] + + def chatgpt_parse( + self, + file: Path, + ) -> Optional[str]: + # does not work + from openai import OpenAI + + client = OpenAI( + api_key=self.settings.api_key, + ) + assistants = client.beta.assistants.list() + for assistant in assistants.data: + if assistant.name == "Paperless-ngx Document Parser": + assistant = assistant + break + if not assistant: + assistant = client.beta.assistants.create( + model="gpt-3.5-turbo", + tools=[{"type": "code_interpreter"}], + name="Paperless-ngx Document Parser", + ) + + gpt_file = client.files.create(file=file, purpose="assistants") + client.files.wait_for_processing(gpt_file.id) + client.beta.assistants.update(assistant_id=assistant.id, files=[gpt_file.id]) + thread = client.beta.threads.create() + client.beta.threads.messages.create( + thread_id=thread.id, + role="user", + content="Output the text of the file", + ) + client.beta.threads.runs.create( + thread_id=thread, + assistant_id=assistant.id, + ) + response = client.beta.threads.messages.list( + thread_id=thread.id, + ) + self.text = response.data[0].content[0].text.value + client.files.delete(gpt_file.id) + + def azure_ai_vision_parse( + self, + file: Path, + ) -> Optional[str]: + from azure.ai.formrecognizer import DocumentAnalysisClient + from azure.core.credentials import AzureKeyCredential + + credential = AzureKeyCredential(self.settings.api_key) + document_analysis_client = DocumentAnalysisClient( + endpoint=self.settings.endpoint, + credential=credential, + ) + + with open(file, "rb") as f: + poller = document_analysis_client.begin_analyze_document( + "prebuilt-layout", + document=f, + ) + result = poller.result() + + return result.content + + def parse(self, document_path: Path, mime_type, file_name=None): + if not self.settings.engine_is_valid(): + self.log.warning( + "No valid remote parser engine is configured, content will be empty.", + ) + self.text = "" + return + elif self.settings.engine == "chatgpt": + self.text = self.chatgpt_parse(document_path) + elif self.settings.engine == "azureaivision": + self.text = self.azure_ai_vision_parse(document_path) diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py new file mode 100644 index 000000000..81955a479 --- /dev/null +++ b/src/paperless_remote/signals.py @@ -0,0 +1,18 @@ +def get_parser(*args, **kwargs): + from paperless_remote.parsers import RemoteDocumentParser + + return RemoteDocumentParser(*args, **kwargs) + + +def get_supported_mime_types(): + from paperless_remote.parsers import RemoteDocumentParser + + return RemoteDocumentParser(None).supported_mime_types() + + +def remote_consumer_declaration(sender, **kwargs): + return { + "parser": get_parser, + "weight": 5, + "mime_types": get_supported_mime_types(), + } diff --git a/src/paperless_remote/tests/__init__.py b/src/paperless_remote/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/paperless_remote/tests/samples/simple-digital.pdf b/src/paperless_remote/tests/samples/simple-digital.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e450de48269ce43785b8344c63e233a1794abae6 GIT binary patch literal 22926 zcmeFZ1ymeg@;^!l!6mrE;Lb3(yL)iAVQ_bM2@U~*y9EgZNJ4OTf?IHc27(3mH{|=> z-S7V7z4P{+J?EYO***+?`*z*Bb*rj-daCNvG^&!)EFe~HWSZ{c?w0P)-Fe9D05*W5 znGLd_AW#wFVCiNB;DGk10i~_&+#oJMX**Llh$IB;Xbuq;Ms{^`ftcDOdu6l44>H?6H;@?p^vKWPs#P#ZGbM{;Bcf{BGr+ELKNlvYYRNcNgtX+7@aLXT zMQN!S?3XnMOGXzd6?Y;rsx^sOB+DXSS48V9%8C_*Nre0Ge09*fJ;tB)nym>uKSOw1 z2i!o0IGFz_FtqiwM&zfZJvBhw+)rnJ_woEU1@Qha3iwk&AOMJsm!0je>e%A*b|c=( zS#^}IgXn*zCa)u*nWXP(wA20EkL1j%VyEC?M)$S`K=_(QzmCRCLHrFiECjSQtYn`iKg zf}%nOaWK%_&+Ku&A#j>Q@-?@j>#2p9dZv4QKhun z=@em(Dge&env$D{x9Q_-*cI_>U>>Rgrg4#rb67eijW{P8;mu->2nuC92$yD~)|^om zof)g{JNi%po%qS2uXL^$$;LVc720v6ksjPB{pbm!yHQ(d{s&oogF>puBi3^YH8K8~ ztf=^&Z>QNYfr%PP%}Ba_X=avrD9bVAkH*pka_wzWhja;v5}TSXTYZnCH!OGA` z3&Wr_z7-7B5)oa9ALHmvT?5AkgZZZC23wcJ>T-OElbRVKU0r;Baq_`7Pq-kT3Z{JZ znzD>tk*w~s6hTUEMXXn7y`Gwr?fjkxs;nIJ_~l-gs<>$-h<Ro53Nw-;(BpU?f_z^C3`oT3wrR`6@gqyKrECgMXzc67xJHqs zAT-Dx8^>$LdmKT)E37b`Q9HMosc9RLS$SU}H%%K8sPn~!;@wJl8+r3Ni~|WNKE`!R z=<|F3*4*42fu`oqj{85Inim%J8su5@xg8h26nNi<@6U2i3+$78s?@4}*VfUtWWkk6 zeAe{ldtn!>Qb4X=udArd3&K0rj2b-D!=Pmdh8w@li!_F%!*}lAmIHJV5!u@`n11Hu z#F}Fagcv7kuQ4S`TnuB$$LG*7l+ct^QSXK;nPa<~;%{0m9&|Yq?448ky<0xS-kd^R z=`@)^j=-TZt1p0$iI!&iVt%=D96Ou<>au%fn$^mpvHOmuK3obBkAk|UuHVvh2G0bh zVd#_TTdGX6Jv;Kv{=TJ2sA4{=8zx zVGa>A?xEGeV@B7Swbkd+ z`Yz5K(Xo}_Tt4>o8W?%ftQ37A^?FYCHR{9eQ0jBvmGBcLV z7USBIYAT_SguJkPyK>eTf=DHgI?IA7lk=OMias-*WM_{oKsStX;f1tbxPT*rG)H@JdR-qiMbg%YftI!VPiy zZR^}EJtn@&S8k+jFr_tRn+KzvT{naNdgcjyzj@^-Cw4W{vM74GX3aG zA8%4J&>|DQ4h1z-uCB}oY#P?Uy|GYrt+1K%w)kn}x`2wFUsXPfXf!%W(eEb!UUN;={~aG}&ptOzqXF$UaFB(W2)RzJSXYod?!X>MwuK0cJ@kv?_Z)Wq0~* zGOg&X#OHioX*4tz8_S6BMI3fc-aPx9SV>#!LJ6SP0Y&o|8J_vzMoHtuuMdn&y(V1R zK3q=dZ`GNZv1=&=LdVU94vAbHoVU;;EGI@!=NH-SOr^m`dwB(Y2hn07Nsh@#q^8!b zog){pP5B33E|Gl)J?KO2>I$`2g4eMdGHjsV|+;o9-(THn7OA24?NE{GgWdW$|4i-A%#Om9y@vU~Gu zf_#`FM|CtNfv^t=Vv#jFC!namky9zp<6{Wl8^lNw%}gptv8L=)vGr7JU$w5d0xfO@ zO`Hb6y3uS@5GCb|O^vME)$Um$SdSk5l-cGS^vgLtmnCt;I?6gFaT1e^Kycs3X~0)8 z#@Ld>x3EadXZ=fh*Sy_b)t4{;X-ds?7e@fOpdJ~0__})@Tj!i~EmyhR zMIaQ*Gq&r}C!;53!hbq4PU6b(^$S5J$HvCwPj~NadHT7-)7`vvRWj>x(94OQT=S)QiT2GGZDghdV$l(WmRmJFIsV5<7Q&=*@b_>z0*3@5vvn##f4+iAtctFB4n-0 zwal!;jo%)3jY*cxR)?YS9BGm&4jLFzMgE%Zds9|GHgwt$G;dYa(PPb(`E&Rb*J(?S z_{*t4;H1me92Saibz9)2`y4aeaEOjeuoRE9t$Nj#&&W$5r|$}8Eg86;nv zY>xf(Dh_F-t`;Xnc;xxNV!5UqHMfq0Mn~fae3Tz`4iS{D8W|NQbS!2j1 zFH<*9e-3L`+3Q8VSR14DPu+Z%TC5kTag`HZQN$w}xA&Ek)xR!ydk{s_4Go>SZMbzn zL_!NQZ`ynqXsi}XRqLZv8&^~H(aUdJUdVX!Wb3r=2iHsE=MP+Ky3f0aysXerYl3sR z5~I>gd=d9wF?6mJ6Nf#spfYIGs^W}}v(3s@?XPuWV*1IXJ)gFtnL^COB6#`zXTs8I zjVGs@;mP!J+c-_!s)%4fjqBG1$)mOSoCb_f^J1>2>yVNogSmjmRHb3NgQCNlWix+| z$sg8^D~-jq)%N)JT%m5PZNtW+B61^Nnib_);Fa7z$&cGqY6z0urs1<5oo6tjMwHBh zLT5Uxy+ebokmIfyM`}Yfy!<3ZTCpOuLq1}?{DPAe)JsR}5mWMY)a^u$BWv&snUh|Z z@w5RenTyjt7*A*MW61mAPy2v&IL4vgK6m^sl*=XlPajm@ruzpjPB@b8&6E8!ZhOcJ zGVt6uW;rN|yGwpNKIh=k;PasC_;_3KzGd&CgJ!hc@&E-d)yGrgbLMkskg>WWifmaok5-jbv%Y8R!_ZR!a*c(d+@u|ReL8tA^_wR4s z(=t^yBG?}G5mo>c1}UuHA&WLhsuxu1Sd>%h>@wQo;x$#s>+9K^UCE#8x!0)JX4ePV z1sCD*+67yq^suZo1ogxA!0I2<=;p7$hP*h#OSg2PPx(e#C$Lc>?`kEZI6BfLi$SJO zjJt)G59v2|fqDU8FJ>>^UfrMHs7BPnfo3i=cG%(VWP5TAq^)XV21 z>6;rtTl(aT+79zB=gbYc&^^nVu<_A&2Xe&RJh8r#PMxAtj2=F z)%fPs)dVAg;B8O)`^^>5hk(N#67s$PyzgN`w1&2-_?TSfoYwM!!g0AwmnPyNNUxEU zGdjQ_KG?fTY-8g)N^Je5hwqTkvrHD?oUyNz02zSybtl5ozu44-iuMSpv>)lG6f4(H zCxLhD77fEc*}vi;X4N!6E_&~1A$gs;Yve||em_#RSFR3h6Yjd>=9CVaFK(>8{5wA! zKjaDD@8=nN@71b1-k<$AYee6E82M|*{myK$S7Dr2yYe_m6F!X?Kf@arw%S@Lck%O4NiFh)aHP0`uSyE} zzVqrj;R7Q2We;?xXxoRme}!`>0F&jz_Z z+4Z{~oZjNYI|?`TY}u5vk?2_o@$&Ar9*%ca``_lrUe4KB*;HE_UKP0{o(jKXVAguS zm??JaKRl`d!4R~I&*nvK@8{E#-!86{)6FQCt^CTf1*~W8O+HjJB&3v1?$@@eqAvj# zm4QzUQCjSS%UFWQaZ+DoVm5ZeGN8b*u$csVpm6H0!J?!S61UqK9D1U)Ta4)gZU=`a zU77grJahcUgf4TnJ1nQvW4@gP6LXRM6#^`!0#*5iP7e2R2vXlZDxQ&NV}PPF(@o>dC__(T55|`~t+14O#brDBA9x;pGlC zIt@R7J(&;skAWXW9<%BL2Mzt0YUGt3VXHfjly1aZ{T4%F3{r4IP9N?n$87sn305g* z7{!MgE!4V!RL?bXL?rn!f&C2#-is$IQOC-DS+C0ASC!-G1LXpdJ-XDFi0=5hTEAkP zv}<{5TvjT1XZ;GuxUBxJsX~b71ikv8*)L#9xQV1jh`h9Y+0p zbeVCkVQ>QJ4%^~<31eR8ncCGnt2xE^h0nnDQmmJWO-5eB|}CM)u=DRO#rwrrNJlnQ3GDQG*v`~ES$4E(xmH{pAOhk;6c~hK@@-&o1!BJ87^L2x~HgP5i7gB1bZi0mJ$O00#VQur~ zfSb@JH+r2cP*wTMj95A|U)|XFO)E;A=)!tYHa4t`*Tv&{yordhX1yQmk|wAt5%I)d zBNU>~X;9)}m=@!^yXE2bz)a03_R(GxS~8NNg@;U2MYQs82FveyQQ|F1oD#;+u#KBA z{v3$%W5!b|uu(*W3r*+MV&dlKlDiFqC__R}Vrk6=VpQPNtRU>V_{t^UiL*!jZ!czE zq|C;eS?-P3oeq1I-=8E-Ccu4S=bMN-NSK^es(qNGzG-kr5pu3dAP&XG3D%?F$ly(j zMqvh1=gQC0P+_~*_I#=DUshU>3ya;z>AIE2q7Xlk#yW~gzl26iMH!1Y|86GU+i>iS* zEN(?Bh%T-N$i;cDD8GDF`|8Xd?hKb$mrDJLDU4l=mf|JCiZNuP@~W92WyFODR1NPD zn7gh0EiRuXSd+h(mvd4M@1Cr`(<~?2;(QA*I4(8L*1O#G-@OI zDRO9j@%?0Hyb(7=Rmp^duPm~ZB^O*!*FUID4S!fc!V^pU0dY|%+!^y&JGCszlWIu2 zUh3S%QDHY?`YuOPV<_fF$wP_^dJsDNE=kmsd zBmG7+@)hcp)4s7n$s>+9{)o(NZfRrZEFbt^=*&#zyT8;qZmsoD5o9%3)&*@6r^ zm%bGXWtg1My4uO;e(bc1|6!wEj?}Bpzk&3b{9y3HGWM%}Q=yo}*OM1qdkZ`CPkW*O%DR3o%(%G_(6lTi~BdHmdw zjg=M}x*YEEBctv534OShxFd$-QFhG91mK3co1$-;JgM~o967uFLhrH;@SPz6OY&P` zr_AG;t`E5xE5Na@ltOWsJ?hN4)Rhp@VN;T8q1}S4+=8iguB9Lf|4}aewQu~e6tFKN zoYkbO*60LG(^0#$>IkX*6b5X2J&C`c86sp2at_%`-{AG|Q20|F=4knSURw^mWbvvz zR#uoU3d3Gie&Go)iXj?X$DUUzXOS&AI(Y5*y;;J$Iq9&gL7}sOBHrN$kWJ!3phqwv6>k} zay@rFY5dW+v!nd=>zH+E9`7C8>Z=W)}?i- zd;v3|?}A;cuAVvPhv+fq)dJdjD$9IfM+pYV^=bGypcaT7$$t`|JF@zEJzdHWM^gX z>vwzR5s>Bk=~F$pRddetz{x{!?!=O-|ny^XIrdD z2V@1aoNqLZ<`NW#H<+*9@NUDc$#j{E-$+Iek;fj6izKqU{`ox@mtp@Y{%wRL` z`oPTn8N)gHpf`d!JFi=6m&Uukv#8}xTxAj~k4Gs^#v_jt-^JS980*?D>Xp7`MVN8&@n+X(`cc64x)gt&rT ztexB(p_w1(p|Ytx1Sl@{v@t1Kn?YPm-K-rQ)FCd`7RXPjAxl>PC$s^j6c=~&(r4k~ z^wXGK3qV&`N3 zf5=@KCJj4u1E9GpzZb)9nAb_X7=v{ww6&X3a{A>c;~5#g z2n27yt~OOjgO%i(C}J0VHW*W0r^uTxs{5ju(-z)2C0J=o>_}p@9tgf5AX-d-FuZBd z$qLEa`kvp9BKUf<*Y>#2XzTmNeBYN=zRMTS=K<=kG}R=?Zw1{C8;JoFb9EE7c3my8 zZ$&z9e~ND7So+~|*`ynkb$DJN7b;H4zjZf6`fy{+!Zh#lW5#l)h%;uAlVw(OGux|P z!U%MfG}p6;>mrt2Hb6XUKA=CVnn${WvMI!|g)zeQi;Cl1urBS4ZZt1(Y{YUib4+Ne z=&+%GXZio4Gx? zj)sdnJUN4+>7Q*77cx-W+T6{`Ri6jyINVUXaA#VCWXl50t0r7G({3ri} z5Ksf+(6f|uM zlz>VMozfNXTj;-DKb-#%P#M7Sw+t|d>$hzG_bK4_nVK3}hPQ`1dMZ4EZl2IT&jvSH z&xW9nAppi4O8Kb#pdfnW*CEj%h>Y(L8R<;`@cHx~yxTp0p4?+xJld@y>_5cWjyBE; zRojxW!dxwP`*nj&`C+=KzMK7wJ@?8nH1Gxu2f@4xLL~@}ZE&Mtn4iG_LC1j~#vV;p^V6MRt~AjvP#y%j1EWc96Oe>Y zI!&G5bYebw%Co3(=P z^oDtP3WF+hcj>pu2cyA_aW?fh1GoPye-mchbEiF1p~05##4< zqOdrk&rv&`-F5m?kbRNl(3I{-81)mq>n+|<9tpAaLj9Is%_mh}BEiL@G8URg2^Vu*f1zhM!SL3?Q-x zj{sZ;u<9c000INg_~H1(SQD8x(Y?hZDI5pze?`I;0O;Y6Ln-rR>4~DlzZbyKBk{wQ zh|3hX(PIxqxD;%t3cJBFhDGO3se&|+QzdX!aW$ULh@GoGcY9_NqL&{tPV}U zkB`J&H|NC_Mz-wwb`0XhU=32~DqA=Ef?6F^xvuwx%pr()-QtSU52+2+vrBv3=nFYn zkYi`}F`^*yYGnU9(iKP$O(fKEJ?+@m3o`%#*v)h-bH#Co`+)A)wRnu)f^tM<0*4$d zwT4LzM;h|1Gt5NF3GfB81@T!JTS!Ers4Rs!CNd%6POV_jY z*G^(zs9IhMBL+&oq{P7tel6WYfrTma()u;3BpxMxQY5`74#g-y9y9edQID?V^Fqvt z5Gx5c06(TSrvK~x*IBPAdJxTUPGC+DPY6J9UJ>aME#l0SD-alfONSQ~zB+klu0h0z zofyV)o!&H`22};_Ong&FQ*=`VktkZhVg6wOSs_`Gg=+mf?RQesST--t zkH! zuxCm4xZ#)YMJeS*r{ci_prnNj3E$2+81xuUz<>?QVaYL zGY|Pmf!4c7H0C;u;5UlzZBmZ+AnmzW*MF3}r8ZbNS4Z^IXR zySAP0VP-k}#VtK5>qMoN=XHI#jGftUe_5u$?gP zE`cthbtco~<48nIkR(=8@PPCt4Kg;(YZpn}LcEDYE9H+g{Fp+o1A1PX;edkErAKJD zu~jgKqdxVV_Go>_H3K>a@rt*oWK#=MwNbXwRAbFWW%YAv_rKPr=qT18O-+-iCikt_Z_-+guRMNQ)`rSGsX7T>uBbS&*m@FWlJdv% z9~?grtRqpAK<4ZjQ6pm8bW;P9<}`J*--7I#w+=kpWOHG@M&wJ*mSLq#OR^or zAF$tKbrJDS6qFB;%%y0jZl|ev)BdJmpc#icT(m%Kp1uVKGa5%KsZea9Ed7-o!zd0= z9)>0xOGd81{M9dpGKSO?A9;?F&`Alx{8-gK1{HeO6saMEA^aiQEg)$Kx{3@mK)z9e zU65R=UN}|Ekzb!*U*=kJT7Xg-Q>aj=P$nR)EvYT7Ei)(U8Fk3GjMjwNgy5#`=IR#X z#^ko|rtj8#M(;x@L?wjK3e!r^O47>M%G!$5iq?wViti8bhx3Q^C-P^#m%CBB(Yukp zQ8?57fv_vH+yA5Nhw*rOiE+_t{-e|j4dzsFNWm{vsds7`G!=L=uWh7+B+v72)Vs@1 zra5iUKPBqPzc1ldTPzov&YEtXmYFV_)}E%>quaySBZr539t_<=#e&4L#d5`RI)gex zI)giNIuj^jDdH(oDIyCJ3sMT=3!(}_3)0r{y6L;&x?Lk)L|~xqqw1sH#u7*)Q=w1+ zi|C6`i;#<0ix`SnikONpz=&XGFfy1MOaW#9vw*R|2w?IJzy`qvxKFo_C$=(nl^~Eq zOwo&$5gQ>HCK)anHkme=A{jRsI~ia7r8cEn($Yh-d{XoP>Hbi`w%dZcZ{bmU}2CL;lqXI5?QVB!F78xrjB zOY;u7<~yZ4Wjy8CCdGq8NeWL2kC953%9IM1O6!g7&F#(U&7%pSNumj*$ty`Od0P^- z#XijS*7Pj`^AvN9&RgBK4|esiS|m)VsiB&Qn$emWnvpXpGx0MyGkIJgTuEHX=Ww=2 zwk)xWj0+m6qjpS$Bt=xRCYPwQf8hwE+Yh-()=G}qnMMb^{R zS=Qy503jm7D_twS!X3gx!h^#75ApXLx3ssy2=Sr8p)nF+5)l${-9g>a-5K4HRIyaK zR5?_6MPWtpMd3v;;2?0y2B;q;o+chhvY7pnE;u!7g^>0oEEd?yeckD+av&VSx`Q~^7T@Ia18{ZjZ8&De58SEKE7;qa@8w40684MbH zGmtV!HP~t7|5EL|yzSF-p1sd9M^~-n&=I#|@qK$8Z$oQs_B;31=bej9^zZRo${VB5 z`(^cp!`9(;Td#kDZ>DdqZ}_dir7;;QMrvfTOqfi(OlW_4e|&!qT_RlsUFr_63P^=i z1x1Btj3xJ0a7wVoTgyA(_~(h4H=$Rfw{;70%R-Aq3wn!D3q{LG3t`KPmaP`FmV=f# zFBva7uN|*6uQ~6)Q;KVPzn^~Me(ZjGe$sxQ{SN(t{Xl*)*T1giulKGOu6(W|uE|cm z{%YO0?YfL1A5>Yl+`<2zwm!GUv)!<9xrwozu%WU8Tm$Ynt{tr%?X-74ra#g=B0MOlK(lsF~2B3!6eM2&ZK&cuv4#7tCOsgqBAn2K13m8KSV&>ilX9W!%HRESF+jz zY6DUO#`N;^n)E8~Glx|@PceNBPus_T;L zBESfij;noZzwA$r71lqb15k(ktn}WmPy)3>Ph-Y zMo5uGwY>BO{!%j6#sLm7GXfY{Dp+kO_W(XRoCch_XX#e8ySft!g>#Ze(aKS6c!t=^ zg`J$paiO&G>iIt=nXRiois~`5glfrF7IIBBSxbDB`N+56yTG%MOCT1jsi>-`rKm8X zI3hQq!mP)v^fOE+T&GAUZJwppxmLFpQtN2u``ybK&soe_*O|rH)!EWH@Py#R?L_>9 z@r3ro;e_Hu<~y)Q5cL2S305IiG?ow6Ec9!w6AKCJ8&(<3sJy|O-86#K^tU=G zSCq9>QAy@0o5?9}rxQ6--BQkyW8Vrg>H>6WzA<}i{`$OGyva1kHORtHjMIwKkP?n%tyZa~KZ$7VW9?v#WX)v_{vk0bJo#!;rUa$9 zvRJ>^w^*jQtk_r~@XZ5D=v&j2^|Yk76DgoH=Om<*sN}&nd@Qb7rH0#0BSIOL>UF6AnQQD+vTM0APVbV0F7^Jsm(q(953TGsJHKbjkYNNHM z*;L6{bf#r3VQrjIHB!P{P*rVL@w1BLU3PwUr9p+CSfl>+=U*wm3V${Isz!blE+M5T zrP(XlD@P+rV?-lUBDvKfpEhKl%8)9R>Yi$-S;!b!@uecOqP(KsCf}xT4>ZF*BRxYl zBTz9_;i%bO{){V{C?PW?voy0Lvsv$*Ui1ReC!0?(HupA{GZr&i)1Rj&_7wNT_mKDC z_b&F@_CR~OdzRD0(*iS*GiB5GGwl^e)`?tSxTv`}xtO_@xVkvixR5z*I9aWi#tG)U z%1>TxG|(z1DNt#q2ZU9}y({TWX>MoK|aL3dbZSdK}R z>DB%ht#GZFS+4iKdi8otdW#DL_IUP~_7HopJ>7xm0ri34f&78iLBPT70`TM9$H4tH zw|Tc7w`sQ#Hv@NZcQrR;Hyif_cZWU2-Im$w!um3W*{#{s*_qi-vqDwGC0XUZ(w8j` z4Tq%5gv*fSJRf=?S|L**8X>R{T`QoKy4A1Mrj@((S!<-vsgLoQ$NAum;LYR>|IO!{ zl^f8_x0{lixSQ4+@J;;<(@i_76Z}v3@9=x@c<^NKnD9T~%g~5Wo6w|DEl}U0aidK{ zFh>+dR76-rI7FC6SVu@joJZtDG@zBCrJ)8A6~yYqO2%o#jl?d+cEwJ{Dih{#?6DJp zGC_LW3+#Oa^+Z##GjS}jLa|h_9I;~rc|3<8GOhrJ7Mr_8IL9t)O6x*%CwAD$DQClw-gLxi`NzrB@&(@2!;rw}P{Rl7g**Bh-`z!^p!iZ+TegSXOlc z8pQQ#btbB&-!s2&sWPi_sTz9Es)wedq{E`;-yp5Wp+8kM^ZtDmhJL@!j-IwogYN3b z+>Zga_ucUJ-8NJXMb=>O1{mAXb*XfrwA*69V!`5@#Uh^Nur-r?rR}~6u~n9xzLmb? zBE)yjW7gyAMGa|h{Hk*2==|sm>k?~s^?dbU_3B~4a>z1|Z?RCl&{w1Gm4%gl;T7Sr zw!VkDd;8n63#JPJbZP`61aAZ?1pG+0NViC{NZQDN$VhY)^i_-k^jb13kz)6Oi^Ru* zVu=@#qhx#hoIX~E<7E=Q@ftD|GPp9XGQfU2Me@FElDN);ildbzgm(eQSv^5IPvTUGT$n zfBne0r)XfKMJBIbB0(yFJXc3WS%pyLxeET6(-`xZ%NX|<#Telj?pV8#n~}It1@smv zY!qem!f2<(r-jU`_2MRgz(0NC!Nu9ynYUfD<#SMDt zleMXiPaXGwK0qztAYVd^LX1#sODrwK0YM1kh6EL%9!>#%A3*>?25tvV`-L$Q8Cn2t z3uXxVI%*OM2x7-4Fw#AR}D` ze+AQ?(X{yVn3XtZ;79q7?rx%PHg0lm`e#mOc4t1Q>=9JbtbNpdOtci+=8E8vxO6&D zCFr(xq!z38rnb6vqSm|iyw<-KyLPx%!1>I%(0Rc*-g(`b&zZ}4*%{;eN{@Yi@<#Xu z*9Pf#iH-X2mEQ$6j(UMTjx>o_aM;t>Ke5EI*s&S0tErb1bY$lhscDgEE9e$zBB(8B zwP-zPR}>6nm*gpFkZ7!_`KZHbAvBtF?leNOKFMcH8E=^1RHgn*CrQfAGDurWzN7bM zxE{ldpfZy=i~m9vCHy9^cwP1}shp~k0T50j1ub~!Wul=dp)cWsux)t{`7ko3GHE`2 z(TQ-ed}97Y<<99Yd~QhuQeso0`bxpV%tH5``#qgK<$?dg;j7Z3H!^4mJ|uA>KEJG{ zCOV785~|CJrW&VGr`}IJpPHItniQBiDr=W}%n{2I8y^4a^!4>1Z;~?w7_ed8x1(rI zt6geVs%ar=;bS3XVYz)j(vZ`rptcTW0Pg zr@hgB>&bUyw35}5;}_u<+-XdAfcn3pd=;UWGQ)*lsyU8Zax2y!ud z=&^WKBKlFZrsF-J4f=%r$)tEqv(tj2QT3y0&G>t^wqp%b+jZG>zx8doA-M&)*5pGv zY~@^G?~~1(14qDQ0u%KV<;fGLOZUL%5W zgGC)Epn<39QCL-O_vI(#n+0EqLDkZSrOG>5Z(VQ8Yp?5DWYx}B1v;kcn0n4tg#~8$ z-@SVqfnAZ8F}h|JdgP{~j$<7dWj#{G)Kilquy&F7k9m2H~Xl~NVw6+)HC zl?s(L8VTC%HP5)RxrVqrxYoJ6x#GAaxPZ3Ywm)oXY~pPFZ4+j-W-exxX96m^r%&dr zXB($`XL72WHF&fxs~s+x7j?tyxh*Be$V7Y}ey)8>v*g+Tu}`z7wCBG+wQsw3yH~b9 zH?}^Clrtr?6Ttez;78-AE5_J+P5_Y?(KHbWu^CY`(E?E)(F*s^TuXr(uayhGtLwW{ zr5(q-K>vr!7o{k5_{=~!VXD^`9hSFKS?)U)PG zn~Nzc8dKA&XCJMLGUgR+x$Q0-gcl5!m*xbg?mmT&Q!bc|yL_Q}Us6$018Lf_Wh&78 zIFmoq%=P|L)2ABu((`?(?@|j=-7&)W2}V_o^MNNndL8XIxof#MxZ6L+es20qY+r9L z!a8C2rp4TY;aKgW#5K*q(8t@lIj7dJbh%Eb_}2YzS+R^Cq63XSTGnU6kaFtL*a}ULK7dI_{G11uj4C@GrWj?#It< zRO1^TwFrCP52JOVEud`>KjU$6$ZiMuflvAlVfup>f=)%dM89>+1k*hy<{9Lnam-vr zZOrhjzuw;cTB3i2-jl&X3R68`lpO6&r9J7FXjN{kJ`K4*DDBWMrwZsJ5vn zsIIBrQ(a06FNrNs9s!7X`b<{S}vR-n)7RlwRk z_%iD#VI}Yp=3MkRxbt~C!Z!?7!7rz4%e;c4BGm$iLb-e^Y{G&M+Z0{{x2ih@bKPf= z=NOS>tinryJ*Tp_NjpVF38TV9ejksm97BA?ZM$5*@AiC)>Wm7HYL&~CJCLgxJR7_j zEEwz^yzg&Ga3>=V$UeDR+E`3jRTfkCQ}n$`iGQX8r$Vxr*_MFRhsig1@Et8 zPcpu9^jk)aG3K6&_@3D9@ZGwe?;MUz<>vVdpZPiCjr{t$_w{CUdxZOI{MYIc(;X{U z3)ceIa92{-WLIP3@zz9t(@V+=sUI;b_+!R*K7$u(2PKQc(`lB*=&gRI$UeP~Lu;vz zmEamZVI_-sY=Yv|Hjho?OStRnC^i)9rixGyRDM3&-x2TtDBfgX9L0 z6;P{sz|OD3f#wCz(4!URPK6K!X$3I_y0;5yv}xRL%b^eY@xyY%Gs8S-_$v}eFII@# zF#Pf!A0}_3E{VzCk>d#S2FO0Bp03YE_TqX;IrUo6c$JuxfFU+pw~QMVzbb}phW5VA ztk0BiO=s?Ae$%^JDF1Z$smykf{F|_i-`QRIb?wQ*7SyeOc6KW=1-pUIq0gGrJd2LbtBmbve)?DF{`hwezl6?wCoZdx z#@ZflI!`2SBgYb!$ngUOZ)#7H*UKhV=JKpXhyzyc8=K_jQ&v7MD$Z9ew0yd@Kxw3@ zgS6o`YPQTYoSWHtW;rgJ2v{{cHM{y+_}=381K%49x5wYx+;Uy-FFKa4R$QM`X1lk`0gJq8cws? z&$;keH27Wi9d?nmK*OAE_>J04bL~9s7|>g%(q;E$%;oKc@xjsK?NN^y#e=s&`il;V z7cenMFclTgUZTSUeSZjqVZ#?${htbHo@y(eYC?E9dH(@I2R&7T{Iya<#of)x-A(Ry z-3GMCLq1_J(Z0$R~wZv9k?C1(yT?69`5hPcqXLTf$%a#9l9AU)^?j}S@`6pv1a|_caiT}de)Y{tqH}|J=KnGW#wfS$6pbY<@=7(0>xCwIz0{?ml zo$v4G!Hy0VjxP454q%9|_}|I?+WRCO@JXD%i1+_OI4;s?Qh~h>G&m} zuaAJ%7HT*;+6imBSVLbhAqf1h{l69Y_fCi;^!*dU>}>38EFcgIh*tx|#m~;h&&kWg z2I6OfuH(N@{x2)}L)lP&g8Cou?+*B+@&7Zn|19MHZ+85TMgC)vf0G9O<68eF#s14| z|6`GVS#|%Wf&a0{ze%zGGTZ-Hn-u#m zv;9BEBL9e=LGg@2l%7w~vHvMz_!pq23KTQv4Gmx+16A!z!4P|B@DC_%=Loj_E9&fT z5yrov&i(+8N!dZ{p|DCQl+4t@+|S$s1FAXNL6l5?1K<8!22d3WB>i*pzc-$=0aVccy}W;(^f#kFCslU0chyH`1Ly<( zCY|A50;B&mBKw5hb4C7dfIVd3-w}Jz$m-uAd(c^d3fAVX`hRHl=}^Ma{Rueumnqo) zE2vTe8h5pGw1lo4I@ccvL1dtc6U0Fb{1oif|7|4zP)An};NWg&_s2grDDQt(-QO*s zXzE}|4{>18RA+$7Y3gDt2MtU@-QuYUG5-@D2%X_C!Dd(e-?3;lL+HBx2m}K_zj3#J zmHRJn$v;DSg3?mb(rj#8JZ#V^CpI>2KIonkx`)CtdDx-G_*)?IPdZMCKWMn2w7==0 zbez!f+}wYDpyRnYpdV;Il!T1~%9mXVx@Uv(`aLC79Z#V_dP!(Fkev+#0`Y>l*x7!^ z1?kw>=otPYu9pkM0%~S<4rC~&KQ{n37Z(Q?zyk1R9}gD?)ZkAWz~TQY+=_7iIzwS=we$U&?!S+P!s`D6h(o=UGn;zf@Ux^bR~{QiN51tfzOz8%?u_+(c;KQ zhSD}T@LJ)eXGF>TK0gtN`{OwrawtjqNr zu=Ty=!&Y~Iu{Jf(7qWchg&`<|%RL{t1SLLk>*XB}8ke?Pd;Gj%M2wgYKkM)?7506b z)Cc-o(|s#&`Rh0lg~C4S#JPxi2#hpVu65C6obhw`Ur2K@=FM_GeSBSizlWDuDCw-V kPGfa=d?`gW`dlS1xI->&7F}1o7(8T1_&&7T@9RN#cXxA{fdBvi literal 0 HcmV?d00001 diff --git a/src/paperless_remote/tests/test_checks.py b/src/paperless_remote/tests/test_checks.py new file mode 100644 index 000000000..3810f1dcd --- /dev/null +++ b/src/paperless_remote/tests/test_checks.py @@ -0,0 +1,40 @@ +from django.test import TestCase +from django.test import override_settings + +from paperless_remote import check_remote_parser_configured + + +class TestChecks(TestCase): + @override_settings(REMOTE_PARSER_ENGINE=None) + def test_no_engine(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 0) + + @override_settings(REMOTE_PARSER_ENGINE="something") + @override_settings(REMOTE_PARSER_API_KEY=None) + def test_no_api_key(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 1) + self.assertTrue( + msgs[0].msg.startswith( + "No remote engine API key is configured.", + ), + ) + + @override_settings(REMOTE_PARSER_ENGINE="azureaivision") + @override_settings(REMOTE_PARSER_API_KEY="somekey") + @override_settings(REMOTE_PARSER_ENDPOINT=None) + def test_azure_no_endpoint(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 1) + self.assertTrue( + msgs[0].msg.startswith( + "Azure remote parser requires endpoint to be configured.", + ), + ) + + @override_settings(REMOTE_PARSER_ENGINE="something") + @override_settings(REMOTE_PARSER_API_KEY="somekey") + def test_valid_configuration(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 0) diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py new file mode 100644 index 000000000..3706b20e3 --- /dev/null +++ b/src/paperless_remote/tests/test_parser.py @@ -0,0 +1,48 @@ +import uuid +from pathlib import Path +from unittest import mock + +from django.test import TestCase +from django.test import override_settings + +from documents.tests.utils import DirectoriesMixin +from documents.tests.utils import FileSystemAssertsMixin +from paperless_remote.parsers import RemoteDocumentParser + + +class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): + SAMPLE_FILES = Path(__file__).resolve().parent / "samples" + + def assertContainsStrings(self, content, strings): + # Asserts that all strings appear in content, in the given order. + indices = [] + for s in strings: + if s in content: + indices.append(content.index(s)) + else: + self.fail(f"'{s}' is not in '{content}'") + self.assertListEqual(indices, sorted(indices)) + + @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") + def test_get_text_with_azure(self, mock_begin_analyze_document): + result = mock.Mock() + result.content = "This is a test document." + mock_begin_analyze_document.return_value.result.return_value = result + + with override_settings( + REMOTE_PARSER_ENGINE="azureaivision", + REMOTE_PARSER_API_KEY="somekey", + REMOTE_PARSER_ENDPOINT="https://endpoint.cognitiveservices.azure.com/", + ): + parser = RemoteDocumentParser(uuid.uuid4()) + parser.parse( + self.SAMPLE_FILES / "simple-digital.pdf", + "application/pdf", + ) + + mock_begin_analyze_document.assert_called_once() + + self.assertContainsStrings( + parser.text.strip(), + ["This is a test document."], + )