diff --git a/Pipfile b/Pipfile index 32c761391..20b67c1df 100644 --- a/Pipfile +++ b/Pipfile @@ -22,6 +22,7 @@ djangorestframework = "~=3.14" djangorestframework-guardian = "*" drf-writable-nested = "*" bleach = "*" +boto3 = "*" celery = {extras = ["redis"], version = "*"} channels = "~=4.0" channels-redis = "*" @@ -37,7 +38,6 @@ langdetect = "*" mysqlclient = "*" nltk = "*" ocrmypdf = "~=15.4" -openai = "*" pathvalidate = "*" pdf2image = "*" psycopg2 = "*" diff --git a/Pipfile.lock b/Pipfile.lock index f023dceda..96ed87d98 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "56dcb96a9bc99b9902bfd3891d3b04f83715cfb9ae54f9d193442c90613e0ef9" + "sha256": "afea58891f3b1e0860daa8bc56b33b56fbe7c95c6b30d3fdc8cf1a25560e2d1a" }, "pipfile-spec": 6, "requires": {}, @@ -87,6 +87,23 @@ "markers": "python_version >= '3.8'", "version": "==6.1.0" }, + "boto3": { + "hashes": [ + "sha256:66303b5f26d92afb72656ff490b22ea72dfff8bf1a29e4a0c5d5f11ec56245dd", + "sha256:898ad2123b18cae8efd85adc56ac2d1925be54592aebc237020d4f16e9a9e7a9" + ], + "index": "pypi", + "markers": "python_version >= '3.8'", + "version": "==1.34.52" + }, + "botocore": { + "hashes": [ + "sha256:05567d8aba344826060481ea309555432c96f0febe22bee7cf5a3b6d3a03cec8", + "sha256:187da93aec3f2e87d8a31eced16fa2cb9c71fe2d69b0a797f9f7a9220f5bf7ae" + ], + "markers": "python_version >= '3.8'", + "version": "==1.34.52" + }, "brotli": { "hashes": [ "sha256:03d20af184290887bdea3f0f78c4f737d126c74dc2f3ccadf07e54ceca3bf208", @@ -1106,6 +1123,14 @@ ], "version": "==0.6.1" }, + "jmespath": { + "hashes": [ + "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", + "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe" + ], + "markers": "python_version >= '3.7'", + "version": "==1.0.1" + }, "joblib": { "hashes": [ "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1", @@ -1132,101 +1157,87 @@ }, "lxml": { "hashes": [ - "sha256:00e91573183ad273e242db5585b52670eddf92bacad095ce25c1e682da14ed91", - "sha256:01bf1df1db327e748dcb152d17389cf6d0a8c5d533ef9bab781e9d5037619229", - "sha256:056a17eaaf3da87a05523472ae84246f87ac2f29a53306466c22e60282e54ff8", - "sha256:0a08c89b23117049ba171bf51d2f9c5f3abf507d65d016d6e0fa2f37e18c0fc5", - "sha256:1343df4e2e6e51182aad12162b23b0a4b3fd77f17527a78c53f0f23573663545", - "sha256:1449f9451cd53e0fd0a7ec2ff5ede4686add13ac7a7bfa6988ff6d75cff3ebe2", - "sha256:16b9ec51cc2feab009e800f2c6327338d6ee4e752c76e95a35c4465e80390ccd", - "sha256:1f10f250430a4caf84115b1e0f23f3615566ca2369d1962f82bef40dd99cd81a", - "sha256:231142459d32779b209aa4b4d460b175cadd604fed856f25c1571a9d78114771", - "sha256:232fd30903d3123be4c435fb5159938c6225ee8607b635a4d3fca847003134ba", - "sha256:23d891e5bdc12e2e506e7d225d6aa929e0a0368c9916c1fddefab88166e98b20", - "sha256:266f655d1baff9c47b52f529b5f6bec33f66042f65f7c56adde3fcf2ed62ae8b", - "sha256:273473d34462ae6e97c0f4e517bd1bf9588aa67a1d47d93f760a1282640e24ac", - "sha256:2bd9ac6e44f2db368ef8986f3989a4cad3de4cd55dbdda536e253000c801bcc7", - "sha256:33714fcf5af4ff7e70a49731a7cc8fd9ce910b9ac194f66eaa18c3cc0a4c02be", - "sha256:359a8b09d712df27849e0bcb62c6a3404e780b274b0b7e4c39a88826d1926c28", - "sha256:365005e8b0718ea6d64b374423e870648ab47c3a905356ab6e5a5ff03962b9a9", - "sha256:389d2b2e543b27962990ab529ac6720c3dded588cc6d0f6557eec153305a3622", - "sha256:3b505f2bbff50d261176e67be24e8909e54b5d9d08b12d4946344066d66b3e43", - "sha256:3d74d4a3c4b8f7a1f676cedf8e84bcc57705a6d7925e6daef7a1e54ae543a197", - "sha256:3f3f00a9061605725df1816f5713d10cd94636347ed651abdbc75828df302b20", - "sha256:43498ea734ccdfb92e1886dfedaebeb81178a241d39a79d5351ba2b671bff2b2", - "sha256:4855161013dfb2b762e02b3f4d4a21cc7c6aec13c69e3bffbf5022b3e708dd97", - "sha256:4d973729ce04784906a19108054e1fd476bc85279a403ea1a72fdb051c76fa48", - "sha256:4ece9cca4cd1c8ba889bfa67eae7f21d0d1a2e715b4d5045395113361e8c533d", - "sha256:506becdf2ecaebaf7f7995f776394fcc8bd8a78022772de66677c84fb02dd33d", - "sha256:520486f27f1d4ce9654154b4494cf9307b495527f3a2908ad4cb48e4f7ed7ef7", - "sha256:5557461f83bb7cc718bc9ee1f7156d50e31747e5b38d79cf40f79ab1447afd2d", - "sha256:562778586949be7e0d7435fcb24aca4810913771f845d99145a6cee64d5b67ca", - "sha256:59bb5979f9941c61e907ee571732219fa4774d5a18f3fa5ff2df963f5dfaa6bc", - "sha256:606d445feeb0856c2b424405236a01c71af7c97e5fe42fbc778634faef2b47e4", - "sha256:6197c3f3c0b960ad033b9b7d611db11285bb461fc6b802c1dd50d04ad715c225", - "sha256:647459b23594f370c1c01768edaa0ba0959afc39caeeb793b43158bb9bb6a663", - "sha256:647bfe88b1997d7ae8d45dabc7c868d8cb0c8412a6e730a7651050b8c7289cf2", - "sha256:6bee9c2e501d835f91460b2c904bc359f8433e96799f5c2ff20feebd9bb1e590", - "sha256:6dbdacf5752fbd78ccdb434698230c4f0f95df7dd956d5f205b5ed6911a1367c", - "sha256:701847a7aaefef121c5c0d855b2affa5f9bd45196ef00266724a80e439220e46", - "sha256:786d6b57026e7e04d184313c1359ac3d68002c33e4b1042ca58c362f1d09ff58", - "sha256:7b378847a09d6bd46047f5f3599cdc64fcb4cc5a5a2dd0a2af610361fbe77b16", - "sha256:7d1d6c9e74c70ddf524e3c09d9dc0522aba9370708c2cb58680ea40174800013", - "sha256:857d6565f9aa3464764c2cb6a2e3c2e75e1970e877c188f4aeae45954a314e0c", - "sha256:8671622256a0859f5089cbe0ce4693c2af407bc053dcc99aadff7f5310b4aa02", - "sha256:88f7c383071981c74ec1998ba9b437659e4fd02a3c4a4d3efc16774eb108d0ec", - "sha256:8aecb5a7f6f7f8fe9cac0bcadd39efaca8bbf8d1bf242e9f175cbe4c925116c3", - "sha256:91bbf398ac8bb7d65a5a52127407c05f75a18d7015a270fdd94bbcb04e65d573", - "sha256:936e8880cc00f839aa4173f94466a8406a96ddce814651075f95837316369899", - "sha256:953dd5481bd6252bd480d6ec431f61d7d87fdcbbb71b0d2bdcfc6ae00bb6fb10", - "sha256:95ae6c5a196e2f239150aa4a479967351df7f44800c93e5a975ec726fef005e2", - "sha256:9a2b5915c333e4364367140443b59f09feae42184459b913f0f41b9fed55794a", - "sha256:9ae6c3363261021144121427b1552b29e7b59de9d6a75bf51e03bc072efb3c37", - "sha256:9b556596c49fa1232b0fff4b0e69b9d4083a502e60e404b44341e2f8fb7187f5", - "sha256:9c131447768ed7bc05a02553d939e7f0e807e533441901dd504e217b76307745", - "sha256:9d9d5726474cbbef279fd709008f91a49c4f758bec9c062dfbba88eab00e3ff9", - "sha256:a1bdcbebd4e13446a14de4dd1825f1e778e099f17f79718b4aeaf2403624b0f7", - "sha256:a602ed9bd2c7d85bd58592c28e101bd9ff9c718fbde06545a70945ffd5d11868", - "sha256:a8edae5253efa75c2fc79a90068fe540b197d1c7ab5803b800fccfe240eed33c", - "sha256:a905affe76f1802edcac554e3ccf68188bea16546071d7583fb1b693f9cf756b", - "sha256:a9e7c6d89c77bb2770c9491d988f26a4b161d05c8ca58f63fb1f1b6b9a74be45", - "sha256:aa9b5abd07f71b081a33115d9758ef6077924082055005808f68feccb27616bd", - "sha256:aaa5c173a26960fe67daa69aa93d6d6a1cd714a6eb13802d4e4bd1d24a530644", - "sha256:ac7674d1638df129d9cb4503d20ffc3922bd463c865ef3cb412f2c926108e9a4", - "sha256:b1541e50b78e15fa06a2670157a1962ef06591d4c998b998047fff5e3236880e", - "sha256:b1980dbcaad634fe78e710c8587383e6e3f61dbe146bcbfd13a9c8ab2d7b1192", - "sha256:bafa65e3acae612a7799ada439bd202403414ebe23f52e5b17f6ffc2eb98c2be", - "sha256:bb5bd6212eb0edfd1e8f254585290ea1dadc3687dd8fd5e2fd9a87c31915cdab", - "sha256:bbdd69e20fe2943b51e2841fc1e6a3c1de460d630f65bde12452d8c97209464d", - "sha256:bc354b1393dce46026ab13075f77b30e40b61b1a53e852e99d3cc5dd1af4bc85", - "sha256:bcee502c649fa6351b44bb014b98c09cb00982a475a1912a9881ca28ab4f9cd9", - "sha256:bdd9abccd0927673cffe601d2c6cdad1c9321bf3437a2f507d6b037ef91ea307", - "sha256:c42ae7e010d7d6bc51875d768110c10e8a59494855c3d4c348b068f5fb81fdcd", - "sha256:c71b5b860c5215fdbaa56f715bc218e45a98477f816b46cfde4a84d25b13274e", - "sha256:c7721a3ef41591341388bb2265395ce522aba52f969d33dacd822da8f018aff8", - "sha256:ca8e44b5ba3edb682ea4e6185b49661fc22b230cf811b9c13963c9f982d1d964", - "sha256:cb53669442895763e61df5c995f0e8361b61662f26c1b04ee82899c2789c8f69", - "sha256:cc02c06e9e320869d7d1bd323df6dd4281e78ac2e7f8526835d3d48c69060683", - "sha256:d3caa09e613ece43ac292fbed513a4bce170681a447d25ffcbc1b647d45a39c5", - "sha256:d82411dbf4d3127b6cde7da0f9373e37ad3a43e89ef374965465928f01c2b979", - "sha256:dbcb2dc07308453db428a95a4d03259bd8caea97d7f0776842299f2d00c72fc8", - "sha256:dd4fda67f5faaef4f9ee5383435048ee3e11ad996901225ad7615bc92245bc8e", - "sha256:ddd92e18b783aeb86ad2132d84a4b795fc5ec612e3545c1b687e7747e66e2b53", - "sha256:de362ac8bc962408ad8fae28f3967ce1a262b5d63ab8cefb42662566737f1dc7", - "sha256:e214025e23db238805a600f1f37bf9f9a15413c7bf5f9d6ae194f84980c78722", - "sha256:e8f9f93a23634cfafbad6e46ad7d09e0f4a25a2400e4a64b1b7b7c0fbaa06d9d", - "sha256:e96a1788f24d03e8d61679f9881a883ecdf9c445a38f9ae3f3f193ab6c591c66", - "sha256:ec53a09aee61d45e7dbe7e91252ff0491b6b5fee3d85b2d45b173d8ab453efc1", - "sha256:f10250bb190fb0742e3e1958dd5c100524c2cc5096c67c8da51233f7448dc137", - "sha256:f1faee2a831fe249e1bae9cbc68d3cd8a30f7e37851deee4d7962b17c410dd56", - "sha256:f610d980e3fccf4394ab3806de6065682982f3d27c12d4ce3ee46a8183d64a6a", - "sha256:f6c35b2f87c004270fa2e703b872fcc984d714d430b305145c39d53074e1ffe0", - "sha256:f836f39678cb47c9541f04d8ed4545719dc31ad850bf1832d6b4171e30d65d23", - "sha256:f99768232f036b4776ce419d3244a04fe83784bce871b16d2c2e984c7fcea847", - "sha256:fd814847901df6e8de13ce69b84c31fc9b3fb591224d6762d0b256d510cbf382", - "sha256:fdb325b7fba1e2c40b9b1db407f85642e32404131c08480dd652110fc908561b" + "sha256:13521a321a25c641b9ea127ef478b580b5ec82aa2e9fc076c86169d161798b01", + "sha256:14deca1460b4b0f6b01f1ddc9557704e8b365f55c63070463f6c18619ebf964f", + "sha256:16018f7099245157564d7148165132c70adb272fb5a17c048ba70d9cc542a1a1", + "sha256:16dd953fb719f0ffc5bc067428fc9e88f599e15723a85618c45847c96f11f431", + "sha256:19a1bc898ae9f06bccb7c3e1dfd73897ecbbd2c96afe9095a6026016e5ca97b8", + "sha256:1ad17c20e3666c035db502c78b86e58ff6b5991906e55bdbef94977700c72623", + "sha256:22b7ee4c35f374e2c20337a95502057964d7e35b996b1c667b5c65c567d2252a", + "sha256:24ef5a4631c0b6cceaf2dbca21687e29725b7c4e171f33a8f8ce23c12558ded1", + "sha256:25663d6e99659544ee8fe1b89b1a8c0aaa5e34b103fab124b17fa958c4a324a6", + "sha256:262bc5f512a66b527d026518507e78c2f9c2bd9eb5c8aeeb9f0eb43fcb69dc67", + "sha256:280f3edf15c2a967d923bcfb1f8f15337ad36f93525828b40a0f9d6c2ad24890", + "sha256:2ad3a8ce9e8a767131061a22cd28fdffa3cd2dc193f399ff7b81777f3520e372", + "sha256:2befa20a13f1a75c751f47e00929fb3433d67eb9923c2c0b364de449121f447c", + "sha256:2f37c6d7106a9d6f0708d4e164b707037b7380fcd0b04c5bd9cae1fb46a856fb", + "sha256:304128394c9c22b6569eba2a6d98392b56fbdfbad58f83ea702530be80d0f9df", + "sha256:342e95bddec3a698ac24378d61996b3ee5ba9acfeb253986002ac53c9a5f6f84", + "sha256:3aeca824b38ca78d9ee2ab82bd9883083d0492d9d17df065ba3b94e88e4d7ee6", + "sha256:3d184e0d5c918cff04cdde9dbdf9600e960161d773666958c9d7b565ccc60c45", + "sha256:3e3898ae2b58eeafedfe99e542a17859017d72d7f6a63de0f04f99c2cb125936", + "sha256:3eea6ed6e6c918e468e693c41ef07f3c3acc310b70ddd9cc72d9ef84bc9564ca", + "sha256:3f14a4fb1c1c402a22e6a341a24c1341b4a3def81b41cd354386dcb795f83897", + "sha256:436a943c2900bb98123b06437cdd30580a61340fbdb7b28aaf345a459c19046a", + "sha256:4946e7f59b7b6a9e27bef34422f645e9a368cb2be11bf1ef3cafc39a1f6ba68d", + "sha256:49a9b4af45e8b925e1cd6f3b15bbba2c81e7dba6dce170c677c9cda547411e14", + "sha256:4f8b0c78e7aac24979ef09b7f50da871c2de2def043d468c4b41f512d831e912", + "sha256:52427a7eadc98f9e62cb1368a5079ae826f94f05755d2d567d93ee1bc3ceb354", + "sha256:5e53d7e6a98b64fe54775d23a7c669763451340c3d44ad5e3a3b48a1efbdc96f", + "sha256:5fcfbebdb0c5d8d18b84118842f31965d59ee3e66996ac842e21f957eb76138c", + "sha256:601f4a75797d7a770daed8b42b97cd1bb1ba18bd51a9382077a6a247a12aa38d", + "sha256:61c5a7edbd7c695e54fca029ceb351fc45cd8860119a0f83e48be44e1c464862", + "sha256:6a2a2c724d97c1eb8cf966b16ca2915566a4904b9aad2ed9a09c748ffe14f969", + "sha256:6d48fc57e7c1e3df57be5ae8614bab6d4e7b60f65c5457915c26892c41afc59e", + "sha256:6f11b77ec0979f7e4dc5ae081325a2946f1fe424148d3945f943ceaede98adb8", + "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e", + "sha256:725e171e0b99a66ec8605ac77fa12239dbe061482ac854d25720e2294652eeaa", + "sha256:7cfced4a069003d8913408e10ca8ed092c49a7f6cefee9bb74b6b3e860683b45", + "sha256:7ec465e6549ed97e9f1e5ed51c657c9ede767bc1c11552f7f4d022c4df4a977a", + "sha256:82bddf0e72cb2af3cbba7cec1d2fd11fda0de6be8f4492223d4a268713ef2147", + "sha256:82cd34f1081ae4ea2ede3d52f71b7be313756e99b4b5f829f89b12da552d3aa3", + "sha256:843b9c835580d52828d8f69ea4302537337a21e6b4f1ec711a52241ba4a824f3", + "sha256:877efb968c3d7eb2dad540b6cabf2f1d3c0fbf4b2d309a3c141f79c7e0061324", + "sha256:8b9f19df998761babaa7f09e6bc169294eefafd6149aaa272081cbddc7ba4ca3", + "sha256:8cf5877f7ed384dabfdcc37922c3191bf27e55b498fecece9fd5c2c7aaa34c33", + "sha256:8d2900b7f5318bc7ad8631d3d40190b95ef2aa8cc59473b73b294e4a55e9f30f", + "sha256:8d7b4beebb178e9183138f552238f7e6613162a42164233e2bda00cb3afac58f", + "sha256:8f52fe6859b9db71ee609b0c0a70fea5f1e71c3462ecf144ca800d3f434f0764", + "sha256:98f3f020a2b736566c707c8e034945c02aa94e124c24f77ca097c446f81b01f1", + "sha256:9aa543980ab1fbf1720969af1d99095a548ea42e00361e727c58a40832439114", + "sha256:9b99f564659cfa704a2dd82d0684207b1aadf7d02d33e54845f9fc78e06b7581", + "sha256:9bcf86dfc8ff3e992fed847c077bd875d9e0ba2fa25d859c3a0f0f76f07f0c8d", + "sha256:9bd0ae7cc2b85320abd5e0abad5ccee5564ed5f0cc90245d2f9a8ef330a8deae", + "sha256:9d3c0f8567ffe7502d969c2c1b809892dc793b5d0665f602aad19895f8d508da", + "sha256:9e5ac3437746189a9b4121db2a7b86056ac8786b12e88838696899328fc44bb2", + "sha256:a36c506e5f8aeb40680491d39ed94670487ce6614b9d27cabe45d94cd5d63e1e", + "sha256:a5ab722ae5a873d8dcee1f5f45ddd93c34210aed44ff2dc643b5025981908cda", + "sha256:a96f02ba1bcd330807fc060ed91d1f7a20853da6dd449e5da4b09bfcc08fdcf5", + "sha256:acb6b2f96f60f70e7f34efe0c3ea34ca63f19ca63ce90019c6cbca6b676e81fa", + "sha256:ae15347a88cf8af0949a9872b57a320d2605ae069bcdf047677318bc0bba45b1", + "sha256:af8920ce4a55ff41167ddbc20077f5698c2e710ad3353d32a07d3264f3a2021e", + "sha256:afd825e30f8d1f521713a5669b63657bcfe5980a916c95855060048b88e1adb7", + "sha256:b21b4031b53d25b0858d4e124f2f9131ffc1530431c6d1321805c90da78388d1", + "sha256:b4b68c961b5cc402cbd99cca5eb2547e46ce77260eb705f4d117fd9c3f932b95", + "sha256:b66aa6357b265670bb574f050ffceefb98549c721cf28351b748be1ef9577d93", + "sha256:b9e240ae0ba96477682aa87899d94ddec1cc7926f9df29b1dd57b39e797d5ab5", + "sha256:bc64d1b1dab08f679fb89c368f4c05693f58a9faf744c4d390d7ed1d8223869b", + "sha256:bf8443781533b8d37b295016a4b53c1494fa9a03573c09ca5104550c138d5c05", + "sha256:c26aab6ea9c54d3bed716b8851c8bfc40cb249b8e9880e250d1eddde9f709bf5", + "sha256:c3cd1fc1dc7c376c54440aeaaa0dcc803d2126732ff5c6b68ccd619f2e64be4f", + "sha256:c7257171bb8d4432fe9d6fdde4d55fdbe663a63636a17f7f9aaba9bcb3153ad7", + "sha256:d42e3a3fc18acc88b838efded0e6ec3edf3e328a58c68fbd36a7263a874906c8", + "sha256:d74fcaf87132ffc0447b3c685a9f862ffb5b43e70ea6beec2fb8057d5d2a1fea", + "sha256:d8c1d679df4361408b628f42b26a5d62bd3e9ba7f0c0e7969f925021554755aa", + "sha256:e856c1c7255c739434489ec9c8aa9cdf5179785d10ff20add308b5d673bed5cd", + "sha256:eac68f96539b32fce2c9b47eb7c25bb2582bdaf1bbb360d25f564ee9e04c542b", + "sha256:ed7326563024b6e91fef6b6c7a1a2ff0a71b97793ac33dbbcf38f6005e51ff6e", + "sha256:ed8c3d2cd329bf779b7ed38db176738f3f8be637bb395ce9629fc76f78afe3d4", + "sha256:f4c9bda132ad108b387c33fabfea47866af87f4ea6ffb79418004f0521e63204", + "sha256:f643ffd2669ffd4b5a3e9b41c909b72b2a1d5e4915da90a77e119b8d48ce867a" ], - "version": "==4.9.4" + "markers": "python_version >= '3.10'", + "version": "==5.1.0" }, "markdown-it-py": { "hashes": [ @@ -1399,15 +1410,6 @@ "markers": "python_version >= '3.9'", "version": "==15.4.4" }, - "openai": { - "hashes": [ - "sha256:99c5d257d09ea6533d689d1cc77caa0ac679fa21efef8893d8b0832a86877f1b", - "sha256:a54002c814e05222e413664f651b5916714e4700d041d5cf5724d3ae1a3e3481" - ], - "index": "pypi", - "markers": "python_full_version >= '3.7.1'", - "version": "==1.12.0" - }, "packaging": { "hashes": [ "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5", @@ -2045,6 +2047,14 @@ "markers": "python_version >= '3.6' and python_version < '4'", "version": "==4.9" }, + "s3transfer": { + "hashes": [ + "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e", + "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b" + ], + "markers": "python_version >= '3.8'", + "version": "==0.10.0" + }, "scikit-learn": { "hashes": [ "sha256:0df87de9ce1c0140f2818beef310fb2e2afdc1e66fc9ad587965577f17733649", @@ -2292,11 +2302,11 @@ }, "urllib3": { "hashes": [ - "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d", - "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19" + "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84", + "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e" ], - "markers": "python_version >= '3.8'", - "version": "==2.2.1" + "markers": "python_version >= '3.10'", + "version": "==2.0.7" }, "uvicorn": { "extras": [ @@ -3231,14 +3241,6 @@ "index": "pypi", "version": "==4.3.1" }, - "importlib-metadata": { - "hashes": [ - "sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e", - "sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc" - ], - "markers": "python_version < '3.10'", - "version": "==7.0.1" - }, "incremental": { "hashes": [ "sha256:912feeb5e0f7e0188e6f42241d2f450002e11bbc0937c65865045854c24c0bd0", @@ -4065,14 +4067,6 @@ "markers": "python_version >= '3.8'", "version": "==4.0.0" }, - "zipp": { - "hashes": [ - "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31", - "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0" - ], - "markers": "python_version >= '3.8'", - "version": "==3.17.0" - }, "zope-interface": { "hashes": [ "sha256:02adbab560683c4eca3789cc0ac487dcc5f5a81cc48695ec247f00803cafe2fe", @@ -4510,7 +4504,6 @@ "sha256:9acd36fef264d9ed5a96345c45f7d80f0d967059e92213998b3046fbb64f67fc", "sha256:d6861d9d68e8268a5346d8a43d14727e6c636ebc6d49f2b8fc034c25996d35dd" ], - "index": "pypi", "markers": "python_version >= '3.8'", "version": "==3.5.0.20240129" }, @@ -4519,7 +4512,6 @@ "sha256:062c5a0f20301a30f2df4db583f15b3c2a1283a12518d1f9d81396154e12c1af", "sha256:4800b61bf7eabdae2f1b17ade0d080709ed33e9f26a2e900e470e8b56ebe2387" ], - "index": "pypi", "markers": "python_version >= '3.8'", "version": "==10.2.0.20240213" }, @@ -4537,7 +4529,6 @@ "sha256:8052c574b0ab8f2dc94bdc4a31b9d48e8aa5a0f12398ef40cadadbe551da949b", "sha256:92e62ac37793e567cd2b0f64f1456c24fccce4041d9c5f869697a6739fde4fce" ], - "index": "pypi", "markers": "python_version >= '3.8'", "version": "==2.17.0.20240106" }, diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 98121cb15..2d8de6e10 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -1158,3 +1158,5 @@ if DEBUG: # pragma: no cover REMOTE_PARSER_ENGINE = os.getenv("PAPERLESS_REMOTE_PARSER_ENGINE") REMOTE_PARSER_API_KEY = os.getenv("PAPERLESS_REMOTE_PARSER_API_KEY") REMOTE_PARSER_ENDPOINT = os.getenv("PAPERLESS_REMOTE_PARSER_ENDPOINT") +REMOTE_PARSER_API_KEY_ID = os.getenv("PAPERLESS_REMOTE_PARSER_API_KEY_ID") +REMOTE_PARSER_REGION = os.getenv("PAPERLESS_REMOTE_PARSER_REGION") diff --git a/src/paperless_remote/checks.py b/src/paperless_remote/checks.py index 39ba4d305..2f9d2ee67 100644 --- a/src/paperless_remote/checks.py +++ b/src/paperless_remote/checks.py @@ -22,4 +22,13 @@ def check_remote_parser_configured(app_configs, **kwargs): ), ] + if settings.REMOTE_PARSER_ENGINE == "awstextract" and ( + not settings.REMOTE_PARSER_API_KEY_ID or not settings.REMOTE_PARSER_REGION + ): + return [ + Error( + "AWS Textract remote parser requires access key ID and region to be configured.", + ), + ] + return [] diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py index df92d6bf0..bccfcc1c1 100644 --- a/src/paperless_remote/parsers.py +++ b/src/paperless_remote/parsers.py @@ -8,15 +8,29 @@ from paperless_tesseract.parsers import RasterisedDocumentParser class RemoteEngineConfig: - def __init__(self, engine: str, api_key: str, endpoint: Optional[str] = None): + def __init__( + self, + engine: str, + api_key: str, + endpoint: Optional[str] = None, + api_key_id: Optional[str] = None, + region: Optional[str] = None, + ): self.engine = engine self.api_key = api_key self.endpoint = endpoint + self.api_key_id = api_key_id + self.region = region def engine_is_valid(self): - valid = self.engine in ["chatgpt", "azureaivision"] and self.api_key is not None + valid = ( + self.engine in ["azureaivision", "awstextract", "googlecloudvision"] + and self.api_key is not None + ) if self.engine == "azureaivision": valid = valid and self.endpoint is not None + if self.engine == "awstextract": + valid = valid and self.region is not None and self.api_key_id is not None return valid @@ -35,6 +49,8 @@ class RemoteDocumentParser(RasterisedDocumentParser): engine=settings.REMOTE_PARSER_ENGINE, api_key=settings.REMOTE_PARSER_API_KEY, endpoint=settings.REMOTE_PARSER_ENDPOINT, + api_key_id=settings.REMOTE_PARSER_API_KEY_ID, + region=settings.REMOTE_PARSER_REGION, ) def supported_mime_types(self): @@ -57,47 +73,36 @@ class RemoteDocumentParser(RasterisedDocumentParser): else: return [] - def chatgpt_parse( + def aws_textract_parse( self, file: Path, ) -> Optional[str]: - # does not work - from openai import OpenAI + import boto3 - client = OpenAI( - api_key=self.settings.api_key, + client = boto3.client( + "textract", + region_name=self.settings.region, + aws_access_key_id=self.settings.api_key_id, + aws_secret_access_key=self.settings.api_key, ) - assistants = client.beta.assistants.list() - for assistant in assistants.data: - if assistant.name == "Paperless-ngx Document Parser": - assistant = assistant - break - if not assistant: - assistant = client.beta.assistants.create( - model="gpt-3.5-turbo", - tools=[{"type": "code_interpreter"}], - name="Paperless-ngx Document Parser", - ) - self.log.info("Uploading document to OpenAI...") - gpt_file = client.files.create(file=file, purpose="assistants") - client.files.wait_for_processing(gpt_file.id) - client.beta.assistants.update(assistant_id=assistant.id, files=[gpt_file.id]) - thread = client.beta.threads.create() - client.beta.threads.messages.create( - thread_id=thread.id, - role="user", - content="Output the text of the file", + lines = [] + with open(file, "rb") as f: + file_bytes = f.read() + file_bytearray = bytearray(file_bytes) + + self.log.info("Analyzing document with AWS Textract...") + response = client.analyze_document( + Document={"Bytes": file_bytearray}, + FeatureTypes=["TABLES"], ) - client.beta.threads.runs.create( - thread_id=thread, - assistant_id=assistant.id, - ) - response = client.beta.threads.messages.list( - thread_id=thread.id, - ) - self.text = response.data[0].content[0].text.value - client.files.delete(gpt_file.id) + + blocks = response["Blocks"] + for block in blocks: + if block["BlockType"] == "LINE": + lines.append(block["Text"]) + + return "\n".join(lines) def azure_ai_vision_parse( self, @@ -197,15 +202,9 @@ class RemoteDocumentParser(RasterisedDocumentParser): ) self.text = "" return - elif self.settings.engine == "chatgpt": - self.text = self.chatgpt_parse(document_path) elif self.settings.engine == "azureaivision": self.text = self.azure_ai_vision_parse(document_path) + elif self.settings.engine == "awstextract": + self.text = self.aws_textract_parse(document_path) elif self.settings.engine == "googlecloudvision": self.text = self.google_cloud_vision_parse(document_path, mime_type) - else: - self.log.warning( - "No valid remote parser engine is configured, content will be empty.", - ) - self.text = "" - return diff --git a/src/paperless_remote/tests/test_checks.py b/src/paperless_remote/tests/test_checks.py index 3810f1dcd..88b3a2d6f 100644 --- a/src/paperless_remote/tests/test_checks.py +++ b/src/paperless_remote/tests/test_checks.py @@ -33,6 +33,19 @@ class TestChecks(TestCase): ), ) + @override_settings(REMOTE_PARSER_ENGINE="awstextract") + @override_settings(REMOTE_PARSER_API_KEY="somekey") + @override_settings(REMOTE_PARSER_API_KEY_ID=None) + @override_settings(REMOTE_PARSER_REGION=None) + def test_aws_no_id_or_region(self): + msgs = check_remote_parser_configured(None) + self.assertEqual(len(msgs), 1) + self.assertTrue( + msgs[0].msg.startswith( + "AWS Textract remote parser requires access key ID and region to be configured.", + ), + ) + @override_settings(REMOTE_PARSER_ENGINE="something") @override_settings(REMOTE_PARSER_API_KEY="somekey") def test_valid_configuration(self): diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py index af636203c..ebdea8498 100644 --- a/src/paperless_remote/tests/test_parser.py +++ b/src/paperless_remote/tests/test_parser.py @@ -1,9 +1,13 @@ +import uuid from pathlib import Path +from unittest import mock from django.test import TestCase +from django.test import override_settings from documents.tests.utils import DirectoriesMixin from documents.tests.utils import FileSystemAssertsMixin +from paperless_remote.parsers import RemoteDocumentParser class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): @@ -19,27 +23,55 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.fail(f"'{s}' is not in '{content}'") self.assertListEqual(indices, sorted(indices)) - # Currently test is not working on 3.11 on CI but works locally. Dont know why. - # @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") - # def test_get_text_with_azure(self, mock_begin_analyze_document): - # result = mock.Mock() - # result.content = "This is a test document." - # mock_begin_analyze_document.return_value.result.return_value = result + @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient") + def test_get_text_with_azure(self, mock_azure_client): + result = mock.Mock() + result.content = "This is a test document." - # with override_settings( - # REMOTE_PARSER_ENGINE="azureaivision", - # REMOTE_PARSER_API_KEY="somekey", - # REMOTE_PARSER_ENDPOINT="https://endpoint.cognitiveservices.azure.com/", - # ): - # parser = RemoteDocumentParser(uuid.uuid4()) - # parser.parse( - # self.SAMPLE_FILES / "simple-digital.pdf", - # "application/pdf", - # ) + mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = ( + result + ) - # mock_begin_analyze_document.assert_called_once() + with override_settings( + REMOTE_PARSER_ENGINE="azureaivision", + REMOTE_PARSER_API_KEY="somekey", + REMOTE_PARSER_ENDPOINT="https://endpoint.cognitiveservices.azure.com/", + ): + parser = RemoteDocumentParser(uuid.uuid4()) + parser.parse( + self.SAMPLE_FILES / "simple-digital.pdf", + "application/pdf", + ) - # self.assertContainsStrings( - # parser.text.strip(), - # ["This is a test document."], - # ) + self.assertContainsStrings( + parser.text.strip(), + ["This is a test document."], + ) + + @mock.patch("boto3.client") + def test_get_text_with_awstextract(self, mock_aws_client): + mock_aws_client.return_value.analyze_document.return_value = { + "Blocks": [ + { + "BlockType": "LINE", + "Text": "This is a test document.", + }, + ], + } + + with override_settings( + REMOTE_PARSER_ENGINE="awstextract", + REMOTE_PARSER_API_KEY="somekey", + REMOTE_PARSER_API_KEY_ID="somekeyid", + REMOTE_PARSER_REGION="us-west-2", + ): + parser = RemoteDocumentParser(uuid.uuid4()) + parser.parse( + self.SAMPLE_FILES / "simple-digital.pdf", + "application/pdf", + ) + + self.assertContainsStrings( + parser.text.strip(), + ["This is a test document."], + )