diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 697024a23..23ace6a3a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -106,6 +106,10 @@ jobs: PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }} PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }} PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }} + # Skip Tests which require convert + PAPERLESS_TEST_SKIP_CONVERT: 1 + # Enable Gotenberg end to end testing + GOTENBERG_LIVE: 1 steps: - name: Checkout diff --git a/Pipfile b/Pipfile index dad9a4760..e7702898a 100644 --- a/Pipfile +++ b/Pipfile @@ -60,6 +60,9 @@ setproctitle = "*" nltk = "*" pdf2image = "*" flower = "*" +bleach = "*" +# https://www.piwheels.org/project/cryptography/ last built version +cryptography = "==38.0.1" [dev-packages] coveralls = "*" @@ -76,4 +79,5 @@ black = "*" pre-commit = "*" sphinx-autobuild = "*" myst-parser = "*" +imagehash = "*" mkdocs-material = "*" diff --git a/Pipfile.lock b/Pipfile.lock index d00e7029f..844668913 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "0242e3e296e09b30fb69e0d7a2f2e8feb4c6a23d3c7ec99500f2883a032a8c84" + "sha256": "cbfe9920231de6e7f993962efb3cc371abdb6b08975232d4cf64d1bad1b53d7a" }, "pipfile-spec": 6, "requires": {}, @@ -110,6 +110,14 @@ ], "version": "==3.6.4.0" }, + "bleach": { + "hashes": [ + "sha256:085f7f33c15bd408dd9b17a4ad77c577db66d76203e5984b1bd59baeee948b2a", + "sha256:0d03255c47eb9bd2f26aa9bb7f2107732e7e8fe195ca2f64709fcf3b0a4a085c" + ], + "index": "pypi", + "version": "==5.0.1" + }, "celery": { "extras": [ "redis" @@ -219,7 +227,7 @@ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.1.1" }, "click": { @@ -235,7 +243,7 @@ 
"sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667", "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035" ], - "markers": "python_version < '4' and python_full_version >= '3.6.2'", + "markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'", "version": "==0.3.0" }, "click-plugins": { @@ -1625,7 +1633,7 @@ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" ], - "markers": "python_version < '3.10'", + "markers": "python_version >= '3.7'", "version": "==4.4.0" }, "tzdata": { @@ -1767,6 +1775,13 @@ ], "version": "==0.2.5" }, + "webencodings": { + "hashes": [ + "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", + "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" + ], + "version": "==0.5.1" + }, "websockets": { "hashes": [ "sha256:00213676a2e46b6ebf6045bc11d0f529d9120baa6f58d122b4021ad92adabd41", @@ -2055,7 +2070,7 @@ "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" ], - "markers": "python_version >= '3.6'", + "markers": "python_full_version >= '3.6.0'", "version": "==2.1.1" }, "click": { @@ -2075,9 +2090,7 @@ "version": "==0.4.6" }, "coverage": { - "extras": [ - "toml" - ], + "extras": [], "hashes": [ "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79", "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a", @@ -2225,6 +2238,14 @@ "markers": "python_version >= '3.5'", "version": "==3.4" }, + "imagehash": { + "hashes": [ + "sha256:5ad9a5cde14fe255745a8245677293ac0d67f09c330986a351f34b614ba62fb5", + "sha256:7038d1b7f9e0585beb3dd8c0a956f02b95a346c0b5f24a9e8cc03ebadaf0aa70" + ], + "index": "pypi", + "version": "==4.3.1" + }, "imagesize": { "hashes": [ "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", @@ 
-2395,6 +2416,40 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'", "version": "==1.7.0" }, + "numpy": { + "hashes": [ + "sha256:0fe563fc8ed9dc4474cbf70742673fc4391d70f4363f917599a7fa99f042d5a8", + "sha256:12ac457b63ec8ded85d85c1e17d85efd3c2b0967ca39560b307a35a6703a4735", + "sha256:2341f4ab6dba0834b685cce16dad5f9b6606ea8a00e6da154f5dbded70fdc4dd", + "sha256:296d17aed51161dbad3c67ed6d164e51fcd18dbcd5dd4f9d0a9c6055dce30810", + "sha256:488a66cb667359534bc70028d653ba1cf307bae88eab5929cd707c761ff037db", + "sha256:4d52914c88b4930dafb6c48ba5115a96cbab40f45740239d9f4159c4ba779962", + "sha256:5e13030f8793e9ee42f9c7d5777465a560eb78fa7e11b1c053427f2ccab90c79", + "sha256:61be02e3bf810b60ab74e81d6d0d36246dbfb644a462458bb53b595791251911", + "sha256:7607b598217745cc40f751da38ffd03512d33ec06f3523fb0b5f82e09f6f676d", + "sha256:7a70a7d3ce4c0e9284e92285cba91a4a3f5214d87ee0e95928f3614a256a1488", + "sha256:7ab46e4e7ec63c8a5e6dbf5c1b9e1c92ba23a7ebecc86c336cb7bf3bd2fb10e5", + "sha256:8981d9b5619569899666170c7c9748920f4a5005bf79c72c07d08c8a035757b0", + "sha256:8c053d7557a8f022ec823196d242464b6955a7e7e5015b719e76003f63f82d0f", + "sha256:926db372bc4ac1edf81cfb6c59e2a881606b409ddc0d0920b988174b2e2a767f", + "sha256:95d79ada05005f6f4f337d3bb9de8a7774f259341c70bc88047a1f7b96a4bcb2", + "sha256:95de7dc7dc47a312f6feddd3da2500826defdccbc41608d0031276a24181a2c0", + "sha256:a0882323e0ca4245eb0a3d0a74f88ce581cc33aedcfa396e415e5bba7bf05f68", + "sha256:a8365b942f9c1a7d0f0dc974747d99dd0a0cdfc5949a33119caf05cb314682d3", + "sha256:a8aae2fb3180940011b4862b2dd3756616841c53db9734b27bb93813cd79fce6", + "sha256:c237129f0e732885c9a6076a537e974160482eab8f10db6292e92154d4c67d71", + "sha256:c67b833dbccefe97cdd3f52798d430b9d3430396af7cdb2a0c32954c3ef73894", + "sha256:ce03305dd694c4873b9429274fd41fc7eb4e0e4dea07e0af97a933b079a5814f", + "sha256:d331afac87c92373826af83d2b2b435f57b17a5c74e6268b79355b970626e329", + 
"sha256:dada341ebb79619fe00a291185bba370c9803b1e1d7051610e01ed809ef3a4ba", + "sha256:ed2cc92af0efad20198638c69bb0fc2870a58dabfba6eb722c933b48556c686c", + "sha256:f260da502d7441a45695199b4e7fd8ca87db659ba1c78f2bbf31f934fe76ae0e", + "sha256:f2f390aa4da44454db40a1f0201401f9036e8d578a25f01a6e237cea238337ef", + "sha256:f76025acc8e2114bb664294a07ede0727aa75d63a06d2fae96bf29a81747e4a7" + ], + "index": "pypi", + "version": "==1.23.4" + }, "packaging": { "hashes": [ "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", @@ -2411,6 +2466,73 @@ "markers": "python_version >= '3.7'", "version": "==0.10.1" }, + "pillow": { + "hashes": [ + "sha256:03150abd92771742d4a8cd6f2fa6246d847dcd2e332a18d0c15cc75bf6703040", + "sha256:073adb2ae23431d3b9bcbcff3fe698b62ed47211d0716b067385538a1b0f28b8", + "sha256:0b07fffc13f474264c336298d1b4ce01d9c5a011415b79d4ee5527bb69ae6f65", + "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2", + "sha256:12ce4932caf2ddf3e41d17fc9c02d67126935a44b86df6a206cf0d7161548627", + "sha256:15c42fb9dea42465dfd902fb0ecf584b8848ceb28b41ee2b58f866411be33f07", + "sha256:18498994b29e1cf86d505edcb7edbe814d133d2232d256db8c7a8ceb34d18cef", + "sha256:1c7c8ae3864846fc95f4611c78129301e203aaa2af813b703c55d10cc1628535", + "sha256:22b012ea2d065fd163ca096f4e37e47cd8b59cf4b0fd47bfca6abb93df70b34c", + "sha256:276a5ca930c913f714e372b2591a22c4bd3b81a418c0f6635ba832daec1cbcfc", + "sha256:2e0918e03aa0c72ea56edbb00d4d664294815aa11291a11504a377ea018330d3", + "sha256:3033fbe1feb1b59394615a1cafaee85e49d01b51d54de0cbf6aa8e64182518a1", + "sha256:3168434d303babf495d4ba58fc22d6604f6e2afb97adc6a423e917dab828939c", + "sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa", + "sha256:3dd6caf940756101205dffc5367babf288a30043d35f80936f9bfb37f8355b32", + "sha256:40e1ce476a7804b0fb74bcfa80b0a2206ea6a882938eaba917f7a0f004b42502", + "sha256:41e0051336807468be450d52b8edd12ac60bebaa97fe10c8b660f116e50b30e4", + 
"sha256:4390e9ce199fc1951fcfa65795f239a8a4944117b5935a9317fb320e7767b40f", + "sha256:502526a2cbfa431d9fc2a079bdd9061a2397b842bb6bc4239bb176da00993812", + "sha256:51e0e543a33ed92db9f5ef69a0356e0b1a7a6b6a71b80df99f1d181ae5875636", + "sha256:57751894f6618fd4308ed8e0c36c333e2f5469744c34729a27532b3db106ee20", + "sha256:5d77adcd56a42d00cc1be30843d3426aa4e660cab4a61021dc84467123f7a00c", + "sha256:655a83b0058ba47c7c52e4e2df5ecf484c1b0b0349805896dd350cbc416bdd91", + "sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe", + "sha256:6c738585d7a9961d8c2821a1eb3dcb978d14e238be3d70f0a706f7fa9316946b", + "sha256:73bd195e43f3fadecfc50c682f5055ec32ee2c933243cafbfdec69ab1aa87cad", + "sha256:772a91fc0e03eaf922c63badeca75e91baa80fe2f5f87bdaed4280662aad25c9", + "sha256:77ec3e7be99629898c9a6d24a09de089fa5356ee408cdffffe62d67bb75fdd72", + "sha256:7db8b751ad307d7cf238f02101e8e36a128a6cb199326e867d1398067381bff4", + "sha256:801ec82e4188e935c7f5e22e006d01611d6b41661bba9fe45b60e7ac1a8f84de", + "sha256:82409ffe29d70fd733ff3c1025a602abb3e67405d41b9403b00b01debc4c9a29", + "sha256:828989c45c245518065a110434246c44a56a8b2b2f6347d1409c787e6e4651ee", + "sha256:829f97c8e258593b9daa80638aee3789b7df9da5cf1336035016d76f03b8860c", + "sha256:871b72c3643e516db4ecf20efe735deb27fe30ca17800e661d769faab45a18d7", + "sha256:89dca0ce00a2b49024df6325925555d406b14aa3efc2f752dbb5940c52c56b11", + "sha256:90fb88843d3902fe7c9586d439d1e8c05258f41da473952aa8b328d8b907498c", + "sha256:97aabc5c50312afa5e0a2b07c17d4ac5e865b250986f8afe2b02d772567a380c", + "sha256:9aaa107275d8527e9d6e7670b64aabaaa36e5b6bd71a1015ddd21da0d4e06448", + "sha256:9f47eabcd2ded7698106b05c2c338672d16a6f2a485e74481f524e2a23c2794b", + "sha256:a0a06a052c5f37b4ed81c613a455a81f9a3a69429b4fd7bb913c3fa98abefc20", + "sha256:ab388aaa3f6ce52ac1cb8e122c4bd46657c15905904b3120a6248b5b8b0bc228", + "sha256:ad58d27a5b0262c0c19b47d54c5802db9b34d38bbf886665b626aff83c74bacd", + "sha256:ae5331c23ce118c53b172fa64a4c037eb83c9165aba3a7ba9ddd3ec9fa64a699", 
+ "sha256:af0372acb5d3598f36ec0914deed2a63f6bcdb7b606da04dc19a88d31bf0c05b", + "sha256:afa4107d1b306cdf8953edde0534562607fe8811b6c4d9a486298ad31de733b2", + "sha256:b03ae6f1a1878233ac620c98f3459f79fd77c7e3c2b20d460284e1fb370557d4", + "sha256:b0915e734b33a474d76c28e07292f196cdf2a590a0d25bcc06e64e545f2d146c", + "sha256:b4012d06c846dc2b80651b120e2cdd787b013deb39c09f407727ba90015c684f", + "sha256:b472b5ea442148d1c3e2209f20f1e0bb0eb556538690fa70b5e1f79fa0ba8dc2", + "sha256:b59430236b8e58840a0dfb4099a0e8717ffb779c952426a69ae435ca1f57210c", + "sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3", + "sha256:b9a65733d103311331875c1dca05cb4606997fd33d6acfed695b1232ba1df193", + "sha256:bac18ab8d2d1e6b4ce25e3424f709aceef668347db8637c2296bcf41acb7cf48", + "sha256:bca31dd6014cb8b0b2db1e46081b0ca7d936f856da3b39744aef499db5d84d02", + "sha256:be55f8457cd1eac957af0c3f5ece7bc3f033f89b114ef30f710882717670b2a8", + "sha256:c7025dce65566eb6e89f56c9509d4f628fddcedb131d9465cacd3d8bac337e7e", + "sha256:c935a22a557a560108d780f9a0fc426dd7459940dc54faa49d83249c8d3e760f", + "sha256:dbb8e7f2abee51cef77673be97760abff1674ed32847ce04b4af90f610144c7b", + "sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74", + "sha256:ebf2029c1f464c59b8bdbe5143c79fa2045a581ac53679733d3a91d400ff9efb", + "sha256:f1ff2ee69f10f13a9596480335f406dd1f70c3650349e2be67ca3139280cade0" + ], + "index": "pypi", + "version": "==9.3.0" + }, "platformdirs": { "hashes": [ "sha256:0cb405749187a194f444c25c82ef7225232f11564721eabffc6ec70df83b11cb", @@ -2538,6 +2660,37 @@ ], "version": "==2022.6" }, + "pywavelets": { + "hashes": [ + "sha256:030670a213ee8fefa56f6387b0c8e7d970c7f7ad6850dc048bd7c89364771b9b", + "sha256:058b46434eac4c04dd89aeef6fa39e4b6496a951d78c500b6641fd5b2cc2f9f4", + "sha256:231b0e0b1cdc1112f4af3c24eea7bf181c418d37922a67670e9bf6cfa2d544d4", + "sha256:23bafd60350b2b868076d976bdd92f950b3944f119b4754b1d7ff22b7acbf6c6", + 
"sha256:3f19327f2129fb7977bc59b966b4974dfd72879c093e44a7287500a7032695de", + "sha256:47cac4fa25bed76a45bc781a293c26ac63e8eaae9eb8f9be961758d22b58649c", + "sha256:578af438a02a86b70f1975b546f68aaaf38f28fb082a61ceb799816049ed18aa", + "sha256:6437af3ddf083118c26d8f97ab43b0724b956c9f958e9ea788659f6a2834ba93", + "sha256:64c6bac6204327321db30b775060fbe8e8642316e6bff17f06b9f34936f88875", + "sha256:67a0d28a08909f21400cb09ff62ba94c064882ffd9e3a6b27880a111211d59bd", + "sha256:71ab30f51ee4470741bb55fc6b197b4a2b612232e30f6ac069106f0156342356", + "sha256:7231461d7a8eb3bdc7aa2d97d9f67ea5a9f8902522818e7e2ead9c2b3408eeb1", + "sha256:754fa5085768227c4f4a26c1e0c78bc509a266d9ebd0eb69a278be7e3ece943c", + "sha256:7ab8d9db0fe549ab2ee0bea61f614e658dd2df419d5b75fba47baa761e95f8f2", + "sha256:875d4d620eee655346e3589a16a73790cf9f8917abba062234439b594e706784", + "sha256:88aa5449e109d8f5e7f0adef85f7f73b1ab086102865be64421a3a3d02d277f4", + "sha256:91d3d393cffa634f0e550d88c0e3f217c96cfb9e32781f2960876f1808d9b45b", + "sha256:9cb5ca8d11d3f98e89e65796a2125be98424d22e5ada360a0dbabff659fca0fc", + "sha256:ab7da0a17822cd2f6545626946d3b82d1a8e106afc4b50e3387719ba01c7b966", + "sha256:ad987748f60418d5f4138db89d82ba0cb49b086e0cbb8fd5c3ed4a814cfb705e", + "sha256:d0e56cd7a53aed3cceca91a04d62feb3a0aca6725b1912d29546c26f6ea90426", + "sha256:d854411eb5ee9cb4bc5d0e66e3634aeb8f594210f6a1bed96dbed57ec70f181c", + "sha256:da7b9c006171be1f9ddb12cc6e0d3d703b95f7f43cb5e2c6f5f15d3233fcf202", + "sha256:daf0aa79842b571308d7c31a9c43bc99a30b6328e6aea3f50388cd8f69ba7dbc", + "sha256:de7cd61a88a982edfec01ea755b0740e94766e00a1ceceeafef3ed4c85c605cd" + ], + "markers": "python_version >= '3.8'", + "version": "==1.4.1" + }, "pyyaml": { "hashes": [ "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf", @@ -2599,6 +2752,35 @@ "markers": "python_version >= '3.7' and python_version < '4'", "version": "==2.28.1" }, + "scipy": { + "hashes": [ + 
"sha256:02b567e722d62bddd4ac253dafb01ce7ed8742cf8031aea030a41414b86c1125", + "sha256:1166514aa3bbf04cb5941027c6e294a000bba0cf00f5cdac6c77f2dad479b434", + "sha256:1da52b45ce1a24a4a22db6c157c38b39885a990a566748fc904ec9f03ed8c6ba", + "sha256:23b22fbeef3807966ea42d8163322366dd89da9bebdc075da7034cee3a1441ca", + "sha256:28d2cab0c6ac5aa131cc5071a3a1d8e1366dad82288d9ec2ca44df78fb50e649", + "sha256:2ef0fbc8bcf102c1998c1f16f15befe7cffba90895d6e84861cd6c6a33fb54f6", + "sha256:3b69b90c9419884efeffaac2c38376d6ef566e6e730a231e15722b0ab58f0328", + "sha256:4b93ec6f4c3c4d041b26b5f179a6aab8f5045423117ae7a45ba9710301d7e462", + "sha256:4e53a55f6a4f22de01ffe1d2f016e30adedb67a699a310cdcac312806807ca81", + "sha256:6311e3ae9cc75f77c33076cb2794fb0606f14c8f1b1c9ff8ce6005ba2c283621", + "sha256:65b77f20202599c51eb2771d11a6b899b97989159b7975e9b5259594f1d35ef4", + "sha256:6cc6b33139eb63f30725d5f7fa175763dc2df6a8f38ddf8df971f7c345b652dc", + "sha256:70de2f11bf64ca9921fda018864c78af7147025e467ce9f4a11bc877266900a6", + "sha256:70ebc84134cf0c504ce6a5f12d6db92cb2a8a53a49437a6bb4edca0bc101f11c", + "sha256:83606129247e7610b58d0e1e93d2c5133959e9cf93555d3c27e536892f1ba1f2", + "sha256:93d07494a8900d55492401917a119948ed330b8c3f1d700e0b904a578f10ead4", + "sha256:9c4e3ae8a716c8b3151e16c05edb1daf4cb4d866caa385e861556aff41300c14", + "sha256:9dd4012ac599a1e7eb63c114d1eee1bcfc6dc75a29b589ff0ad0bb3d9412034f", + "sha256:9e3fb1b0e896f14a85aa9a28d5f755daaeeb54c897b746df7a55ccb02b340f33", + "sha256:a0aa8220b89b2e3748a2836fbfa116194378910f1a6e78e4675a095bcd2c762d", + "sha256:d3b3c8924252caaffc54d4a99f1360aeec001e61267595561089f8b5900821bb", + "sha256:e013aed00ed776d790be4cb32826adb72799c61e318676172495383ba4570aa4", + "sha256:f3e7a8867f307e3359cc0ed2c63b61a1e33a19080f92fe377bc7d49f646f2ec1" + ], + "index": "pypi", + "version": "==1.8.1" + }, "setuptools": { "hashes": [ "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31", @@ -2714,7 +2896,7 @@ 
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" ], - "markers": "python_version < '3.11' and python_version >= '3.7'", + "markers": "python_full_version < '3.11.0a7'", "version": "==2.0.1" }, "tornado": { @@ -2747,7 +2929,7 @@ "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" ], - "markers": "python_version < '3.10'", + "markers": "python_version >= '3.7'", "version": "==4.4.0" }, "urllib3": { diff --git a/docker/compose/docker-compose.ci-test.yml b/docker/compose/docker-compose.ci-test.yml index 87bc8b7f2..b1b8d2179 100644 --- a/docker/compose/docker-compose.ci-test.yml +++ b/docker/compose/docker-compose.ci-test.yml @@ -11,9 +11,12 @@ services: container_name: gotenberg network_mode: host restart: unless-stopped + # The gotenberg chromium route is used to convert .eml files. We do not + # want to allow external content like tracking pixels or even javascript. command: - "gotenberg" - - "--chromium-disable-routes=true" + - "--chromium-disable-javascript=true" + - "--chromium-allow-list=file:///tmp/.*" tika: image: ghcr.io/paperless-ngx/tika:latest hostname: tika diff --git a/docker/compose/docker-compose.mariadb-tika.yml b/docker/compose/docker-compose.mariadb-tika.yml index 22f69ba4f..4bbb390f0 100644 --- a/docker/compose/docker-compose.mariadb-tika.yml +++ b/docker/compose/docker-compose.mariadb-tika.yml @@ -87,9 +87,12 @@ services: gotenberg: image: docker.io/gotenberg/gotenberg:7.6 restart: unless-stopped + # The gotenberg chromium route is used to convert .eml files. We do not + # want to allow external content like tracking pixels or even javascript. 
command: - "gotenberg" - - "--chromium-disable-routes=true" + - "--chromium-disable-javascript=true" + - "--chromium-allow-list=file:///tmp/.*" tika: image: ghcr.io/paperless-ngx/tika:latest diff --git a/docker/compose/docker-compose.postgres-tika.yml b/docker/compose/docker-compose.postgres-tika.yml index c4333ad35..1158e7d67 100644 --- a/docker/compose/docker-compose.postgres-tika.yml +++ b/docker/compose/docker-compose.postgres-tika.yml @@ -79,9 +79,13 @@ services: gotenberg: image: docker.io/gotenberg/gotenberg:7.6 restart: unless-stopped + + # The gotenberg chromium route is used to convert .eml files. We do not + # want to allow external content like tracking pixels or even javascript. command: - "gotenberg" - - "--chromium-disable-routes=true" + - "--chromium-disable-javascript=true" + - "--chromium-allow-list=file:///tmp/.*" tika: image: ghcr.io/paperless-ngx/tika:latest diff --git a/docker/compose/docker-compose.sqlite-tika.yml b/docker/compose/docker-compose.sqlite-tika.yml index a7b9a4ebe..a331c1ad1 100644 --- a/docker/compose/docker-compose.sqlite-tika.yml +++ b/docker/compose/docker-compose.sqlite-tika.yml @@ -67,9 +67,13 @@ services: gotenberg: image: docker.io/gotenberg/gotenberg:7.6 restart: unless-stopped + + # The gotenberg chromium route is used to convert .eml files. We do not + # want to allow external content like tracking pixels or even javascript. command: - "gotenberg" - - "--chromium-disable-routes=true" + - "--chromium-disable-javascript=true" + - "--chromium-allow-list=file:///tmp/.*" tika: image: ghcr.io/paperless-ngx/tika:latest diff --git a/docs/configuration.md b/docs/configuration.md index ec4cf7765..bcde72e5f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -565,8 +565,10 @@ they use underscores instead of dashes. Paperless can make use of [Tika](https://tika.apache.org/) and [Gotenberg](https://gotenberg.dev/) for parsing and converting -"Office" documents (such as ".doc", ".xlsx" and ".odt"). 
If you -wish to use this, you must provide a Tika server and a Gotenberg server, +"Office" documents (such as ".doc", ".xlsx" and ".odt"). +Tika and Gotenberg are also needed to allow parsing of E-Mails (.eml). + +If you wish to use this, you must provide a Tika server and a Gotenberg server, configure their endpoints, and enable the feature. `PAPERLESS_TIKA_ENABLED=` @@ -605,14 +607,17 @@ services: PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000 PAPERLESS_TIKA_ENDPOINT: http://tika:9998 - # ... + # ... - gotenberg: - image: gotenberg/gotenberg:7.6 - restart: unless-stopped - command: - - 'gotenberg' - - '--chromium-disable-routes=true' + gotenberg: + image: gotenberg/gotenberg:7.6 + restart: unless-stopped + # The gotenberg chromium route is used to convert .eml files. We do not + # want to allow external content like tracking pixels or even javascript. + command: + - 'gotenberg' + - '--chromium-disable-javascript=true' + - '--chromium-allow-list=file:///tmp/.*' tika: image: ghcr.io/paperless-ngx/tika:latest diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 53d0e1de3..f522058a5 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -125,13 +125,13 @@ using docker-compose, this is achieved by the following configuration change in the `docker-compose.yml` file: ```yaml -gotenberg: - image: gotenberg/gotenberg:7.6 - restart: unless-stopped - command: - - 'gotenberg' - - '--chromium-disable-routes=true' - - '--api-timeout=60' +# The gotenberg chromium route is used to convert .eml files. We do not +# want to allow external content like tracking pixels or even javascript. 
+command: + - 'gotenberg' + - '--chromium-disable-javascript=true' + - '--chromium-allow-list=file:///tmp/.*' + - '--api-timeout=60' ``` ## Permission denied errors in the consumption directory diff --git a/scripts/start_services.sh b/scripts/start_services.sh index 97ef0cba7..d53698a27 100755 --- a/scripts/start_services.sh +++ b/scripts/start_services.sh @@ -2,5 +2,5 @@ docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -v paperless_pgdata:/var/lib/postgresql/data -d postgres:13 docker run -d -p 6379:6379 redis:latest -docker run -p 3000:3000 -d gotenberg/gotenberg:7.6 +docker run -p 3000:3000 -d gotenberg/gotenberg:7.6 gotenberg --chromium-disable-javascript=true --chromium-allow-list="file:///tmp/.*" docker run -p 9998:9998 -d ghcr.io/paperless-ngx/tika:latest diff --git a/src/paperless_mail/admin.py b/src/paperless_mail/admin.py index 3b488b153..ce5341e4e 100644 --- a/src/paperless_mail/admin.py +++ b/src/paperless_mail/admin.py @@ -56,6 +56,7 @@ class MailRuleAdmin(admin.ModelAdmin): "filter_body", "filter_attachment_filename", "maximum_age", + "consumption_scope", "attachment_type", ), }, @@ -65,8 +66,8 @@ class MailRuleAdmin(admin.ModelAdmin): { "description": _( "The action applied to the mail. This action is only " - "performed when documents were consumed from the mail. 
" - "Mails without attachments will remain entirely untouched.", + "performed when the mail body or attachments were " + "consumed from the mail.", ), "fields": ("action", "action_parameter"), }, diff --git a/src/paperless_mail/apps.py b/src/paperless_mail/apps.py index 1c5d656e0..719400e76 100644 --- a/src/paperless_mail/apps.py +++ b/src/paperless_mail/apps.py @@ -1,8 +1,17 @@ from django.apps import AppConfig +from django.conf import settings from django.utils.translation import gettext_lazy as _ +from paperless_mail.signals import mail_consumer_declaration class PaperlessMailConfig(AppConfig): name = "paperless_mail" verbose_name = _("Paperless mail") + + def ready(self): + from documents.signals import document_consumer_declaration + + if settings.TIKA_ENABLED: + document_consumer_declaration.connect(mail_consumer_declaration) + AppConfig.ready(self) diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index 145aebec4..9ac03db6e 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -350,9 +350,16 @@ class MailAccountHandler(LoggingMixin): return total_processed_files - def handle_message(self, message, rule) -> int: - if not message.attachments: - return 0 + def handle_message(self, message, rule: MailRule) -> int: + processed_elements = 0 + + # Skip Message handling when only attachments are to be processed but + # message doesn't have any. 
+ if ( + not message.attachments + and rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY + ): + return processed_elements self.log( "debug", @@ -365,8 +372,41 @@ class MailAccountHandler(LoggingMixin): tag_ids = [tag.id for tag in rule.assign_tags.all()] doc_type = rule.assign_document_type - processed_attachments = 0 + if ( + rule.consumption_scope == MailRule.ConsumptionScope.EML_ONLY + or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING + ): + processed_elements += self.process_eml( + message, + rule, + correspondent, + tag_ids, + doc_type, + ) + if ( + rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY + or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING + ): + processed_elements += self.process_attachments( + message, + rule, + correspondent, + tag_ids, + doc_type, + ) + + return processed_elements + + def process_attachments( + self, + message: MailMessage, + rule: MailRule, + correspondent, + tag_ids, + doc_type, + ): + processed_attachments = 0 for att in message.attachments: if ( @@ -436,5 +476,59 @@ class MailAccountHandler(LoggingMixin): f"since guessed mime type {mime_type} is not supported " f"by paperless", ) - return processed_attachments + + def process_eml( + self, + message: MailMessage, + rule: MailRule, + correspondent, + tag_ids, + doc_type, + ): + os.makedirs(settings.SCRATCH_DIR, exist_ok=True) + _, temp_filename = tempfile.mkstemp( + prefix="paperless-mail-", + dir=settings.SCRATCH_DIR, + suffix=".eml", + ) + with open(temp_filename, "wb") as f: + # Move "From"-header to beginning of file + # TODO: This ugly workaround is needed because the parser is + # chosen only by the mime_type detected via magic + # (see documents/consumer.py "mime_type = magic.from_file") + # Unfortunately magic sometimes fails to detect the mime + # type of .eml files correctly as message/rfc822 and instead + # detects text/plain. 
+ # This also affects direct file consumption of .eml files + # which are not treated with this workaround. + from_element = None + for i, header in enumerate(message.obj._headers): + if header[0] == "From": + from_element = i + if from_element: + new_headers = [message.obj._headers.pop(from_element)] + new_headers += message.obj._headers + message.obj._headers = new_headers + + f.write(message.obj.as_bytes()) + + self.log( + "info", + f"Rule {rule}: " + f"Consuming eml from mail " + f"{message.subject} from {message.from_}", + ) + + consume_file.delay( + path=temp_filename, + override_filename=pathvalidate.sanitize_filename( + message.subject + ".eml", + ), + override_title=message.subject, + override_correspondent_id=correspondent.id if correspondent else None, + override_document_type_id=doc_type.id if doc_type else None, + override_tag_ids=tag_ids, + ) + processed_elements = 1 + return processed_elements diff --git a/src/paperless_mail/migrations/0016_mailrule_consumption_scope.py b/src/paperless_mail/migrations/0016_mailrule_consumption_scope.py new file mode 100644 index 000000000..ea54bce1b --- /dev/null +++ b/src/paperless_mail/migrations/0016_mailrule_consumption_scope.py @@ -0,0 +1,32 @@ +# Generated by Django 4.0.4 on 2022-07-11 22:02 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("paperless_mail", "0015_alter_mailrule_action"), + ] + + operations = [ + migrations.AddField( + model_name="mailrule", + name="consumption_scope", + field=models.PositiveIntegerField( + choices=[ + (1, "Only process attachments."), + ( + 2, + "Process full Mail (with embedded attachments in file) as .eml", + ), + ( + 3, + "Process full Mail (with embedded attachments in file) as .eml + process attachments as separate documents", + ), + ], + default=1, + verbose_name="consumption scope", + ), + ), + ] diff --git a/src/paperless_mail/models.py b/src/paperless_mail/models.py index 4c0a1a557..a7267cc06 100644 --- 
a/src/paperless_mail/models.py +++ b/src/paperless_mail/models.py @@ -56,6 +56,14 @@ class MailRule(models.Model): verbose_name = _("mail rule") verbose_name_plural = _("mail rules") + class ConsumptionScope(models.IntegerChoices): + ATTACHMENTS_ONLY = 1, _("Only process attachments.") + EML_ONLY = 2, _("Process full Mail (with embedded attachments in file) as .eml") + EVERYTHING = 3, _( + "Process full Mail (with embedded attachments in file) as .eml " + "+ process attachments as separate documents", + ) + class AttachmentProcessing(models.IntegerChoices): ATTACHMENTS_ONLY = 1, _("Only process attachments.") EVERYTHING = 2, _("Process all files, including 'inline' " "attachments.") @@ -145,6 +153,12 @@ class MailRule(models.Model): ), ) + consumption_scope = models.PositiveIntegerField( + _("consumption scope"), + choices=ConsumptionScope.choices, + default=ConsumptionScope.ATTACHMENTS_ONLY, + ) + action = models.PositiveIntegerField( _("action"), choices=MailAction.choices, diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py new file mode 100644 index 000000000..d50217f2e --- /dev/null +++ b/src/paperless_mail/parsers.py @@ -0,0 +1,333 @@ +import os +import re +from html import escape +from io import BytesIO +from io import StringIO + +import requests +from bleach import clean +from bleach import linkify +from django.conf import settings +from documents.parsers import DocumentParser +from documents.parsers import make_thumbnail_from_pdf +from documents.parsers import ParseError +from humanfriendly import format_size +from imap_tools import MailMessage +from tika import parser + + +class MailDocumentParser(DocumentParser): + """ + This parser uses imap_tools to parse .eml files, generates pdf using + Gotenberg and sends the html part to a local tika server for text extraction. 
+ """ + + gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT + tika_server = settings.TIKA_ENDPOINT + + logging_name = "paperless.parsing.mail" + _parsed = None + + def get_parsed(self, document_path) -> MailMessage: + if not self._parsed: + try: + with open(document_path, "rb") as eml: + self._parsed = MailMessage.from_bytes(eml.read()) + except Exception as err: + raise ParseError( + f"Could not parse {document_path}: {err}", + ) + if not self._parsed.from_values: + self._parsed = None + raise ParseError( + f"Could not parse {document_path}: Missing 'from'", + ) + + return self._parsed + + def get_thumbnail(self, document_path, mime_type, file_name=None): + if not self.archive_path: + self.archive_path = self.generate_pdf(document_path) + + return make_thumbnail_from_pdf( + self.archive_path, + self.tempdir, + self.logging_group, + ) + + def extract_metadata(self, document_path, mime_type): + result = [] + + try: + mail = self.get_parsed(document_path) + except ParseError as e: + self.log( + "warning", + f"Error while fetching document metadata for " f"{document_path}: {e}", + ) + return result + + for key, value in mail.headers.items(): + value = ", ".join(i for i in value) + + result.append( + { + "namespace": "", + "prefix": "header", + "key": key, + "value": value, + }, + ) + + result.append( + { + "namespace": "", + "prefix": "", + "key": "attachments", + "value": ", ".join( + f"{attachment.filename}" + f"({format_size(attachment.size, binary=True)})" + for attachment in mail.attachments + ), + }, + ) + + result.append( + { + "namespace": "", + "prefix": "", + "key": "date", + "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"), + }, + ) + + result.sort(key=lambda item: (item["prefix"], item["key"])) + return result + + def parse(self, document_path, mime_type, file_name=None): + def strip_text(text: str): + text = re.sub(r"\s+", " ", text) + text = re.sub(r"(\n *)+", "\n", text) + return text.strip() + + mail = self.get_parsed(document_path) + + self.text 
= f"Subject: {mail.subject}\n\n" + self.text += f"From: {mail.from_values.full}\n\n" + self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n" + if len(mail.cc_values) >= 1: + self.text += ( + f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n" + ) + if len(mail.bcc_values) >= 1: + self.text += ( + f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n" + ) + if len(mail.attachments) >= 1: + att = [] + for a in mail.attachments: + att.append(f"{a.filename} ({format_size(a.size, binary=True)})") + + self.text += f"Attachments: {', '.join(att)}\n\n" + + if mail.html != "": + self.text += "HTML content: " + strip_text(self.tika_parse(mail.html)) + + self.text += f"\n\n{strip_text(mail.text)}" + + self.date = mail.date + self.archive_path = self.generate_pdf(document_path) + + def tika_parse(self, html: str): + self.log("info", "Sending content to Tika server") + + try: + parsed = parser.from_buffer(html, self.tika_server) + except Exception as err: + raise ParseError( + f"Could not parse content with tika server at " + f"{self.tika_server}: {err}", + ) + if parsed["content"]: + return parsed["content"] + else: + return "" + + def generate_pdf(self, document_path): + pdf_collection = [] + url_merge = self.gotenberg_server + "/forms/pdfengines/merge" + pdf_path = os.path.join(self.tempdir, "merged.pdf") + mail = self.get_parsed(document_path) + + pdf_collection.append(("1_mail.pdf", self.generate_pdf_from_mail(mail))) + + if mail.html == "": + with open(pdf_path, "wb") as file: + file.write(pdf_collection[0][1]) + file.close() + return pdf_path + else: + pdf_collection.append( + ( + "2_html.pdf", + self.generate_pdf_from_html(mail.html, mail.attachments), + ), + ) + + files = {} + for name, content in pdf_collection: + files[name] = (name, BytesIO(content)) + headers = {} + try: + response = requests.post(url_merge, files=files, headers=headers) + response.raise_for_status() # ensure we notice bad responses + except 
Exception as err: + raise ParseError(f"Error while converting document to PDF: {err}") + + with open(pdf_path, "wb") as file: + file.write(response.content) + file.close() + + return pdf_path + + @staticmethod + def mail_to_html(mail: MailMessage) -> StringIO: + data = {} + + def clean_html(text: str): + if isinstance(text, list): + text = "\n".join([str(e) for e in text]) + if type(text) != str: + text = str(text) + text = escape(text) + text = clean(text) + text = linkify(text, parse_email=True) + text = text.replace("\n", "
") + return text + + data["subject"] = clean_html(mail.subject) + if data["subject"] != "": + data["subject_label"] = "Subject" + data["from"] = clean_html(mail.from_values.full) + if data["from"] != "": + data["from_label"] = "From" + data["to"] = clean_html(", ".join(address.full for address in mail.to_values)) + if data["to"] != "": + data["to_label"] = "To" + data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values)) + if data["cc"] != "": + data["cc_label"] = "CC" + data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values)) + if data["bcc"] != "": + data["bcc_label"] = "BCC" + + att = [] + for a in mail.attachments: + att.append(f"{a.filename} ({format_size(a.size, binary=True)})") + data["attachments"] = clean_html(", ".join(att)) + if data["attachments"] != "": + data["attachments_label"] = "Attachments" + + data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M")) + data["content"] = clean_html(mail.text.strip()) + + html = StringIO() + + from django.template.loader import render_to_string + + rendered = render_to_string("email_msg_template.html", context=data) + + html.write(rendered) + html.seek(0) + + return html + + def generate_pdf_from_mail(self, mail): + + url = self.gotenberg_server + "/forms/chromium/convert/html" + self.log("info", "Converting mail to PDF") + + css_file = os.path.join(os.path.dirname(__file__), "templates/output.css") + + with open(css_file, "rb") as css_handle: + + files = { + "html": ("index.html", self.mail_to_html(mail)), + "css": ("output.css", css_handle), + } + headers = {} + data = { + "marginTop": "0.1", + "marginBottom": "0.1", + "marginLeft": "0.1", + "marginRight": "0.1", + "paperWidth": "8.27", + "paperHeight": "11.7", + "scale": "1.0", + } + try: + response = requests.post( + url, + files=files, + headers=headers, + data=data, + ) + response.raise_for_status() # ensure we notice bad responses + except Exception as err: + raise ParseError(f"Error while 
converting document to PDF: {err}") + + return response.content + + @staticmethod + def transform_inline_html(html, attachments): + def clean_html_script(text: str): + compiled_open = re.compile(re.escape("