Merge pull request #848 from p-h-a-i-l/feature-consume-eml

Feature ability to consume mails and eml files
This commit is contained in:
Trenton H 2022-12-04 16:34:41 -08:00 committed by GitHub
commit 4d4d545343
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
39 changed files with 15511 additions and 38 deletions

View File

@ -106,6 +106,10 @@ jobs:
PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }} PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}
PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }} PAPERLESS_MAIL_TEST_USER: ${{ secrets.TEST_MAIL_USER }}
PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }} PAPERLESS_MAIL_TEST_PASSWD: ${{ secrets.TEST_MAIL_PASSWD }}
# Skip Tests which require convert
PAPERLESS_TEST_SKIP_CONVERT: 1
# Enable Gotenberg end to end testing
GOTENBERG_LIVE: 1
steps: steps:
- -
name: Checkout name: Checkout

View File

@ -60,6 +60,9 @@ setproctitle = "*"
nltk = "*" nltk = "*"
pdf2image = "*" pdf2image = "*"
flower = "*" flower = "*"
bleach = "*"
# https://www.piwheels.org/project/cryptography/ last built version
cryptography = "==38.0.1"
[dev-packages] [dev-packages]
coveralls = "*" coveralls = "*"
@ -76,4 +79,5 @@ black = "*"
pre-commit = "*" pre-commit = "*"
sphinx-autobuild = "*" sphinx-autobuild = "*"
myst-parser = "*" myst-parser = "*"
imagehash = "*"
mkdocs-material = "*" mkdocs-material = "*"

202
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "0242e3e296e09b30fb69e0d7a2f2e8feb4c6a23d3c7ec99500f2883a032a8c84" "sha256": "cbfe9920231de6e7f993962efb3cc371abdb6b08975232d4cf64d1bad1b53d7a"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": {}, "requires": {},
@ -110,6 +110,14 @@
], ],
"version": "==3.6.4.0" "version": "==3.6.4.0"
}, },
"bleach": {
"hashes": [
"sha256:085f7f33c15bd408dd9b17a4ad77c577db66d76203e5984b1bd59baeee948b2a",
"sha256:0d03255c47eb9bd2f26aa9bb7f2107732e7e8fe195ca2f64709fcf3b0a4a085c"
],
"index": "pypi",
"version": "==5.0.1"
},
"celery": { "celery": {
"extras": [ "extras": [
"redis" "redis"
@ -219,7 +227,7 @@
"sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
"sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
], ],
"markers": "python_version >= '3.6'", "markers": "python_full_version >= '3.6.0'",
"version": "==2.1.1" "version": "==2.1.1"
}, },
"click": { "click": {
@ -235,7 +243,7 @@
"sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667", "sha256:a0713dc7a1de3f06bc0df5a9567ad19ead2d3d5689b434768a6145bff77c0667",
"sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035" "sha256:f184f0d851d96b6d29297354ed981b7dd71df7ff500d82fa6d11f0856bee8035"
], ],
"markers": "python_version < '4' and python_full_version >= '3.6.2'", "markers": "python_full_version >= '3.6.2' and python_full_version < '4.0.0'",
"version": "==0.3.0" "version": "==0.3.0"
}, },
"click-plugins": { "click-plugins": {
@ -1625,7 +1633,7 @@
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
], ],
"markers": "python_version < '3.10'", "markers": "python_version >= '3.7'",
"version": "==4.4.0" "version": "==4.4.0"
}, },
"tzdata": { "tzdata": {
@ -1767,6 +1775,13 @@
], ],
"version": "==0.2.5" "version": "==0.2.5"
}, },
"webencodings": {
"hashes": [
"sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
"sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
],
"version": "==0.5.1"
},
"websockets": { "websockets": {
"hashes": [ "hashes": [
"sha256:00213676a2e46b6ebf6045bc11d0f529d9120baa6f58d122b4021ad92adabd41", "sha256:00213676a2e46b6ebf6045bc11d0f529d9120baa6f58d122b4021ad92adabd41",
@ -2055,7 +2070,7 @@
"sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845",
"sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f"
], ],
"markers": "python_version >= '3.6'", "markers": "python_full_version >= '3.6.0'",
"version": "==2.1.1" "version": "==2.1.1"
}, },
"click": { "click": {
@ -2075,9 +2090,7 @@
"version": "==0.4.6" "version": "==0.4.6"
}, },
"coverage": { "coverage": {
"extras": [ "extras": [],
"toml"
],
"hashes": [ "hashes": [
"sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79", "sha256:027018943386e7b942fa832372ebc120155fd970837489896099f5cfa2890f79",
"sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a", "sha256:11b990d520ea75e7ee8dcab5bc908072aaada194a794db9f6d7d5cfd19661e5a",
@ -2225,6 +2238,14 @@
"markers": "python_version >= '3.5'", "markers": "python_version >= '3.5'",
"version": "==3.4" "version": "==3.4"
}, },
"imagehash": {
"hashes": [
"sha256:5ad9a5cde14fe255745a8245677293ac0d67f09c330986a351f34b614ba62fb5",
"sha256:7038d1b7f9e0585beb3dd8c0a956f02b95a346c0b5f24a9e8cc03ebadaf0aa70"
],
"index": "pypi",
"version": "==4.3.1"
},
"imagesize": { "imagesize": {
"hashes": [ "hashes": [
"sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b",
@ -2395,6 +2416,40 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'",
"version": "==1.7.0" "version": "==1.7.0"
}, },
"numpy": {
"hashes": [
"sha256:0fe563fc8ed9dc4474cbf70742673fc4391d70f4363f917599a7fa99f042d5a8",
"sha256:12ac457b63ec8ded85d85c1e17d85efd3c2b0967ca39560b307a35a6703a4735",
"sha256:2341f4ab6dba0834b685cce16dad5f9b6606ea8a00e6da154f5dbded70fdc4dd",
"sha256:296d17aed51161dbad3c67ed6d164e51fcd18dbcd5dd4f9d0a9c6055dce30810",
"sha256:488a66cb667359534bc70028d653ba1cf307bae88eab5929cd707c761ff037db",
"sha256:4d52914c88b4930dafb6c48ba5115a96cbab40f45740239d9f4159c4ba779962",
"sha256:5e13030f8793e9ee42f9c7d5777465a560eb78fa7e11b1c053427f2ccab90c79",
"sha256:61be02e3bf810b60ab74e81d6d0d36246dbfb644a462458bb53b595791251911",
"sha256:7607b598217745cc40f751da38ffd03512d33ec06f3523fb0b5f82e09f6f676d",
"sha256:7a70a7d3ce4c0e9284e92285cba91a4a3f5214d87ee0e95928f3614a256a1488",
"sha256:7ab46e4e7ec63c8a5e6dbf5c1b9e1c92ba23a7ebecc86c336cb7bf3bd2fb10e5",
"sha256:8981d9b5619569899666170c7c9748920f4a5005bf79c72c07d08c8a035757b0",
"sha256:8c053d7557a8f022ec823196d242464b6955a7e7e5015b719e76003f63f82d0f",
"sha256:926db372bc4ac1edf81cfb6c59e2a881606b409ddc0d0920b988174b2e2a767f",
"sha256:95d79ada05005f6f4f337d3bb9de8a7774f259341c70bc88047a1f7b96a4bcb2",
"sha256:95de7dc7dc47a312f6feddd3da2500826defdccbc41608d0031276a24181a2c0",
"sha256:a0882323e0ca4245eb0a3d0a74f88ce581cc33aedcfa396e415e5bba7bf05f68",
"sha256:a8365b942f9c1a7d0f0dc974747d99dd0a0cdfc5949a33119caf05cb314682d3",
"sha256:a8aae2fb3180940011b4862b2dd3756616841c53db9734b27bb93813cd79fce6",
"sha256:c237129f0e732885c9a6076a537e974160482eab8f10db6292e92154d4c67d71",
"sha256:c67b833dbccefe97cdd3f52798d430b9d3430396af7cdb2a0c32954c3ef73894",
"sha256:ce03305dd694c4873b9429274fd41fc7eb4e0e4dea07e0af97a933b079a5814f",
"sha256:d331afac87c92373826af83d2b2b435f57b17a5c74e6268b79355b970626e329",
"sha256:dada341ebb79619fe00a291185bba370c9803b1e1d7051610e01ed809ef3a4ba",
"sha256:ed2cc92af0efad20198638c69bb0fc2870a58dabfba6eb722c933b48556c686c",
"sha256:f260da502d7441a45695199b4e7fd8ca87db659ba1c78f2bbf31f934fe76ae0e",
"sha256:f2f390aa4da44454db40a1f0201401f9036e8d578a25f01a6e237cea238337ef",
"sha256:f76025acc8e2114bb664294a07ede0727aa75d63a06d2fae96bf29a81747e4a7"
],
"index": "pypi",
"version": "==1.23.4"
},
"packaging": { "packaging": {
"hashes": [ "hashes": [
"sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb",
@ -2411,6 +2466,73 @@
"markers": "python_version >= '3.7'", "markers": "python_version >= '3.7'",
"version": "==0.10.1" "version": "==0.10.1"
}, },
"pillow": {
"hashes": [
"sha256:03150abd92771742d4a8cd6f2fa6246d847dcd2e332a18d0c15cc75bf6703040",
"sha256:073adb2ae23431d3b9bcbcff3fe698b62ed47211d0716b067385538a1b0f28b8",
"sha256:0b07fffc13f474264c336298d1b4ce01d9c5a011415b79d4ee5527bb69ae6f65",
"sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2",
"sha256:12ce4932caf2ddf3e41d17fc9c02d67126935a44b86df6a206cf0d7161548627",
"sha256:15c42fb9dea42465dfd902fb0ecf584b8848ceb28b41ee2b58f866411be33f07",
"sha256:18498994b29e1cf86d505edcb7edbe814d133d2232d256db8c7a8ceb34d18cef",
"sha256:1c7c8ae3864846fc95f4611c78129301e203aaa2af813b703c55d10cc1628535",
"sha256:22b012ea2d065fd163ca096f4e37e47cd8b59cf4b0fd47bfca6abb93df70b34c",
"sha256:276a5ca930c913f714e372b2591a22c4bd3b81a418c0f6635ba832daec1cbcfc",
"sha256:2e0918e03aa0c72ea56edbb00d4d664294815aa11291a11504a377ea018330d3",
"sha256:3033fbe1feb1b59394615a1cafaee85e49d01b51d54de0cbf6aa8e64182518a1",
"sha256:3168434d303babf495d4ba58fc22d6604f6e2afb97adc6a423e917dab828939c",
"sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa",
"sha256:3dd6caf940756101205dffc5367babf288a30043d35f80936f9bfb37f8355b32",
"sha256:40e1ce476a7804b0fb74bcfa80b0a2206ea6a882938eaba917f7a0f004b42502",
"sha256:41e0051336807468be450d52b8edd12ac60bebaa97fe10c8b660f116e50b30e4",
"sha256:4390e9ce199fc1951fcfa65795f239a8a4944117b5935a9317fb320e7767b40f",
"sha256:502526a2cbfa431d9fc2a079bdd9061a2397b842bb6bc4239bb176da00993812",
"sha256:51e0e543a33ed92db9f5ef69a0356e0b1a7a6b6a71b80df99f1d181ae5875636",
"sha256:57751894f6618fd4308ed8e0c36c333e2f5469744c34729a27532b3db106ee20",
"sha256:5d77adcd56a42d00cc1be30843d3426aa4e660cab4a61021dc84467123f7a00c",
"sha256:655a83b0058ba47c7c52e4e2df5ecf484c1b0b0349805896dd350cbc416bdd91",
"sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe",
"sha256:6c738585d7a9961d8c2821a1eb3dcb978d14e238be3d70f0a706f7fa9316946b",
"sha256:73bd195e43f3fadecfc50c682f5055ec32ee2c933243cafbfdec69ab1aa87cad",
"sha256:772a91fc0e03eaf922c63badeca75e91baa80fe2f5f87bdaed4280662aad25c9",
"sha256:77ec3e7be99629898c9a6d24a09de089fa5356ee408cdffffe62d67bb75fdd72",
"sha256:7db8b751ad307d7cf238f02101e8e36a128a6cb199326e867d1398067381bff4",
"sha256:801ec82e4188e935c7f5e22e006d01611d6b41661bba9fe45b60e7ac1a8f84de",
"sha256:82409ffe29d70fd733ff3c1025a602abb3e67405d41b9403b00b01debc4c9a29",
"sha256:828989c45c245518065a110434246c44a56a8b2b2f6347d1409c787e6e4651ee",
"sha256:829f97c8e258593b9daa80638aee3789b7df9da5cf1336035016d76f03b8860c",
"sha256:871b72c3643e516db4ecf20efe735deb27fe30ca17800e661d769faab45a18d7",
"sha256:89dca0ce00a2b49024df6325925555d406b14aa3efc2f752dbb5940c52c56b11",
"sha256:90fb88843d3902fe7c9586d439d1e8c05258f41da473952aa8b328d8b907498c",
"sha256:97aabc5c50312afa5e0a2b07c17d4ac5e865b250986f8afe2b02d772567a380c",
"sha256:9aaa107275d8527e9d6e7670b64aabaaa36e5b6bd71a1015ddd21da0d4e06448",
"sha256:9f47eabcd2ded7698106b05c2c338672d16a6f2a485e74481f524e2a23c2794b",
"sha256:a0a06a052c5f37b4ed81c613a455a81f9a3a69429b4fd7bb913c3fa98abefc20",
"sha256:ab388aaa3f6ce52ac1cb8e122c4bd46657c15905904b3120a6248b5b8b0bc228",
"sha256:ad58d27a5b0262c0c19b47d54c5802db9b34d38bbf886665b626aff83c74bacd",
"sha256:ae5331c23ce118c53b172fa64a4c037eb83c9165aba3a7ba9ddd3ec9fa64a699",
"sha256:af0372acb5d3598f36ec0914deed2a63f6bcdb7b606da04dc19a88d31bf0c05b",
"sha256:afa4107d1b306cdf8953edde0534562607fe8811b6c4d9a486298ad31de733b2",
"sha256:b03ae6f1a1878233ac620c98f3459f79fd77c7e3c2b20d460284e1fb370557d4",
"sha256:b0915e734b33a474d76c28e07292f196cdf2a590a0d25bcc06e64e545f2d146c",
"sha256:b4012d06c846dc2b80651b120e2cdd787b013deb39c09f407727ba90015c684f",
"sha256:b472b5ea442148d1c3e2209f20f1e0bb0eb556538690fa70b5e1f79fa0ba8dc2",
"sha256:b59430236b8e58840a0dfb4099a0e8717ffb779c952426a69ae435ca1f57210c",
"sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3",
"sha256:b9a65733d103311331875c1dca05cb4606997fd33d6acfed695b1232ba1df193",
"sha256:bac18ab8d2d1e6b4ce25e3424f709aceef668347db8637c2296bcf41acb7cf48",
"sha256:bca31dd6014cb8b0b2db1e46081b0ca7d936f856da3b39744aef499db5d84d02",
"sha256:be55f8457cd1eac957af0c3f5ece7bc3f033f89b114ef30f710882717670b2a8",
"sha256:c7025dce65566eb6e89f56c9509d4f628fddcedb131d9465cacd3d8bac337e7e",
"sha256:c935a22a557a560108d780f9a0fc426dd7459940dc54faa49d83249c8d3e760f",
"sha256:dbb8e7f2abee51cef77673be97760abff1674ed32847ce04b4af90f610144c7b",
"sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74",
"sha256:ebf2029c1f464c59b8bdbe5143c79fa2045a581ac53679733d3a91d400ff9efb",
"sha256:f1ff2ee69f10f13a9596480335f406dd1f70c3650349e2be67ca3139280cade0"
],
"index": "pypi",
"version": "==9.3.0"
},
"platformdirs": { "platformdirs": {
"hashes": [ "hashes": [
"sha256:0cb405749187a194f444c25c82ef7225232f11564721eabffc6ec70df83b11cb", "sha256:0cb405749187a194f444c25c82ef7225232f11564721eabffc6ec70df83b11cb",
@ -2538,6 +2660,37 @@
], ],
"version": "==2022.6" "version": "==2022.6"
}, },
"pywavelets": {
"hashes": [
"sha256:030670a213ee8fefa56f6387b0c8e7d970c7f7ad6850dc048bd7c89364771b9b",
"sha256:058b46434eac4c04dd89aeef6fa39e4b6496a951d78c500b6641fd5b2cc2f9f4",
"sha256:231b0e0b1cdc1112f4af3c24eea7bf181c418d37922a67670e9bf6cfa2d544d4",
"sha256:23bafd60350b2b868076d976bdd92f950b3944f119b4754b1d7ff22b7acbf6c6",
"sha256:3f19327f2129fb7977bc59b966b4974dfd72879c093e44a7287500a7032695de",
"sha256:47cac4fa25bed76a45bc781a293c26ac63e8eaae9eb8f9be961758d22b58649c",
"sha256:578af438a02a86b70f1975b546f68aaaf38f28fb082a61ceb799816049ed18aa",
"sha256:6437af3ddf083118c26d8f97ab43b0724b956c9f958e9ea788659f6a2834ba93",
"sha256:64c6bac6204327321db30b775060fbe8e8642316e6bff17f06b9f34936f88875",
"sha256:67a0d28a08909f21400cb09ff62ba94c064882ffd9e3a6b27880a111211d59bd",
"sha256:71ab30f51ee4470741bb55fc6b197b4a2b612232e30f6ac069106f0156342356",
"sha256:7231461d7a8eb3bdc7aa2d97d9f67ea5a9f8902522818e7e2ead9c2b3408eeb1",
"sha256:754fa5085768227c4f4a26c1e0c78bc509a266d9ebd0eb69a278be7e3ece943c",
"sha256:7ab8d9db0fe549ab2ee0bea61f614e658dd2df419d5b75fba47baa761e95f8f2",
"sha256:875d4d620eee655346e3589a16a73790cf9f8917abba062234439b594e706784",
"sha256:88aa5449e109d8f5e7f0adef85f7f73b1ab086102865be64421a3a3d02d277f4",
"sha256:91d3d393cffa634f0e550d88c0e3f217c96cfb9e32781f2960876f1808d9b45b",
"sha256:9cb5ca8d11d3f98e89e65796a2125be98424d22e5ada360a0dbabff659fca0fc",
"sha256:ab7da0a17822cd2f6545626946d3b82d1a8e106afc4b50e3387719ba01c7b966",
"sha256:ad987748f60418d5f4138db89d82ba0cb49b086e0cbb8fd5c3ed4a814cfb705e",
"sha256:d0e56cd7a53aed3cceca91a04d62feb3a0aca6725b1912d29546c26f6ea90426",
"sha256:d854411eb5ee9cb4bc5d0e66e3634aeb8f594210f6a1bed96dbed57ec70f181c",
"sha256:da7b9c006171be1f9ddb12cc6e0d3d703b95f7f43cb5e2c6f5f15d3233fcf202",
"sha256:daf0aa79842b571308d7c31a9c43bc99a30b6328e6aea3f50388cd8f69ba7dbc",
"sha256:de7cd61a88a982edfec01ea755b0740e94766e00a1ceceeafef3ed4c85c605cd"
],
"markers": "python_version >= '3.8'",
"version": "==1.4.1"
},
"pyyaml": { "pyyaml": {
"hashes": [ "hashes": [
"sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf", "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf",
@ -2599,6 +2752,35 @@
"markers": "python_version >= '3.7' and python_version < '4'", "markers": "python_version >= '3.7' and python_version < '4'",
"version": "==2.28.1" "version": "==2.28.1"
}, },
"scipy": {
"hashes": [
"sha256:02b567e722d62bddd4ac253dafb01ce7ed8742cf8031aea030a41414b86c1125",
"sha256:1166514aa3bbf04cb5941027c6e294a000bba0cf00f5cdac6c77f2dad479b434",
"sha256:1da52b45ce1a24a4a22db6c157c38b39885a990a566748fc904ec9f03ed8c6ba",
"sha256:23b22fbeef3807966ea42d8163322366dd89da9bebdc075da7034cee3a1441ca",
"sha256:28d2cab0c6ac5aa131cc5071a3a1d8e1366dad82288d9ec2ca44df78fb50e649",
"sha256:2ef0fbc8bcf102c1998c1f16f15befe7cffba90895d6e84861cd6c6a33fb54f6",
"sha256:3b69b90c9419884efeffaac2c38376d6ef566e6e730a231e15722b0ab58f0328",
"sha256:4b93ec6f4c3c4d041b26b5f179a6aab8f5045423117ae7a45ba9710301d7e462",
"sha256:4e53a55f6a4f22de01ffe1d2f016e30adedb67a699a310cdcac312806807ca81",
"sha256:6311e3ae9cc75f77c33076cb2794fb0606f14c8f1b1c9ff8ce6005ba2c283621",
"sha256:65b77f20202599c51eb2771d11a6b899b97989159b7975e9b5259594f1d35ef4",
"sha256:6cc6b33139eb63f30725d5f7fa175763dc2df6a8f38ddf8df971f7c345b652dc",
"sha256:70de2f11bf64ca9921fda018864c78af7147025e467ce9f4a11bc877266900a6",
"sha256:70ebc84134cf0c504ce6a5f12d6db92cb2a8a53a49437a6bb4edca0bc101f11c",
"sha256:83606129247e7610b58d0e1e93d2c5133959e9cf93555d3c27e536892f1ba1f2",
"sha256:93d07494a8900d55492401917a119948ed330b8c3f1d700e0b904a578f10ead4",
"sha256:9c4e3ae8a716c8b3151e16c05edb1daf4cb4d866caa385e861556aff41300c14",
"sha256:9dd4012ac599a1e7eb63c114d1eee1bcfc6dc75a29b589ff0ad0bb3d9412034f",
"sha256:9e3fb1b0e896f14a85aa9a28d5f755daaeeb54c897b746df7a55ccb02b340f33",
"sha256:a0aa8220b89b2e3748a2836fbfa116194378910f1a6e78e4675a095bcd2c762d",
"sha256:d3b3c8924252caaffc54d4a99f1360aeec001e61267595561089f8b5900821bb",
"sha256:e013aed00ed776d790be4cb32826adb72799c61e318676172495383ba4570aa4",
"sha256:f3e7a8867f307e3359cc0ed2c63b61a1e33a19080f92fe377bc7d49f646f2ec1"
],
"index": "pypi",
"version": "==1.8.1"
},
"setuptools": { "setuptools": {
"hashes": [ "hashes": [
"sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31", "sha256:d0b9a8433464d5800cbe05094acf5c6d52a91bfac9b52bcfc4d41382be5d5d31",
@ -2714,7 +2896,7 @@
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f" "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"
], ],
"markers": "python_version < '3.11' and python_version >= '3.7'", "markers": "python_full_version < '3.11.0a7'",
"version": "==2.0.1" "version": "==2.0.1"
}, },
"tornado": { "tornado": {
@ -2747,7 +2929,7 @@
"sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa", "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa",
"sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e" "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"
], ],
"markers": "python_version < '3.10'", "markers": "python_version >= '3.7'",
"version": "==4.4.0" "version": "==4.4.0"
}, },
"urllib3": { "urllib3": {

View File

@ -11,9 +11,12 @@ services:
container_name: gotenberg container_name: gotenberg
network_mode: host network_mode: host
restart: unless-stopped restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript.
command: command:
- "gotenberg" - "gotenberg"
- "--chromium-disable-routes=true" - "--chromium-disable-javascript=true"
- "--chromium-allow-list=file:///tmp/.*"
tika: tika:
image: ghcr.io/paperless-ngx/tika:latest image: ghcr.io/paperless-ngx/tika:latest
hostname: tika hostname: tika

View File

@ -87,9 +87,12 @@ services:
gotenberg: gotenberg:
image: docker.io/gotenberg/gotenberg:7.6 image: docker.io/gotenberg/gotenberg:7.6
restart: unless-stopped restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript.
command: command:
- "gotenberg" - "gotenberg"
- "--chromium-disable-routes=true" - "--chromium-disable-javascript=true"
- "--chromium-allow-list=file:///tmp/.*"
tika: tika:
image: ghcr.io/paperless-ngx/tika:latest image: ghcr.io/paperless-ngx/tika:latest

View File

@ -79,9 +79,13 @@ services:
gotenberg: gotenberg:
image: docker.io/gotenberg/gotenberg:7.6 image: docker.io/gotenberg/gotenberg:7.6
restart: unless-stopped restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript.
command: command:
- "gotenberg" - "gotenberg"
- "--chromium-disable-routes=true" - "--chromium-disable-javascript=true"
- "--chromium-allow-list=file:///tmp/.*"
tika: tika:
image: ghcr.io/paperless-ngx/tika:latest image: ghcr.io/paperless-ngx/tika:latest

View File

@ -67,9 +67,13 @@ services:
gotenberg: gotenberg:
image: docker.io/gotenberg/gotenberg:7.6 image: docker.io/gotenberg/gotenberg:7.6
restart: unless-stopped restart: unless-stopped
# The gotenberg chromium route is used to convert .eml files. We do not
# want to allow external content like tracking pixels or even javascript.
command: command:
- "gotenberg" - "gotenberg"
- "--chromium-disable-routes=true" - "--chromium-disable-javascript=true"
- "--chromium-allow-list=file:///tmp/.*"
tika: tika:
image: ghcr.io/paperless-ngx/tika:latest image: ghcr.io/paperless-ngx/tika:latest

View File

@ -565,8 +565,10 @@ they use underscores instead of dashes.
Paperless can make use of [Tika](https://tika.apache.org/) and Paperless can make use of [Tika](https://tika.apache.org/) and
[Gotenberg](https://gotenberg.dev/) for parsing and converting [Gotenberg](https://gotenberg.dev/) for parsing and converting
"Office" documents (such as ".doc", ".xlsx" and ".odt"). If you "Office" documents (such as ".doc", ".xlsx" and ".odt").
wish to use this, you must provide a Tika server and a Gotenberg server, Tika and Gotenberg are also needed to allow parsing of E-Mails (.eml).
If you wish to use this, you must provide a Tika server and a Gotenberg server,
configure their endpoints, and enable the feature. configure their endpoints, and enable the feature.
`PAPERLESS_TIKA_ENABLED=<bool>` `PAPERLESS_TIKA_ENABLED=<bool>`
@ -605,14 +607,17 @@ services:
PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000 PAPERLESS_TIKA_GOTENBERG_ENDPOINT: http://gotenberg:3000
PAPERLESS_TIKA_ENDPOINT: http://tika:9998 PAPERLESS_TIKA_ENDPOINT: http://tika:9998
# ... # ...
gotenberg: gotenberg:
image: gotenberg/gotenberg:7.6 image: gotenberg/gotenberg:7.6
restart: unless-stopped restart: unless-stopped
command: # The gotenberg chromium route is used to convert .eml files. We do not
- 'gotenberg' # want to allow external content like tracking pixels or even javascript.
- '--chromium-disable-routes=true' command:
- 'gotenberg'
- '--chromium-disable-javascript=true'
- '--chromium-allow-list=file:///tmp/.*'
tika: tika:
image: ghcr.io/paperless-ngx/tika:latest image: ghcr.io/paperless-ngx/tika:latest

View File

@ -125,13 +125,13 @@ using docker-compose, this is achieved by the following configuration
change in the `docker-compose.yml` file: change in the `docker-compose.yml` file:
```yaml ```yaml
gotenberg: # The gotenberg chromium route is used to convert .eml files. We do not
image: gotenberg/gotenberg:7.6 # want to allow external content like tracking pixels or even javascript.
restart: unless-stopped command:
command: - 'gotenberg'
- 'gotenberg' - '--chromium-disable-javascript=true'
- '--chromium-disable-routes=true' - '--chromium-allow-list=file:///tmp/.*'
- '--api-timeout=60' - '--api-timeout=60'
``` ```
## Permission denied errors in the consumption directory ## Permission denied errors in the consumption directory

View File

@ -2,5 +2,5 @@
docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -v paperless_pgdata:/var/lib/postgresql/data -d postgres:13 docker run -p 5432:5432 -e POSTGRES_PASSWORD=password -v paperless_pgdata:/var/lib/postgresql/data -d postgres:13
docker run -d -p 6379:6379 redis:latest docker run -d -p 6379:6379 redis:latest
docker run -p 3000:3000 -d gotenberg/gotenberg:7.6 docker run -p 3000:3000 -d gotenberg/gotenberg:7.6 gotenberg --chromium-disable-javascript=true --chromium-allow-list="file:///tmp/.*"
docker run -p 9998:9998 -d ghcr.io/paperless-ngx/tika:latest docker run -p 9998:9998 -d ghcr.io/paperless-ngx/tika:latest

View File

@ -56,6 +56,7 @@ class MailRuleAdmin(admin.ModelAdmin):
"filter_body", "filter_body",
"filter_attachment_filename", "filter_attachment_filename",
"maximum_age", "maximum_age",
"consumption_scope",
"attachment_type", "attachment_type",
), ),
}, },
@ -65,8 +66,8 @@ class MailRuleAdmin(admin.ModelAdmin):
{ {
"description": _( "description": _(
"The action applied to the mail. This action is only " "The action applied to the mail. This action is only "
"performed when documents were consumed from the mail. " "performed when the mail body or attachments were "
"Mails without attachments will remain entirely untouched.", "consumed from the mail.",
), ),
"fields": ("action", "action_parameter"), "fields": ("action", "action_parameter"),
}, },

View File

@ -1,8 +1,17 @@
from django.apps import AppConfig from django.apps import AppConfig
from django.conf import settings
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from paperless_mail.signals import mail_consumer_declaration
class PaperlessMailConfig(AppConfig): class PaperlessMailConfig(AppConfig):
name = "paperless_mail" name = "paperless_mail"
verbose_name = _("Paperless mail") verbose_name = _("Paperless mail")
def ready(self):
    """
    Register the mail parser with the document consumer when Tika is enabled.

    The ``mail_consumer_declaration`` receiver is only connected to the
    ``document_consumer_declaration`` signal if ``settings.TIKA_ENABLED`` is
    truthy. The base ``AppConfig.ready`` is always invoked afterwards.
    """
    # NOTE(review): imported inside the method, presumably so the documents
    # app is only touched once the app registry is ready - confirm.
    from documents.signals import document_consumer_declaration

    tika_enabled = settings.TIKA_ENABLED
    if tika_enabled:
        document_consumer_declaration.connect(mail_consumer_declaration)

    AppConfig.ready(self)

View File

@ -350,9 +350,16 @@ class MailAccountHandler(LoggingMixin):
return total_processed_files return total_processed_files
def handle_message(self, message, rule) -> int: def handle_message(self, message, rule: MailRule) -> int:
if not message.attachments: processed_elements = 0
return 0
# Skip Message handling when only attachments are to be processed but
# message doesn't have any.
if (
not message.attachments
and rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY
):
return processed_elements
self.log( self.log(
"debug", "debug",
@ -365,8 +372,41 @@ class MailAccountHandler(LoggingMixin):
tag_ids = [tag.id for tag in rule.assign_tags.all()] tag_ids = [tag.id for tag in rule.assign_tags.all()]
doc_type = rule.assign_document_type doc_type = rule.assign_document_type
processed_attachments = 0 if (
rule.consumption_scope == MailRule.ConsumptionScope.EML_ONLY
or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING
):
processed_elements += self.process_eml(
message,
rule,
correspondent,
tag_ids,
doc_type,
)
if (
rule.consumption_scope == MailRule.ConsumptionScope.ATTACHMENTS_ONLY
or rule.consumption_scope == MailRule.ConsumptionScope.EVERYTHING
):
processed_elements += self.process_attachments(
message,
rule,
correspondent,
tag_ids,
doc_type,
)
return processed_elements
def process_attachments(
self,
message: MailMessage,
rule: MailRule,
correspondent,
tag_ids,
doc_type,
):
processed_attachments = 0
for att in message.attachments: for att in message.attachments:
if ( if (
@ -436,5 +476,59 @@ class MailAccountHandler(LoggingMixin):
f"since guessed mime type {mime_type} is not supported " f"since guessed mime type {mime_type} is not supported "
f"by paperless", f"by paperless",
) )
return processed_attachments return processed_attachments
def process_eml(
    self,
    message: MailMessage,
    rule: MailRule,
    correspondent,
    tag_ids,
    doc_type,
):
    """
    Dump the whole mail into a temporary .eml file and queue it for
    consumption.

    :param message: the mail to store; the header list of its underlying
        ``message.obj`` may be reordered in place (see below).
    :param rule: the mail rule being applied; only used for the log message.
    :param correspondent: object whose ``id`` is passed on as correspondent
        override, or a falsy value for no override.
    :param tag_ids: passed through unchanged as ``override_tag_ids``.
    :param doc_type: object whose ``id`` is passed on as document type
        override, or a falsy value for no override.
    :return: always 1, the number of elements handed to the consumer.
    """
    os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
    fd, temp_filename = tempfile.mkstemp(
        prefix="paperless-mail-",
        dir=settings.SCRATCH_DIR,
        suffix=".eml",
    )
    # mkstemp() hands back an already open OS-level file descriptor. Wrap it
    # instead of discarding it and reopening the path, so the descriptor is
    # closed when the with-block exits rather than leaked on every mail.
    with os.fdopen(fd, "wb") as f:
        # Move "From"-header to beginning of file
        # TODO: This ugly workaround is needed because the parser is
        #      chosen only by the mime_type detected via magic
        #      (see documents/consumer.py "mime_type = magic.from_file")
        #      Unfortunately magic sometimes fails to detect the mime
        #      type of .eml files correctly as message/rfc822 and instead
        #      detects text/plain.
        #      This also affects direct file consumption of .eml files
        #      which are not treated with this workaround.
        from_element = None
        for i, header in enumerate(message.obj._headers):
            if header[0] == "From":
                from_element = i
        # Index 0 is falsy as well, which is fine: a "From" header that is
        # already first does not need to be moved.
        if from_element:
            new_headers = [message.obj._headers.pop(from_element)]
            new_headers += message.obj._headers
            message.obj._headers = new_headers

        f.write(message.obj.as_bytes())

    self.log(
        "info",
        f"Rule {rule}: "
        f"Consuming eml from mail "
        f"{message.subject} from {message.from_}",
    )

    consume_file.delay(
        path=temp_filename,
        override_filename=pathvalidate.sanitize_filename(
            message.subject + ".eml",
        ),
        override_title=message.subject,
        override_correspondent_id=correspondent.id if correspondent else None,
        override_document_type_id=doc_type.id if doc_type else None,
        override_tag_ids=tag_ids,
    )

    return 1

View File

@ -0,0 +1,32 @@
# Generated by Django 4.0.4 on 2022-07-11 22:02
from django.db import migrations, models
class Migration(migrations.Migration):
    """
    Adds the ``consumption_scope`` field to the ``mailrule`` model.

    The field is a positive integer with three choices and defaults to 1
    ("Only process attachments.").
    """

    dependencies = [("paperless_mail", "0015_alter_mailrule_action")]

    operations = [
        migrations.AddField(
            model_name="mailrule",
            name="consumption_scope",
            field=models.PositiveIntegerField(
                verbose_name="consumption scope",
                default=1,
                choices=[
                    (1, "Only process attachments."),
                    (
                        2,
                        "Process full Mail (with embedded attachments in file) as .eml",
                    ),
                    (
                        3,
                        "Process full Mail (with embedded attachments in file) as .eml + process attachments as separate documents",
                    ),
                ],
            ),
        ),
    ]

View File

@ -56,6 +56,14 @@ class MailRule(models.Model):
verbose_name = _("mail rule") verbose_name = _("mail rule")
verbose_name_plural = _("mail rules") verbose_name_plural = _("mail rules")
class ConsumptionScope(models.IntegerChoices):
    # Which parts of a mail a rule turns into documents: the attachments,
    # the whole mail as a single .eml file, or both.
    ATTACHMENTS_ONLY = (1, _("Only process attachments."))
    EML_ONLY = (
        2,
        _("Process full Mail (with embedded attachments in file) as .eml"),
    )
    EVERYTHING = (
        3,
        _(
            "Process full Mail (with embedded attachments in file) as .eml "
            "+ process attachments as separate documents",
        ),
    )
class AttachmentProcessing(models.IntegerChoices): class AttachmentProcessing(models.IntegerChoices):
ATTACHMENTS_ONLY = 1, _("Only process attachments.") ATTACHMENTS_ONLY = 1, _("Only process attachments.")
EVERYTHING = 2, _("Process all files, including 'inline' " "attachments.") EVERYTHING = 2, _("Process all files, including 'inline' " "attachments.")
@ -145,6 +153,12 @@ class MailRule(models.Model):
), ),
) )
# Selects which parts of a matching mail are consumed; one of the
# ConsumptionScope choices, defaulting to attachments only.
consumption_scope = models.PositiveIntegerField(
_("consumption scope"),
choices=ConsumptionScope.choices,
default=ConsumptionScope.ATTACHMENTS_ONLY,
)
action = models.PositiveIntegerField( action = models.PositiveIntegerField(
_("action"), _("action"),
choices=MailAction.choices, choices=MailAction.choices,

View File

@ -0,0 +1,333 @@
import os
import re
from html import escape
from io import BytesIO
from io import StringIO
import requests
from bleach import clean
from bleach import linkify
from django.conf import settings
from documents.parsers import DocumentParser
from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError
from humanfriendly import format_size
from imap_tools import MailMessage
from tika import parser
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates a PDF using
Gotenberg and sends the HTML part to a local Tika server for text extraction.
"""
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
logging_name = "paperless.parsing.mail"
_parsed = None
def get_parsed(self, document_path) -> MailMessage:
    """
    Return the mail stored at ``document_path`` as a ``MailMessage``.

    The parsed message is cached on the instance in ``self._parsed`` and
    returned on later calls without reading the file again.

    NOTE(review): the cache is not keyed by ``document_path``; once a message
    is cached, it is returned for any path passed in.

    :param document_path: path of the .eml file to read.
    :return: the parsed (possibly cached) mail message.
    :raises ParseError: if the file cannot be read or parsed, or if the
        parsed mail has no "from" value.
    """
    if not self._parsed:
        try:
            with open(document_path, "rb") as eml:
                self._parsed = MailMessage.from_bytes(eml.read())
        except Exception as err:
            # Chain the original exception so the root cause stays visible
            # in tracebacks instead of being reported as a secondary error.
            raise ParseError(
                f"Could not parse {document_path}: {err}",
            ) from err
        if not self._parsed.from_values:
            # Drop the unusable result so it is not served from the cache.
            self._parsed = None
            raise ParseError(
                f"Could not parse {document_path}: Missing 'from'",
            )

    return self._parsed
def get_thumbnail(self, document_path, mime_type, file_name=None):
    """
    Return the thumbnail that make_thumbnail_from_pdf builds from the
    archive PDF of this mail, generating the PDF first if none exists yet.
    """
    archive = self.archive_path
    if not archive:
        # Nothing rendered so far: build the PDF now and remember its path.
        archive = self.generate_pdf(document_path)
        self.archive_path = archive

    return make_thumbnail_from_pdf(archive, self.tempdir, self.logging_group)
def extract_metadata(self, document_path, mime_type):
    """
    Collect the mail's headers, an attachment summary and its date as a
    list of metadata dicts sorted by (prefix, key).

    If the mail cannot be parsed, a warning is logged and an empty list is
    returned instead of raising.
    """
    try:
        mail = self.get_parsed(document_path)
    except ParseError as e:
        self.log(
            "warning",
            f"Error while fetching document metadata for {document_path}: {e}",
        )
        return []

    # One entry per header; the values of a header are joined with ", ".
    entries = [
        {
            "namespace": "",
            "prefix": "header",
            "key": name,
            "value": ", ".join(values),
        }
        for name, values in mail.headers.items()
    ]

    attachment_summary = ", ".join(
        f"{attachment.filename}({format_size(attachment.size, binary=True)})"
        for attachment in mail.attachments
    )
    entries.append(
        {
            "namespace": "",
            "prefix": "",
            "key": "attachments",
            "value": attachment_summary,
        },
    )
    entries.append(
        {
            "namespace": "",
            "prefix": "",
            "key": "date",
            "value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
        },
    )

    return sorted(entries, key=lambda entry: (entry["prefix"], entry["key"]))
def parse(self, document_path, mime_type, file_name=None):
    """
    Fill self.text, self.date and self.archive_path from the mail at
    document_path.

    The text consists of the Subject/From/To lines, optional CC, BCC and
    Attachments lines, the Tika-extracted text of the HTML part (if the
    mail has one) and finally the plain text part.

    Raises:
        ParseError: propagated from get_parsed(), tika_parse() or
            generate_pdf().
    """

    def strip_text(text: str):
        # Collapse every run of whitespace into one space. Newlines are
        # whitespace as well, so the result is always a single line; a
        # separate newline-squeezing pass could never match afterwards.
        return re.sub(r"\s+", " ", text).strip()

    mail = self.get_parsed(document_path)

    self.text = f"Subject: {mail.subject}\n\n"
    self.text += f"From: {mail.from_values.full}\n\n"
    self.text += f"To: {', '.join(address.full for address in mail.to_values)}\n\n"
    if mail.cc_values:
        self.text += (
            f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
        )
    if mail.bcc_values:
        self.text += (
            f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
        )
    if mail.attachments:
        att = [
            f"{a.filename} ({format_size(a.size, binary=True)})"
            for a in mail.attachments
        ]
        self.text += f"Attachments: {', '.join(att)}\n\n"

    if mail.html != "":
        self.text += "HTML content: " + strip_text(self.tika_parse(mail.html))

    self.text += f"\n\n{strip_text(mail.text)}"

    self.date = mail.date
    self.archive_path = self.generate_pdf(document_path)
def tika_parse(self, html: str):
    """
    Send html to the configured Tika server and return the extracted text.

    Returns:
        The "content" value of Tika's reply, or "" when that value is
        falsy (for example None).

    Raises:
        ParseError: if the request to Tika raises any exception.
    """
    self.log("info", "Sending content to Tika server")

    try:
        parsed = parser.from_buffer(html, self.tika_server)
    except Exception as err:
        # Keep the original exception attached as the explicit cause.
        raise ParseError(
            f"Could not parse content with tika server at "
            f"{self.tika_server}: {err}",
        ) from err

    return parsed["content"] or ""
def generate_pdf(self, document_path):
    """
    Render the mail at document_path to "merged.pdf" inside self.tempdir
    and return that path.

    A PDF of the mail rendered through the header/plain-text template is
    always created. If the mail also has an HTML part, that part is
    rendered separately and both PDFs are merged by Gotenberg.

    Raises:
        ParseError: if the merge request fails or returns an error status
            (errors from the helper methods propagate as well).
    """
    url_merge = self.gotenberg_server + "/forms/pdfengines/merge"
    pdf_path = os.path.join(self.tempdir, "merged.pdf")
    mail = self.get_parsed(document_path)

    mail_pdf = self.generate_pdf_from_mail(mail)

    if mail.html == "":
        # No HTML part: the rendered mail is the complete document.
        with open(pdf_path, "wb") as file:
            file.write(mail_pdf)
        return pdf_path

    html_pdf = self.generate_pdf_from_html(mail.html, mail.attachments)

    # The numeric prefixes put the mail page before the HTML body;
    # presumably Gotenberg merges in file-name order - confirm in its docs.
    files = {
        "1_mail.pdf": ("1_mail.pdf", BytesIO(mail_pdf)),
        "2_html.pdf": ("2_html.pdf", BytesIO(html_pdf)),
    }
    headers = {}
    try:
        response = requests.post(url_merge, files=files, headers=headers)
        response.raise_for_status()  # ensure we notice bad responses
    except Exception as err:
        raise ParseError(f"Error while converting document to PDF: {err}") from err

    # The with-statement closes the file; no explicit close() is needed.
    with open(pdf_path, "wb") as file:
        file.write(response.content)

    return pdf_path
@staticmethod
def mail_to_html(mail: MailMessage) -> StringIO:
    """
    Render the headers and the plain text part of mail with the
    "email_msg_template.html" template and return the markup as a StringIO
    positioned at its start.

    Every value is escaped, cleaned and linkified here before it is handed
    to the template. A "<field>_label" entry is only added to the context
    when the corresponding field is non-empty.
    """

    def clean_html(text: str):
        # Normalise the input to a plain str first.
        if isinstance(text, list):
            text = "\n".join([str(e) for e in text])
        if type(text) is not str:
            text = str(text)
        escaped = escape(text)
        linked = linkify(clean(escaped), parse_email=True)
        return linked.replace("\n", "<br>")

    attachment_info = ", ".join(
        f"{a.filename} ({format_size(a.size, binary=True)})"
        for a in mail.attachments
    )

    # (context key, label shown in the header, raw value)
    labelled_fields = (
        ("subject", "Subject", mail.subject),
        ("from", "From", mail.from_values.full),
        ("to", "To", ", ".join(address.full for address in mail.to_values)),
        ("cc", "CC", ", ".join(address.full for address in mail.cc_values)),
        ("bcc", "BCC", ", ".join(address.full for address in mail.bcc_values)),
        ("attachments", "Attachments", attachment_info),
    )

    data = {}
    for key, label, raw in labelled_fields:
        cleaned = clean_html(raw)
        data[key] = cleaned
        if cleaned != "":
            data[f"{key}_label"] = label

    data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M"))
    data["content"] = clean_html(mail.text.strip())

    from django.template.loader import render_to_string

    return StringIO(render_to_string("email_msg_template.html", context=data))
def generate_pdf_from_mail(self, mail):
    """
    Convert the header/plain-text rendering of mail (see mail_to_html) to
    a PDF via Gotenberg's Chromium HTML route and return the PDF bytes.

    Raises:
        ParseError: if the request fails or returns an error status.
    """
    url = self.gotenberg_server + "/forms/chromium/convert/html"
    self.log("info", "Converting mail to PDF")

    css_file = os.path.join(os.path.dirname(__file__), "templates/output.css")

    # The stylesheet handle must stay open until the upload has finished.
    with open(css_file, "rb") as css_handle:
        files = {
            "html": ("index.html", self.mail_to_html(mail)),
            "css": ("output.css", css_handle),
        }
        headers = {}
        # NOTE(review): 8.27 x 11.7 looks like A4 in inches - confirm the
        # unit against the Gotenberg documentation.
        data = {
            "marginTop": "0.1",
            "marginBottom": "0.1",
            "marginLeft": "0.1",
            "marginRight": "0.1",
            "paperWidth": "8.27",
            "paperHeight": "11.7",
            "scale": "1.0",
        }
        try:
            response = requests.post(
                url,
                files=files,
                headers=headers,
                data=data,
            )
            response.raise_for_status()  # ensure we notice bad responses
        except Exception as err:
            # Keep the original exception attached as the explicit cause.
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

    return response.content
@staticmethod
def transform_inline_html(html, attachments):
def clean_html_script(text: str):
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
text = compiled_open.sub("<div hidden ", text)
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
text = compiled_close.sub("</div", text)
return text
html_clean = clean_html_script(html)
files = []
for a in attachments:
name_cid = "cid:" + a.content_id
name_clean = "".join(e for e in name_cid if e.isalnum())
files.append((name_clean, BytesIO(a.payload)))
html_clean = html_clean.replace(name_cid, name_clean)
files.append(("index.html", StringIO(html_clean)))
return files
def generate_pdf_from_html(self, orig_html, attachments):
    """
    Convert the HTML part of a mail, together with its inline attachments,
    to a PDF via Gotenberg's Chromium HTML route and return the PDF bytes.

    Raises:
        ParseError: if the request fails or returns an error status.
    """
    url = self.gotenberg_server + "/forms/chromium/convert/html"
    self.log("info", "Converting html to PDF")

    # Upload every transformed file under its own name.
    files = {
        name: (name, file)
        for name, file in self.transform_inline_html(orig_html, attachments)
    }

    headers = {}
    # NOTE(review): 8.27 x 11.7 looks like A4 in inches - confirm the unit
    # against the Gotenberg documentation.
    data = {
        "marginTop": "0.1",
        "marginBottom": "0.1",
        "marginLeft": "0.1",
        "marginRight": "0.1",
        "paperWidth": "8.27",
        "paperHeight": "11.7",
        "scale": "1.0",
    }
    try:
        response = requests.post(
            url,
            files=files,
            headers=headers,
            data=data,
        )
        response.raise_for_status()  # ensure we notice bad responses
    except Exception as err:
        # Keep the original exception attached as the explicit cause.
        raise ParseError(f"Error while converting document to PDF: {err}") from err

    return response.content

View File

@ -0,0 +1,14 @@
def get_parser(*args, **kwargs):
    """Build a MailDocumentParser, forwarding all arguments unchanged."""
    # Imported at call time rather than when this module is loaded.
    from .parsers import MailDocumentParser as parser_class

    return parser_class(*args, **kwargs)
def mail_consumer_declaration(sender, **kwargs):
    """
    Describe the mail parser: the factory to call, its weight and the
    mime types (mapped to their file extension) it handles.
    """
    supported_types = {"message/rfc822": ".eml"}
    return dict(parser=get_parser, weight=20, mime_types=supported_types)

View File

@ -0,0 +1,48 @@
{% autoescape off %}
<!doctype html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link href="output.css" rel="stylesheet">
</head>
<body class="bg-white w-screen flex flex-col items-center">
<div class="container max-w-4xl">
<!-- Header -->
<div class="grid gap-x-2 bg-slate-200 p-4">
<div class="col-start-9 col-span-4 row-start-1 text-right">{{ date }}</div>
<div class="col-start-1 row-start-1 text-slate-400 text-right">{{ from_label }}</div>
<div class="col-start-2 col-span-7 row-start-1">{{ from }}</div>
<div class="col-start-1 row-start-2 text-slate-400 text-right">{{ subject_label }}</div>
<div class=" col-start-2 col-span-10 row-start-2 font-bold">{{ subject }}</div>
<div class="col-start-1 row-start-3 text-slate-400 text-right">{{ to_label }}</div>
<div class="col-start-2 col-span-10 row-start-3 text-sm my-0.5">{{ to }}</div>
<div class="col-start-1 row-start-4 text-slate-400 text-right">{{ cc_label }}</div>
<div class="col-start-2 col-span-10 row-start-4 text-sm my-0.5">{{ cc }}</div>
<div class="col-start-1 row-start-5 text-slate-400 text-right">{{ bcc_label }}</div>
<div class="col-start-2 col-span-10 row-start-5 text-sm my-0.5">{{ bcc }}</div>
<div class="col-start-1 row-start-6 text-slate-400 text-right">{{ attachments_label }}</div>
<div class="col-start-2 col-span-10 row-start-6">{{ attachments }}</div>
</div>
<!-- Separator-->
<div class="border-t border-solid border-b w-full h-[1px] box-content border-black mb-5 bg-slate-200"></div>
<!-- Content-->
<div class="w-full break-words">{{ content }}</div>
</div>
</body>
</html>
{% endautoescape %}

View File

@ -0,0 +1,3 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

View File

@ -0,0 +1,706 @@
/*
! tailwindcss v3.0.24 | MIT License | https://tailwindcss.com
*/
/*
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
*/
*,
::before,
::after {
box-sizing: border-box;
/* 1 */
border-width: 0;
/* 2 */
border-style: solid;
/* 2 */
border-color: #e5e7eb;
/* 2 */
}
::before,
::after {
--tw-content: '';
}
/*
1. Use a consistent sensible line-height in all browsers.
2. Prevent adjustments of font size after orientation changes in iOS.
3. Use a more readable tab size.
4. Use the user's configured `sans` font-family by default.
*/
html {
line-height: 1.5;
/* 1 */
-webkit-text-size-adjust: 100%;
/* 2 */
-moz-tab-size: 4;
/* 3 */
-o-tab-size: 4;
tab-size: 4;
/* 3 */
font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
/* 4 */
}
/*
1. Remove the margin in all browsers.
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
*/
body {
margin: 0;
/* 1 */
line-height: inherit;
/* 2 */
}
/*
1. Add the correct height in Firefox.
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
3. Ensure horizontal rules are visible by default.
*/
hr {
height: 0;
/* 1 */
color: inherit;
/* 2 */
border-top-width: 1px;
/* 3 */
}
/*
Add the correct text decoration in Chrome, Edge, and Safari.
*/
abbr:where([title]) {
-webkit-text-decoration: underline dotted;
text-decoration: underline dotted;
}
/*
Remove the default font size and weight for headings.
*/
h1,
h2,
h3,
h4,
h5,
h6 {
font-size: inherit;
font-weight: inherit;
}
/*
Reset links to optimize for opt-in styling instead of opt-out.
*/
a {
color: inherit;
text-decoration: inherit;
}
/*
Add the correct font weight in Edge and Safari.
*/
b,
strong {
font-weight: bolder;
}
/*
1. Use the user's configured `mono` font family by default.
2. Correct the odd `em` font sizing in all browsers.
*/
code,
kbd,
samp,
pre {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
/* 1 */
font-size: 1em;
/* 2 */
}
/*
Add the correct font size in all browsers.
*/
small {
font-size: 80%;
}
/*
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
*/
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
/*
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
3. Remove gaps between table borders by default.
*/
table {
text-indent: 0;
/* 1 */
border-color: inherit;
/* 2 */
border-collapse: collapse;
/* 3 */
}
/*
1. Change the font styles in all browsers.
2. Remove the margin in Firefox and Safari.
3. Remove default padding in all browsers.
*/
button,
input,
optgroup,
select,
textarea {
font-family: inherit;
/* 1 */
font-size: 100%;
/* 1 */
line-height: inherit;
/* 1 */
color: inherit;
/* 1 */
margin: 0;
/* 2 */
padding: 0;
/* 3 */
}
/*
Remove the inheritance of text transform in Edge and Firefox.
*/
button,
select {
text-transform: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Remove default button styles.
*/
button,
[type='button'],
[type='reset'],
[type='submit'] {
-webkit-appearance: button;
/* 1 */
background-color: transparent;
/* 2 */
background-image: none;
/* 2 */
}
/*
Use the modern Firefox focus style for all focusable elements.
*/
:-moz-focusring {
outline: auto;
}
/*
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
*/
:-moz-ui-invalid {
box-shadow: none;
}
/*
Add the correct vertical alignment in Chrome and Firefox.
*/
progress {
vertical-align: baseline;
}
/*
Correct the cursor style of increment and decrement buttons in Safari.
*/
::-webkit-inner-spin-button,
::-webkit-outer-spin-button {
height: auto;
}
/*
1. Correct the odd appearance in Chrome and Safari.
2. Correct the outline style in Safari.
*/
[type='search'] {
-webkit-appearance: textfield;
/* 1 */
outline-offset: -2px;
/* 2 */
}
/*
Remove the inner padding in Chrome and Safari on macOS.
*/
::-webkit-search-decoration {
-webkit-appearance: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Change font properties to `inherit` in Safari.
*/
::-webkit-file-upload-button {
-webkit-appearance: button;
/* 1 */
font: inherit;
/* 2 */
}
/*
Add the correct display in Chrome and Safari.
*/
summary {
display: list-item;
}
/*
Removes the default spacing and border for appropriate elements.
*/
blockquote,
dl,
dd,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
figure,
p,
pre {
margin: 0;
}
fieldset {
margin: 0;
padding: 0;
}
legend {
padding: 0;
}
ol,
ul,
menu {
list-style: none;
margin: 0;
padding: 0;
}
/*
Prevent resizing textareas horizontally by default.
*/
textarea {
resize: vertical;
}
/*
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
2. Set the default placeholder color to the user's configured gray 400 color.
*/
input::-moz-placeholder, textarea::-moz-placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
input:-ms-input-placeholder, textarea:-ms-input-placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
input::placeholder,
textarea::placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
/*
Set the default cursor for buttons.
*/
button,
[role="button"] {
cursor: pointer;
}
/*
Make sure disabled buttons don't get the pointer cursor.
*/
:disabled {
cursor: default;
}
/*
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
This can trigger a poorly considered lint error in some tools but is included by design.
*/
img,
svg,
video,
canvas,
audio,
iframe,
embed,
object {
display: block;
/* 1 */
vertical-align: middle;
/* 2 */
}
/*
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
*/
img,
video {
max-width: 100%;
height: auto;
}
/*
Ensure the default browser behavior of the `hidden` attribute.
*/
[hidden] {
display: none;
}
*, ::before, ::after {
--tw-translate-x: 0;
--tw-translate-y: 0;
--tw-rotate: 0;
--tw-skew-x: 0;
--tw-skew-y: 0;
--tw-scale-x: 1;
--tw-scale-y: 1;
--tw-pan-x: ;
--tw-pan-y: ;
--tw-pinch-zoom: ;
--tw-scroll-snap-strictness: proximity;
--tw-ordinal: ;
--tw-slashed-zero: ;
--tw-numeric-figure: ;
--tw-numeric-spacing: ;
--tw-numeric-fraction: ;
--tw-ring-inset: ;
--tw-ring-offset-width: 0px;
--tw-ring-offset-color: #fff;
--tw-ring-color: rgb(59 130 246 / 0.5);
--tw-ring-offset-shadow: 0 0 #0000;
--tw-ring-shadow: 0 0 #0000;
--tw-shadow: 0 0 #0000;
--tw-shadow-colored: 0 0 #0000;
--tw-blur: ;
--tw-brightness: ;
--tw-contrast: ;
--tw-grayscale: ;
--tw-hue-rotate: ;
--tw-invert: ;
--tw-saturate: ;
--tw-sepia: ;
--tw-drop-shadow: ;
--tw-backdrop-blur: ;
--tw-backdrop-brightness: ;
--tw-backdrop-contrast: ;
--tw-backdrop-grayscale: ;
--tw-backdrop-hue-rotate: ;
--tw-backdrop-invert: ;
--tw-backdrop-opacity: ;
--tw-backdrop-saturate: ;
--tw-backdrop-sepia: ;
}
.container {
width: 100%;
}
@media (min-width: 640px) {
.container {
max-width: 640px;
}
}
@media (min-width: 768px) {
.container {
max-width: 768px;
}
}
@media (min-width: 1024px) {
.container {
max-width: 1024px;
}
}
@media (min-width: 1280px) {
.container {
max-width: 1280px;
}
}
@media (min-width: 1536px) {
.container {
max-width: 1536px;
}
}
.col-span-2 {
grid-column: span 2 / span 2;
}
.col-span-8 {
grid-column: span 8 / span 8;
}
.col-span-10 {
grid-column: span 10 / span 10;
}
.col-span-3 {
grid-column: span 3 / span 3;
}
.col-span-4 {
grid-column: span 4 / span 4;
}
.col-span-7 {
grid-column: span 7 / span 7;
}
.col-start-11 {
grid-column-start: 11;
}
.col-start-1 {
grid-column-start: 1;
}
.col-start-2 {
grid-column-start: 2;
}
.col-start-10 {
grid-column-start: 10;
}
.col-start-9 {
grid-column-start: 9;
}
.row-start-1 {
grid-row-start: 1;
}
.row-start-2 {
grid-row-start: 2;
}
.row-start-3 {
grid-row-start: 3;
}
.row-start-4 {
grid-row-start: 4;
}
.row-start-5 {
grid-row-start: 5;
}
.row-start-6 {
grid-row-start: 6;
}
.my-1 {
margin-top: 0.25rem;
margin-bottom: 0.25rem;
}
.my-0\.5 {
margin-top: 0.125rem;
margin-bottom: 0.125rem;
}
.my-0 {
margin-top: 0px;
margin-bottom: 0px;
}
.mb-5 {
margin-bottom: 1.25rem;
}
.box-content {
box-sizing: content-box;
}
.flex {
display: flex;
}
.grid {
display: grid;
}
.h-\[1px\] {
height: 1px;
}
.w-screen {
width: 100vw;
}
.w-full {
width: 100%;
}
.max-w-4xl {
max-width: 56rem;
}
.grid-cols-12 {
grid-template-columns: repeat(12, minmax(0, 1fr));
}
.grid-rows-5 {
grid-template-rows: repeat(5, minmax(0, 1fr));
}
.flex-col {
flex-direction: column;
}
.items-center {
align-items: center;
}
.gap-x-2 {
-moz-column-gap: 0.5rem;
column-gap: 0.5rem;
}
.whitespace-pre-line {
white-space: pre-line;
}
.break-words {
overflow-wrap: break-word;
}
.border-t {
border-top-width: 1px;
}
.border-b {
border-bottom-width: 1px;
}
.border-solid {
border-style: solid;
}
.border-black {
--tw-border-opacity: 1;
border-color: rgb(0 0 0 / var(--tw-border-opacity));
}
.bg-white {
--tw-bg-opacity: 1;
background-color: rgb(255 255 255 / var(--tw-bg-opacity));
}
.bg-slate-200 {
--tw-bg-opacity: 1;
background-color: rgb(226 232 240 / var(--tw-bg-opacity));
}
.p-4 {
padding: 1rem;
}
.text-right {
text-align: right;
}
.text-sm {
font-size: 0.875rem;
line-height: 1.25rem;
}
.font-bold {
font-weight: 700;
}
.text-slate-400 {
--tw-text-opacity: 1;
color: rgb(148 163 184 / var(--tw-text-opacity));
}
.text-blue-600 {
--tw-text-opacity: 1;
color: rgb(37 99 235 / var(--tw-text-opacity));
}
.underline {
-webkit-text-decoration-line: underline;
text-decoration-line: underline;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,5 @@
{
"devDependencies": {
"tailwindcss": "^3.0.24"
}
}

View File

@ -0,0 +1,7 @@
module.exports = {
content: ['./*.html'],
theme: {
extend: {},
},
plugins: [],
}

View File

@ -0,0 +1 @@
This is not a valid eml.

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,45 @@
<!doctype html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link href="output.css" rel="stylesheet">
</head>
<body class="bg-white w-screen flex flex-col items-center">
<div class="container max-w-4xl">
<!-- Header -->
<div class="grid gap-x-2 bg-slate-200 p-4">
<div class="col-start-9 col-span-4 row-start-1 text-right">2022-10-15 09:23</div>
<div class="col-start-1 row-start-1 text-slate-400 text-right">From</div>
<div class="col-start-2 col-span-7 row-start-1">Name &lt;<a href="mailto:someone@example.de">someone@example.de</a>&gt;</div>
<div class="col-start-1 row-start-2 text-slate-400 text-right">Subject</div>
<div class=" col-start-2 col-span-10 row-start-2 font-bold">HTML Message</div>
<div class="col-start-1 row-start-3 text-slate-400 text-right">To</div>
<div class="col-start-2 col-span-10 row-start-3 text-sm my-0.5"><a href="mailto:someone@example.de">someone@example.de</a></div>
<div class="col-start-1 row-start-4 text-slate-400 text-right"></div>
<div class="col-start-2 col-span-10 row-start-4 text-sm my-0.5"></div>
<div class="col-start-1 row-start-5 text-slate-400 text-right"></div>
<div class="col-start-2 col-span-10 row-start-5" text-sm my-0.5></div>
<div class="col-start-1 row-start-6 text-slate-400 text-right">Attachments</div>
<div class="col-start-2 col-span-10 row-start-6">IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)</div>
</div>
<!-- Separator-->
<div class="border-t border-solid border-b w-full h-[1px] box-content border-black mb-5 bg-slate-200"></div>
<!-- Content-->
<div class="w-full break-words">Some Text<br><br>and an embedded image.</div>
</div>
</body>
</html>

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.0 KiB

View File

@ -0,0 +1,19 @@
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
</head>
<body>
<p>Some Text</p>
<p>
<img src="cid:part1.pNdUSz0s.D3NqVtPg@example.de" alt="Has to be rewritten to work..">
<img src="https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png" alt="This image should not be shown.">
</p>
<p>and an embedded image.<br>
</p>
<p id="changeme">Paragraph unchanged.</p>
<scRipt>
document.getElementById("changeme").innerHTML = "Paragraph changed via Java Script.";
</script>
</body>
</html>

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.9 KiB

Binary file not shown.

View File

@ -0,0 +1,25 @@
Return-Path: <mail@someserver.de>
Delivered-To: mail@someserver.de
Received: from mail.someserver.org ([::1])
by e1acdba3bd07 with LMTP
id KBKZGD2YR2NTCgQAjubtDA
(envelope-from <mail@someserver.de>)
for <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200
Received: from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616
for <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)
Message-ID: <6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>
Date: Wed, 12 Oct 2022 21:40:43 +0200
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
Thunderbird/102.3.1
Content-Language: en-US
To: some@one.de
Cc: asdasd@æsdasd.de, asdadasdasdasda.asdasd@æsdasd.de
Bcc: fdf@fvf.de
From: Some One <mail@someserver.de>
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 7bit
X-Last-TLS-Session-Version: TLSv1.3
Subject: Simple Text Mail
This is just a simple Text Mail.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

View File

@ -0,0 +1,688 @@
import datetime
import os
from unittest import mock
from django.test import TestCase
from documents.parsers import ParseError
from paperless_mail.parsers import MailDocumentParser
class TestParser(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def setUp(self) -> None:
    """Create a fresh MailDocumentParser for every test."""
    self.parser = MailDocumentParser(logging_group=None)
def tearDown(self) -> None:
    """Call the parser's cleanup() after every test."""
    self.parser.cleanup()
def test_get_parsed_missing_file(self):
    """
    GIVEN:
        - Fresh parser
    WHEN:
        - A nonexistent file should be parsed
    THEN:
        - An Exception is thrown
    """
    missing_path = os.path.join(self.SAMPLE_FILES, "na")

    # Parsing a path that does not exist has to fail with a ParseError.
    with self.assertRaises(ParseError):
        self.parser.get_parsed(missing_path)
def test_get_parsed_broken_file(self):
    """
    GIVEN:
        - Fresh parser
    WHEN:
        - A faulty file should be parsed
    THEN:
        - An Exception is thrown
    """
    broken_path = os.path.join(self.SAMPLE_FILES, "broken.eml")

    # A file that is not a valid mail has to fail with a ParseError.
    with self.assertRaises(ParseError):
        self.parser.get_parsed(broken_path)
def test_get_parsed_simple_text_mail(self):
    """
    GIVEN:
        - Fresh parser
    WHEN:
        - A .eml file should be parsed
    THEN:
        - The content of the mail should be available in the parse result.
    """
    mail = self.parser.get_parsed(
        os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
    )

    # Check the individual date components in one comparison.
    sent = mail.date
    self.assertEqual(
        (sent.year, sent.month, sent.day, sent.hour, sent.minute, sent.second),
        (2022, 10, 12, 21, 40, 43),
    )
    self.assertEqual(sent.tzname(), "UTC+02:00")

    self.assertEqual(mail.from_, "mail@someserver.de")
    self.assertEqual(mail.subject, "Simple Text Mail")
    self.assertEqual(mail.text, "This is just a simple Text Mail.\n")
    self.assertEqual(mail.to, ("some@one.de",))
def test_get_parsed_reparse(self):
    """
    GIVEN:
        - An E-Mail was parsed
    WHEN:
        - Another .eml file should be parsed
    THEN:
        - The parser should not retry to parse and return the old results
    """
    first = self.parser.get_parsed(
        os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
    )
    second = self.parser.get_parsed(
        os.path.join(self.SAMPLE_FILES, "html.eml"),
    )

    # The cached object itself must come back, not merely an equal one,
    # even though a different file was requested.
    self.assertIs(first, second)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
@mock.patch("paperless_mail.parsers.make_thumbnail_from_pdf")
def test_get_thumbnail(
    self,
    mock_make_thumbnail_from_pdf: mock.MagicMock,
    mock_generate_pdf: mock.MagicMock,
):
    """
    GIVEN:
        - An E-Mail was parsed
    WHEN:
        - The Thumbnail is requested
    THEN:
        - The parser should call the functions which generate the thumbnail
    """
    mocked_return = "Passing the return value through.."
    mock_make_thumbnail_from_pdf.return_value = mocked_return
    mock_generate_pdf.return_value = "Mocked return value.."

    thumb = self.parser.get_thumbnail(
        os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
        "message/rfc822",
    )

    # The thumbnail helper has to receive the archive path and the tempdir.
    first_call_args = mock_make_thumbnail_from_pdf.call_args_list[0].args
    self.assertEqual(self.parser.archive_path, first_call_args[0])
    self.assertEqual(self.parser.tempdir, first_call_args[1])
    self.assertEqual(mocked_return, thumb)
@mock.patch("documents.loggers.LoggingMixin.log")
def test_extract_metadata_fail(self, m: mock.MagicMock):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - Metadata extraction is triggered for nonexistent file
    THEN:
        - A log warning should be generated
    """
    metadata = self.parser.extract_metadata("na", "message/rfc822")

    # Nothing is returned, and the last log call was made at warning level.
    self.assertEqual([], metadata)
    logged_level = m.call_args[0][0]
    self.assertEqual("warning", logged_level)
def test_extract_metadata(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - Metadata extraction is triggered
    THEN:
        - metadata is returned
    """
    metadata = self.parser.extract_metadata(
        os.path.join(self.SAMPLE_FILES, "simple_text.eml"),
        "message/rfc822",
    )

    # (prefix, key, value) of every entry that has to be reported.
    expected_entries = [
        ("", "attachments", ""),
        ("", "date", "2022-10-12 21:40:43 UTC+02:00"),
        ("header", "content-language", "en-US"),
        ("header", "content-type", "text/plain; charset=UTF-8; format=flowed"),
        ("header", "date", "Wed, 12 Oct 2022 21:40:43 +0200"),
        ("header", "delivered-to", "mail@someserver.de"),
        ("header", "from", "Some One <mail@someserver.de>"),
        (
            "header",
            "message-id",
            "<6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de>",
        ),
        ("header", "mime-version", "1.0"),
        (
            "header",
            "received",
            "from mail.someserver.org ([::1])\n\tby e1acdba3bd07 with LMTP\n\tid KBKZGD2YR2NTCgQAjubtDA\n\t(envelope-from <mail@someserver.de>)\n\tfor <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616\n\tfor <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST)",
        ),
        ("header", "return-path", "<mail@someserver.de>"),
        ("header", "subject", "Simple Text Mail"),
        ("header", "to", "some@one.de"),
        (
            "header",
            "user-agent",
            "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101\n Thunderbird/102.3.1",
        ),
        ("header", "x-last-tls-session-version", "TLSv1.3"),
    ]

    for prefix, key, value in expected_entries:
        self.assertIn(
            {"namespace": "", "prefix": prefix, "key": key, "value": value},
            metadata,
        )
def test_parse_na(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - parsing is attempted with nonexistent file
    THEN:
        - Exception is thrown
    """
    missing_path = os.path.join(self.SAMPLE_FILES, "na")

    # Parsing has to fail with a ParseError for a file that does not exist.
    with self.assertRaises(ParseError):
        self.parser.parse(missing_path, "message/rfc822")
@mock.patch("paperless_mail.parsers.MailDocumentParser.tika_parse")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_html_eml(self, n, mock_tika_parse: mock.MagicMock):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - parsing is done with html mail
    THEN:
        - Tika is called, parsed information from non html parts is available
    """
    mock_tika_parse.return_value = "tika return"
    text_expected = "Subject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return\n\nSome Text and an embedded image."
    date_expected = datetime.datetime(
        2022,
        10,
        15,
        11,
        23,
        19,
        tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
    )

    self.parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822")

    self.assertEqual(text_expected, self.parser.text)
    self.assertEqual(date_expected, self.parser.date)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_parse_simple_eml(self, n):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - parsing is done with non html mail
    THEN:
        - parsed information is available
    """
    document_path = os.path.join(self.SAMPLE_FILES, "simple_text.eml")

    self.parser.parse(document_path, "message/rfc822")

    text_expected = "Subject: Simple Text Mail\n\nFrom: Some One <mail@someserver.de>\n\nTo: some@one.de\n\nCC: asdasd@æsdasd.de, asdadasdasdasda.asdasd@æsdasd.de\n\nBCC: fdf@fvf.de\n\n\n\nThis is just a simple Text Mail."
    self.assertEqual(text_expected, self.parser.text)
    self.assertEqual(
        datetime.datetime(
            2022,
            10,
            12,
            21,
            40,
            43,
            tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
        ),
        self.parser.date,
    )

    # generate_pdf is patched here, so no real file is produced. Verify that
    # the archive was requested and its result stored; the PDF itself is
    # covered by the generate_pdf() tests.
    n.assert_called_once_with(document_path)
    self.assertEqual(n.return_value, self.parser.archive_path)
@mock.patch("paperless_mail.parsers.parser.from_buffer")
def test_tika_parse_unsuccessful(self, mock_from_buffer: mock.MagicMock):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - tika parsing fails
    THEN:
        - the parser should return an empty string
    """
    # A reply without any content must be turned into an empty string.
    mock_from_buffer.return_value = {"content": None}

    self.assertEqual("", self.parser.tika_parse(None))
@mock.patch("paperless_mail.parsers.parser.from_buffer")
def test_tika_parse(self, mock_from_buffer: mock.MagicMock):
    """
    GIVEN:
        - Fresh start, the Tika client call is mocked
    WHEN:
        - tika parsing is called
    THEN:
        - the Tika client is called with the html and the configured
          server, and its reply is returned
    """
    html = (
        "<html><head>"
        '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        "</head><body><p>Some Text</p></body></html>"
    )
    expected_text = "Some Text"
    mock_from_buffer.return_value = {"content": expected_text}

    # Successful parsing hands back the mocked content.
    result = self.parser.tika_parse(html).strip()

    self.assertEqual(expected_text, result)
    mock_from_buffer.assert_called_with(html, self.parser.tika_server)
@mock.patch("paperless_mail.parsers.parser.from_buffer")
def test_tika_parse_exception(self, mock_from_buffer: mock.MagicMock):
    """
    GIVEN:
        - Fresh start, the Tika client call is mocked
    WHEN:
        - tika parsing is called and an exception is thrown on the request
    THEN:
        - a ParseError Exception is thrown
    """
    html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"></head><body><p>Some Text</p></body></html>'

    # Assign the exception instance directly: the mock raises it on every
    # call, whatever arguments it receives. A zero-argument side effect
    # function would instead fail with a TypeError, because the mock
    # forwards the call arguments to it.
    mock_from_buffer.side_effect = Exception("Test")

    # Check ParseError
    self.assertRaises(ParseError, self.parser.tika_parse, html)
def test_tika_parse_unreachable(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - tika parsing is called with an empty Tika server setting
    THEN:
        - a ParseError Exception is thrown
    """
    html = (
        "<html><head>"
        '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        "</head><body><p>Some Text</p></body></html>"
    )

    # With no server configured the request cannot succeed.
    self.parser.tika_server = ""

    with self.assertRaises(ParseError):
        self.parser.tika_parse(html)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_parse_error(self, m: mock.MagicMock, n: mock.MagicMock):
    """
    GIVEN:
        - Fresh start, both intermediary pdf generators are mocked
    WHEN:
        - pdf generation is requested with an empty gotenberg server setting
    THEN:
        - a ParseError Exception is thrown
    """
    # Both mocked generators hand back empty pdf data.
    for generator in (m, n):
        generator.return_value = b""

    # Without a server the final pdf can not be created.
    self.parser.gotenberg_server = ""
    sample_mail = os.path.join(self.SAMPLE_FILES, "html.eml")

    with self.assertRaises(ParseError):
        self.parser.generate_pdf(sample_mail)
def test_generate_pdf_exception(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - pdf generation is requested for the "broken.eml" sample
    THEN:
        - a ParseError Exception is thrown
    """
    broken_mail = os.path.join(self.SAMPLE_FILES, "broken.eml")

    # A mail that can not be parsed must surface as a ParseError.
    with self.assertRaises(ParseError):
        self.parser.generate_pdf(broken_mail)
@mock.patch("paperless_mail.parsers.requests.post")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf(
    self,
    mock_generate_pdf_from_html: mock.MagicMock,
    mock_generate_pdf_from_mail: mock.MagicMock,
    mock_post: mock.MagicMock,
):
    """
    GIVEN:
        - Fresh start, the intermediary pdf generators and the HTTP post
          are mocked
    WHEN:
        - pdf generation is requested
    THEN:
        - the merge endpoint of gotenberg is posted both intermediary pdfs
          and the response content is written to the returned file
    """
    mock_generate_pdf_from_mail.return_value = b"Mail Return"
    mock_generate_pdf_from_html.return_value = b"HTML Return"

    gotenberg_reply = mock.MagicMock()
    gotenberg_reply.content = b"Content"
    mock_post.return_value = gotenberg_reply

    sample_mail = os.path.join(self.SAMPLE_FILES, "html.eml")
    pdf_path = self.parser.generate_pdf(sample_mail)
    self.assertTrue(os.path.isfile(pdf_path))

    # The generators must have received the parsed mail and its html parts.
    mock_generate_pdf_from_mail.assert_called_once_with(
        self.parser.get_parsed(None),
    )
    mock_generate_pdf_from_html.assert_called_once_with(
        self.parser.get_parsed(None).html,
        self.parser.get_parsed(None).attachments,
    )

    # Inspect the request that was sent to gotenberg.
    merge_url = self.parser.gotenberg_server + "/forms/pdfengines/merge"
    self.assertEqual(merge_url, mock_post.call_args.args[0])
    self.assertEqual({}, mock_post.call_args.kwargs["headers"])

    mail_part = mock_post.call_args.kwargs["files"]["1_mail.pdf"][1]
    self.assertEqual(b"Mail Return", mail_part.read())
    html_part = mock_post.call_args.kwargs["files"]["2_html.pdf"][1]
    self.assertEqual(b"HTML Return", html_part.read())

    gotenberg_reply.raise_for_status.assert_called_once()

    with open(pdf_path, "rb") as merged:
        written = merged.read()
    self.assertEqual(b"Content", written)
def test_mail_to_html(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - conversion from eml to html is requested
    THEN:
        - the returned html equals the stored "html.eml.html" sample
    """
    sample_mail = os.path.join(self.SAMPLE_FILES, "html.eml")
    reference_file = os.path.join(self.SAMPLE_FILES, "html.eml.html")

    parsed_mail = self.parser.get_parsed(sample_mail)
    html_received = self.parser.mail_to_html(parsed_mail).read()

    with open(reference_file) as reference:
        html_expected = reference.read()

    self.assertHTMLEqual(html_expected, html_received)
@mock.patch("paperless_mail.parsers.requests.post")
@mock.patch("paperless_mail.parsers.MailDocumentParser.mail_to_html")
def test_generate_pdf_from_mail(
    self,
    mock_mail_to_html: mock.MagicMock,
    mock_post: mock.MagicMock,
):
    """
    GIVEN:
        - Fresh start, mail_to_html and the HTTP post are mocked
    WHEN:
        - conversion of PDF from .eml is requested
    THEN:
        - the html convert endpoint of gotenberg is posted the
          intermediary html and css, the response content is returned
    """
    gotenberg_reply = mock.MagicMock()
    gotenberg_reply.content = b"Content"
    mock_post.return_value = gotenberg_reply
    mock_mail_to_html.return_value = "Testresponse"

    mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml"))
    self.assertEqual(b"Content", self.parser.generate_pdf_from_mail(mail))
    mock_mail_to_html.assert_called_once_with(mail)

    # Inspect the request that was sent to gotenberg.
    convert_url = self.parser.gotenberg_server + "/forms/chromium/convert/html"
    self.assertEqual(convert_url, mock_post.call_args.args[0])
    self.assertEqual({}, mock_post.call_args.kwargs["headers"])

    expected_form_data = {
        "marginTop": "0.1",
        "marginBottom": "0.1",
        "marginLeft": "0.1",
        "marginRight": "0.1",
        "paperWidth": "8.27",
        "paperHeight": "11.7",
        "scale": "1.0",
    }
    self.assertEqual(expected_form_data, mock_post.call_args.kwargs["data"])

    html_entry = mock_post.call_args.kwargs["files"]["html"]
    self.assertEqual("Testresponse", html_entry[1])
    css_entry = mock_post.call_args.kwargs["files"]["css"]
    self.assertEqual("output.css", css_entry[0])

    gotenberg_reply.raise_for_status.assert_called_once()
def test_transform_inline_html(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - transforming of html content from an email with an inline image
          attachment is requested
    THEN:
        - the last returned entry is "index.html", it references the name
          of the first returned entry and contains no script tag
    """

    class MailAttachmentMock:
        """Minimal attachment stand-in carrying a payload and a content id."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    html_path = os.path.join(self.SAMPLE_FILES, "sample.html")
    png_path = os.path.join(self.SAMPLE_FILES, "sample.png")
    with open(html_path) as html_file, open(png_path, "rb") as png_file:
        html = html_file.read()
        png = png_file.read()

    attachments = [
        MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de"),
    ]
    result = self.parser.transform_inline_html(html, attachments)

    resulting_html = result[-1][1].read()
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual("index.html", result[-1][0])
    self.assertIn(result[0][0], resulting_html)
    self.assertNotIn("<script", resulting_html.lower())
@mock.patch("paperless_mail.parsers.requests.post")
def test_generate_pdf_from_html(self, mock_post: mock.MagicMock):
    """
    GIVEN:
        - Fresh start, the HTTP post is mocked
    WHEN:
        - generating pdf from html with inline attachments is attempted
    THEN:
        - gotenberg is called with the correct parameters and the
          response content is returned
    """

    class MailAttachmentMock:
        """Minimal attachment stand-in carrying a payload and a content id."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    gotenberg_reply = mock.MagicMock()
    gotenberg_reply.content = b"Content"
    mock_post.return_value = gotenberg_reply

    html_path = os.path.join(self.SAMPLE_FILES, "sample.html")
    png_path = os.path.join(self.SAMPLE_FILES, "sample.png")
    with open(html_path) as html_file:
        with open(png_path, "rb") as png_file:
            html = html_file.read()
            png = png_file.read()

    inline_image = MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de")
    result = self.parser.generate_pdf_from_html(html, [inline_image])

    # Inspect the request that was sent to gotenberg.
    convert_url = self.parser.gotenberg_server + "/forms/chromium/convert/html"
    self.assertEqual(convert_url, mock_post.call_args.args[0])
    self.assertEqual({}, mock_post.call_args.kwargs["headers"])

    expected_form_data = {
        "marginTop": "0.1",
        "marginBottom": "0.1",
        "marginLeft": "0.1",
        "marginRight": "0.1",
        "paperWidth": "8.27",
        "paperHeight": "11.7",
        "scale": "1.0",
    }
    self.assertEqual(expected_form_data, mock_post.call_args.kwargs["data"])

    # Calling read() proves that both entries are file like objects.
    image_entry = mock_post.call_args.kwargs["files"][
        "cidpart1pNdUSz0sD3NqVtPgexamplede"
    ]
    image_entry[1].read()
    index_entry = mock_post.call_args.kwargs["files"]["index.html"]
    index_entry[1].read()

    gotenberg_reply.raise_for_status.assert_called_once()
    self.assertEqual(b"Content", result)

View File

@ -0,0 +1,361 @@
import os
from unittest import mock
from urllib.error import HTTPError
from urllib.request import urlopen
import pytest
from django.test import TestCase
from documents.parsers import ParseError
from documents.parsers import run_convert
from imagehash import average_hash
from paperless_mail.parsers import MailDocumentParser
from pdfminer.high_level import extract_text
from PIL import Image
class TestParserLive(TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def setUp(self) -> None:
    """Create a new MailDocumentParser for the test about to run."""
    parser = MailDocumentParser(logging_group=None)
    self.parser = parser
def tearDown(self) -> None:
    """Call the parser's cleanup() after the test has run."""
    parser = self.parser
    parser.cleanup()
@staticmethod
def imagehash(file, hash_size=18):
    """
    Return the average hash of an image file as a string.

    Args:
        file: the image to open, passed on to PIL's Image.open
        hash_size: size parameter handed to average_hash

    Returns:
        The string form of the computed hash.
    """
    # Open the image in a context manager so its file handle is released
    # once the hash has been computed.
    with Image.open(file) as image:
        return f"{average_hash(image, hash_size)}"
# Skipped whenever PAPERLESS_TEST_SKIP_CONVERT is present in the environment
@pytest.mark.skipif(
    "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
    reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
def test_get_thumbnail(self, mock_generate_pdf: mock.MagicMock):
    """
    GIVEN:
        - Fresh start, generate_pdf is mocked to return a sample pdf
    WHEN:
        - The Thumbnail is requested
    THEN:
        - The returned thumbnail image file is as expected
    """
    samples = self.SAMPLE_FILES
    mock_generate_pdf.return_value = os.path.join(samples, "simple_text.eml.pdf")

    thumb = self.parser.get_thumbnail(
        os.path.join(samples, "simple_text.eml"),
        "message/rfc822",
    )
    self.assertTrue(os.path.isfile(thumb))

    # Compare image hashes of the created and the stored thumbnail.
    expected = os.path.join(samples, "simple_text.eml.pdf.webp")
    created_hash = self.imagehash(thumb)
    reference_hash = self.imagehash(expected)
    self.assertEqual(
        created_hash,
        reference_hash,
        f"Created Thumbnail {thumb} differs from expected file {expected}",
    )
@pytest.mark.skipif(
    "TIKA_LIVE" not in os.environ,
    reason="No tika server",
)
def test_tika_parse_successful(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - tika parsing is called
    THEN:
        - the stripped reply equals the text of the html body
    """
    html = (
        "<html><head>"
        '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        "</head><body><p>Some Text</p></body></html>"
    )

    # Successful parsing yields the paragraph text.
    reply = self.parser.tika_parse(html)

    self.assertEqual("Some Text", reply.strip())
@pytest.mark.skipif(
    "TIKA_LIVE" not in os.environ,
    reason="No tika server",
)
def test_tika_parse_unsuccessful(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - tika parsing is called with None as content
    THEN:
        - the parser should return an empty string
    """
    # Unsuccessful parsing must not yield any text.
    self.assertEqual("", self.parser.tika_parse(None))
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail")
@mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html")
def test_generate_pdf_gotenberg_merging(
    self,
    mock_generate_pdf_from_html: mock.MagicMock,
    mock_generate_pdf_from_mail: mock.MagicMock,
):
    """
    GIVEN:
        - Intermediary pdfs to be merged
    WHEN:
        - pdf generation is requested with html file requiring merging of pdfs
    THEN:
        - the resulting file contains the text of the first pdf followed
          by the text of the second pdf
    """
    # Each mocked generator returns the bytes of one sample pdf.
    sources = (
        (mock_generate_pdf_from_mail, "first.pdf"),
        (mock_generate_pdf_from_html, "second.pdf"),
    )
    for generator, sample_name in sources:
        with open(os.path.join(self.SAMPLE_FILES, sample_name), "rb") as handle:
            generator.return_value = handle.read()

    pdf_path = self.parser.generate_pdf(os.path.join(self.SAMPLE_FILES, "html.eml"))
    self.assertTrue(os.path.isfile(pdf_path))

    expected = (
        "first\tPDF\tto\tbe\tmerged.\n\n\x0csecond\tPDF\tto\tbe\tmerged.\n\n\x0c"
    )
    self.assertEqual(expected, extract_text(pdf_path))
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_generate_pdf_from_mail_no_convert(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - pdf generation from the "html.eml" sample is requested
    THEN:
        - the text extracted from the created pdf equals the text of the
          stored "html.eml.pdf" sample
    """
    mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml"))

    # Persist the generated pdf in the parser's temp dir for extraction.
    pdf_path = os.path.join(self.parser.tempdir, "html.eml.pdf")
    with open(pdf_path, "wb") as target:
        pdf_bytes = self.parser.generate_pdf_from_mail(mail)
        target.write(pdf_bytes)

    reference_pdf = os.path.join(self.SAMPLE_FILES, "html.eml.pdf")
    extracted = extract_text(pdf_path)
    expected = extract_text(reference_pdf)
    self.assertEqual(expected, extracted)
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
# Skipped whenever PAPERLESS_TEST_SKIP_CONVERT is present in the environment
@pytest.mark.skipif(
    "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
    reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
def test_generate_pdf_from_mail(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - pdf generation from the "html.eml" sample is requested
    THEN:
        - the created pdf, converted to an image, has the same image hash
          as the stored "html.eml.pdf.webp" sample
    """
    mail = self.parser.get_parsed(os.path.join(self.SAMPLE_FILES, "html.eml"))

    workdir = self.parser.tempdir
    pdf_path = os.path.join(workdir, "html.eml.pdf")
    with open(pdf_path, "wb") as target:
        pdf_bytes = self.parser.generate_pdf_from_mail(mail)
        target.write(pdf_bytes)

    converted = os.path.join(workdir, "html.eml.pdf.webp")
    convert_options = dict(
        density=300,
        scale="500x5000>",
        alpha="remove",
        strip=True,
        trim=False,
        auto_orient=True,
        # No page index is given, so that all pages are converted.
        input_file=pdf_path,
        output_file=converted,
        logging_group=None,
    )
    run_convert(**convert_options)
    self.assertTrue(os.path.isfile(converted))

    # The created pdf is not reproducible, but the converted image should
    # always look the same.
    thumb_hash = self.imagehash(converted)
    reference_image = os.path.join(self.SAMPLE_FILES, "html.eml.pdf.webp")
    expected_hash = self.imagehash(reference_image)
    self.assertEqual(
        thumb_hash,
        expected_hash,
        f"PDF looks different. Check if {converted} looks weird.",
    )
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_generate_pdf_from_html_no_convert(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - pdf generation from the "sample.html" file with an inline image
          is requested
    THEN:
        - the text extracted from the created pdf equals the text of the
          stored "sample.html.pdf" sample
    """

    class MailAttachmentMock:
        """Minimal attachment stand-in carrying a payload and a content id."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    html_path = os.path.join(self.SAMPLE_FILES, "sample.html")
    png_path = os.path.join(self.SAMPLE_FILES, "sample.png")
    with open(html_path) as html_file:
        with open(png_path, "rb") as png_file:
            html = html_file.read()
            png = png_file.read()

    inline_image = MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de")
    result = self.parser.generate_pdf_from_html(html, [inline_image])

    # Persist the generated pdf in the parser's temp dir for extraction.
    pdf_path = os.path.join(self.parser.tempdir, "sample.html.pdf")
    with open(pdf_path, "wb") as target:
        target.write(result)

    reference_pdf = os.path.join(self.SAMPLE_FILES, "sample.html.pdf")
    extracted = extract_text(pdf_path)
    expected = extract_text(reference_pdf)
    self.assertEqual(expected, extracted)
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
# Skipped whenever PAPERLESS_TEST_SKIP_CONVERT is present in the environment
@pytest.mark.skipif(
    "PAPERLESS_TEST_SKIP_CONVERT" in os.environ,
    reason="PAPERLESS_TEST_SKIP_CONVERT set, skipping Test",
)
def test_generate_pdf_from_html(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - pdf generation from the "sample.html" file with an inline image
          is requested
    THEN:
        - the created pdf, converted to an image, has the same image hash
          as the stored "sample.html.pdf.webp" sample
    """

    class MailAttachmentMock:
        """Minimal attachment stand-in carrying a payload and a content id."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    html_path = os.path.join(self.SAMPLE_FILES, "sample.html")
    png_path = os.path.join(self.SAMPLE_FILES, "sample.png")
    with open(html_path) as html_file:
        with open(png_path, "rb") as png_file:
            html = html_file.read()
            png = png_file.read()

    inline_image = MailAttachmentMock(png, "part1.pNdUSz0s.D3NqVtPg@example.de")
    result = self.parser.generate_pdf_from_html(html, [inline_image])

    workdir = self.parser.tempdir
    pdf_path = os.path.join(workdir, "sample.html.pdf")
    with open(pdf_path, "wb") as target:
        target.write(result)

    converted = os.path.join(workdir, "sample.html.pdf.webp")
    convert_options = dict(
        density=300,
        scale="500x5000>",
        alpha="remove",
        strip=True,
        trim=False,
        auto_orient=True,
        # No page index is given, so that all pages are converted.
        input_file=pdf_path,
        output_file=converted,
        logging_group=None,
    )
    run_convert(**convert_options)
    self.assertTrue(os.path.isfile(converted))

    # The created pdf is not reproducible, but the converted image should
    # always look the same.
    thumb_hash = self.imagehash(converted)
    reference_image = os.path.join(self.SAMPLE_FILES, "sample.html.pdf.webp")
    expected_hash = self.imagehash(reference_image)
    self.assertEqual(
        thumb_hash,
        expected_hash,
        f"PDF looks different. Check if {converted} looks weird. "
        f"If Rick Astley is shown, Gotenberg loads from web which is bad for Mail content.",
    )
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_online_image_exception_on_not_available(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - nonexistent image is requested
    THEN:
        - An exception shall be thrown

    A public image is used in the html sample file. We have no control
    whether this image stays online forever, so here we check that a
    missing image can be detected.
    """
    missing_image_url = (
        "https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png"
    )

    # Requesting a nonexistent URL has to raise an HTTPError.
    with self.assertRaises(HTTPError):
        urlopen(missing_image_url)
@pytest.mark.skipif(
    "GOTENBERG_LIVE" not in os.environ,
    reason="No gotenberg server",
)
def test_is_online_image_still_available(self):
    """
    GIVEN:
        - Fresh start
    WHEN:
        - A public image used in the html sample file is requested
    THEN:
        - No exception shall be thrown

    A public image is used in the html sample file. We have no control
    whether this image stays online forever, so here we check if it is
    still there.
    """
    # Now check the URL used in samples/sample.html. The response is opened
    # in a context manager so the connection is closed again; getting into
    # the block at all means the request raised no exception.
    with urlopen("https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png"):
        pass