some more tests.

This commit is contained in:
jonaswinkler 2020-12-01 14:15:43 +01:00
parent 24b8c358cc
commit fd3df1ec58
5 changed files with 441 additions and 55 deletions

298
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "d266e1f67e3090ec68aa8ecba1e8373351daf89ad5a5ab46524d123bcaf29f62" "sha256": "55c9136777e78d6cd362628cd1fc0c5ff36b437699b92089ce504d598004371d"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@ -44,6 +44,94 @@
], ],
"version": "==1.17.12" "version": "==1.17.12"
}, },
"cffi": {
"hashes": [
"sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e",
"sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d",
"sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a",
"sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec",
"sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362",
"sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668",
"sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c",
"sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b",
"sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654",
"sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06",
"sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698",
"sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2",
"sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c",
"sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7",
"sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009",
"sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03",
"sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b",
"sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909",
"sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53",
"sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35",
"sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26",
"sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b",
"sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb",
"sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293",
"sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd",
"sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d",
"sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3",
"sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5",
"sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d",
"sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca",
"sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d",
"sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775",
"sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375",
"sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b",
"sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b",
"sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f"
],
"version": "==1.14.4"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"markers": "python_version >= '3.1'",
"version": "==3.0.4"
},
"coloredlogs": {
"hashes": [
"sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a",
"sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505",
"sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==14.0"
},
"cryptography": {
"hashes": [
"sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538",
"sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f",
"sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d",
"sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77",
"sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b",
"sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33",
"sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e",
"sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb",
"sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e",
"sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b",
"sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7",
"sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297",
"sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d",
"sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7",
"sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b",
"sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7",
"sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4",
"sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8",
"sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b",
"sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851",
"sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13",
"sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b",
"sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3",
"sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.2.1"
},
"dateparser": { "dateparser": {
"hashes": [ "hashes": [
"sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b",
@ -123,6 +211,14 @@
"index": "pypi", "index": "pypi",
"version": "==20.0.4" "version": "==20.0.4"
}, },
"humanfriendly": {
"hashes": [
"sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12",
"sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==8.2"
},
"imap-tools": { "imap-tools": {
"hashes": [ "hashes": [
"sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246",
@ -131,6 +227,13 @@
"index": "pypi", "index": "pypi",
"version": "==0.32.0" "version": "==0.32.0"
}, },
"img2pdf": {
"hashes": [
"sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5",
"sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9"
],
"version": "==0.4.0"
},
"inotify-simple": { "inotify-simple": {
"hashes": [ "hashes": [
"sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128", "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128",
@ -164,6 +267,51 @@
"index": "pypi", "index": "pypi",
"version": "==1.0.8" "version": "==1.0.8"
}, },
"lxml": {
"hashes": [
"sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
"sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
"sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
"sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
"sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
"sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
"sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
"sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
"sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
"sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
"sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
"sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
"sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
"sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
"sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
"sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
"sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
"sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
"sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
"sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
"sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
"sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
"sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
"sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
"sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
"sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
"sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
"sha256:91d6dace31b07ab47eeadd3f4384ded2f77b94b30446410cb2c3e660e047f7a7",
"sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
"sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
"sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
"sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
"sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
"sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
"sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
"sha256:e1dbb88a937126ab14d219a000728224702e0ec0fc7ceb7131c53606b7a76772",
"sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
"sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
"sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.6.2"
},
"numpy": { "numpy": {
"hashes": [ "hashes": [
"sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db",
@ -205,6 +353,14 @@
"markers": "python_version >= '3.6'", "markers": "python_version >= '3.6'",
"version": "==1.19.4" "version": "==1.19.4"
}, },
"ocrmypdf": {
"hashes": [
"sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3",
"sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928"
],
"index": "pypi",
"version": "==11.3.3"
},
"pathtools": { "pathtools": {
"hashes": [ "hashes": [
"sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0",
@ -220,6 +376,14 @@
"index": "pypi", "index": "pypi",
"version": "==2.3.0" "version": "==2.3.0"
}, },
"pdfminer.six": {
"hashes": [
"sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a",
"sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509"
],
"markers": "python_version >= '3.4'",
"version": "==20201018"
},
"pdftotext": { "pdftotext": {
"hashes": [ "hashes": [
"sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6"
@ -227,6 +391,33 @@
"index": "pypi", "index": "pypi",
"version": "==2.1.5" "version": "==2.1.5"
}, },
"pikepdf": {
"hashes": [
"sha256:0829bd5dacd73bb4a37e7575bae523f49603479755563c92ddb55c206700cab1",
"sha256:0d2b631077cd6af6e4d1b396208020705842610a6f13fab489d5f9c47916baa2",
"sha256:21c98af08fae4ac9fbcad02b613b6768a4ca300fda4cba867f4a4b6f73c2d04b",
"sha256:2240372fed30124ddc35b0c15a613f2b687a426ea2f150091e0a0c58cca7a495",
"sha256:2a97f5f1403e058d217d7f6861cf51fca200c5687bce0d052f5f2fa89b5bfa22",
"sha256:3faaefca0ae80d19891acec8b0dd5e6235f59f2206d82375eb80d090285e9557",
"sha256:48ef45b64882901c0d69af3b85d16a19bd0f3e95b43e614fefb53521d8caf36c",
"sha256:5212fe41f2323fc7356ba67caa39737fe13080562cff37bcbb74a8094076c8d0",
"sha256:56859c32170663c57bd0658189ce44e180533eebe813853446cd6413810be9eb",
"sha256:5f8fd1cb3478c5534222018aca24fbbd2bc74460c899bda988ec76722c13caa9",
"sha256:74300a32c41b3d578772f6933f23a88b19f74484185e71e5225ce2f7ea5aea78",
"sha256:8cbc946bdd217148f4a9c029fcea62f4ae0f67d5346de4c865f4718cd0ddc37f",
"sha256:9ceefd30076f732530cf84a1be2ecb2fa9931af932706ded760a6d37c73b96ad",
"sha256:ad69c170fda41b07a4c6b668a3128e7a759f50d9aebcfcde0ccff1358abe0423",
"sha256:b715fe182189fb6870fab5b0383bb2fb278c88c46eade346b0f4c1ed8818c09d",
"sha256:bb01ecf95083ffcb9ad542dc5342ccc1059e46f1395fd966629d36d9cc766b4a",
"sha256:bd6328547219cf48cefb4e0a1bc54442910594de1c5a5feae847d9ff3c629031",
"sha256:edb128379bb1dea76b5bdbdacf5657a6e4754bacc2049640762725590d8ed905",
"sha256:f8e687900557fcd4c51b4e72b9e337fdae9e2c81049d1d80b624bb2e88b5769d",
"sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
"sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
],
"markers": "python_version < '3.9'",
"version": "==2.2.0"
},
"pillow": { "pillow": {
"hashes": [ "hashes": [
"sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a",
@ -262,6 +453,14 @@
"index": "pypi", "index": "pypi",
"version": "==8.0.1" "version": "==8.0.1"
}, },
"pluggy": {
"hashes": [
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.13.1"
},
"psycopg2-binary": { "psycopg2-binary": {
"hashes": [ "hashes": [
"sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c",
@ -305,13 +504,13 @@
"index": "pypi", "index": "pypi",
"version": "==2.8.6" "version": "==2.8.6"
}, },
"pyocr": { "pycparser": {
"hashes": [ "hashes": [
"sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179", "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
"sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301" "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
], ],
"index": "pypi", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.7.2" "version": "==2.20"
}, },
"python-dateutil": { "python-dateutil": {
"hashes": [ "hashes": [
@ -419,6 +618,53 @@
], ],
"version": "==2020.11.13" "version": "==2020.11.13"
}, },
"reportlab": {
"hashes": [
"sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540",
"sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c",
"sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030",
"sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f",
"sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5",
"sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e",
"sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89",
"sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2",
"sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f",
"sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969",
"sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53",
"sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d",
"sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d",
"sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457",
"sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec",
"sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f",
"sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778",
"sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92",
"sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6",
"sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077",
"sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673",
"sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b",
"sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48",
"sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95",
"sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94",
"sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589",
"sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9",
"sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8",
"sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d",
"sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32",
"sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b",
"sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a",
"sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2",
"sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967",
"sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5",
"sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437",
"sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8",
"sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab",
"sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e",
"sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a",
"sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8",
"sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08"
],
"version": "==3.5.55"
},
"scikit-learn": { "scikit-learn": {
"hashes": [ "hashes": [
"sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e",
@ -482,6 +728,13 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.15.0" "version": "==1.15.0"
}, },
"sortedcontainers": {
"hashes": [
"sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f",
"sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1"
],
"version": "==2.3.0"
},
"sqlparse": { "sqlparse": {
"hashes": [ "hashes": [
"sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
@ -498,6 +751,14 @@
"markers": "python_version >= '3.5'", "markers": "python_version >= '3.5'",
"version": "==2.1.0" "version": "==2.1.0"
}, },
"tqdm": {
"hashes": [
"sha256:5c0d04e06ccc0da1bd3fa5ae4550effcce42fcad947b4a6cafa77bdc9b09ff22",
"sha256:9e7b8ab0ecbdbf0595adadd5f0ebbb9e69010e0bd48bbb0c15e550bf2a5292df"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==4.54.0"
},
"tzlocal": { "tzlocal": {
"hashes": [ "hashes": [
"sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
@ -589,6 +850,7 @@
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
], ],
"markers": "python_version >= '3.1'",
"version": "==3.0.4" "version": "==3.0.4"
}, },
"coverage": { "coverage": {
@ -711,22 +973,6 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.2.0" "version": "==1.2.0"
}, },
"importlib-metadata": {
"hashes": [
"sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132",
"sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c"
],
"markers": "python_version < '3.8'",
"version": "==2.1.0"
},
"importlib-resources": {
"hashes": [
"sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592",
"sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5"
],
"markers": "python_version < '3.7'",
"version": "==3.3.0"
},
"iniconfig": { "iniconfig": {
"hashes": [ "hashes": [
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@ -1038,14 +1284,6 @@
], ],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==20.2.1" "version": "==20.2.1"
},
"zipp": {
"hashes": [
"sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
"sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
],
"markers": "python_version < '3.8'",
"version": "==3.4.0"
} }
} }
} }

View File

@ -177,11 +177,13 @@ PAPERLESS_OCR_LANGUAGE=<lang>
Defaults to "eng". Defaults to "eng".
PAPERLESS_OCR_MODE=<mode> PAPERLESS_OCR_MODE=<mode>
Tell paperless when and how to perform ocr on your documents. Three modes Tell paperless when and how to perform ocr on your documents. Four modes
are available: are available:
* ``skip``: Paperless skips all pages and will perform ocr only on pages * ``skip``: Paperless skips all pages and will perform ocr only on pages
where no text is present. This is the safest and fastest option. where no text is present. This is the safest and fastest option.
* ``skip_noarchive``: Behaves like ``skip``, but in addition paperless won't
create an archived version of a document when it finds any text in it.
* ``redo``: Paperless will OCR all pages of your documents and attempt to * ``redo``: Paperless will OCR all pages of your documents and attempt to
replace any existing text layers with new text. This will be useful for replace any existing text layers with new text. This will be useful for
documents from scanners that already performed OCR with insufficient documents from scanners that already performed OCR with insufficient

View File

@ -10,7 +10,6 @@ def create_source_path_directory(source_path):
os.makedirs(os.path.dirname(source_path), exist_ok=True) os.makedirs(os.path.dirname(source_path), exist_ok=True)
# TODO: also make this work for archive dir
def delete_empty_directories(directory, root): def delete_empty_directories(directory, root):
if not os.path.isdir(directory): if not os.path.isdir(directory):
return return

View File

@ -2,32 +2,17 @@ import os
import shutil import shutil
from pathlib import Path from pathlib import Path
from unittest import mock from unittest import mock
from uuid import uuid4
from django.conf import settings from django.conf import settings
from django.db import DatabaseError from django.db import DatabaseError
from django.test import TestCase, override_settings from django.test import TestCase, override_settings
from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent from ..models import Document, Correspondent
class TestDate(TestCase): class TestFileHandling(DirectoriesMixin, TestCase):
deletion_list = []
def add_to_deletion_list(self, dirname):
self.deletion_list.append(dirname)
def setUp(self):
folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
os.makedirs(folder + "/documents/originals")
override_settings(MEDIA_ROOT=folder).enable()
override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
self.add_to_deletion_list(folder)
def tearDown(self):
for dirname in self.deletion_list:
shutil.rmtree(dirname, ignore_errors=True)
@override_settings(PAPERLESS_FILENAME_FORMAT="") @override_settings(PAPERLESS_FILENAME_FORMAT="")
def test_generate_source_filename(self): def test_generate_source_filename(self):
@ -104,7 +89,7 @@ class TestDate(TestCase):
document.save() document.save()
# Check proper handling of files # Check proper handling of files
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@ -140,7 +125,7 @@ class TestDate(TestCase):
# Check proper handling of files # Check proper handling of files
self.assertTrue(os.path.isfile(document.source_path)) self.assertTrue(os.path.isfile(document.source_path))
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
@ -196,8 +181,8 @@ class TestDate(TestCase):
document.save() document.save()
# Check proper handling of files # Check proper handling of files
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True) self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
self.assertTrue(os.path.isfile(important_file)) self.assertTrue(os.path.isfile(important_file))
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
@ -315,7 +300,6 @@ class TestDate(TestCase):
# Create our working directory # Create our working directory
tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
os.makedirs(tmp) os.makedirs(tmp)
self.add_to_deletion_list(tmp)
os.makedirs(os.path.join(tmp, "notempty")) os.makedirs(os.path.join(tmp, "notempty"))
Path(os.path.join(tmp, "notempty", "file")).touch() Path(os.path.join(tmp, "notempty", "file")).touch()
@ -345,3 +329,159 @@ class TestDate(TestCase):
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf") self.assertEqual(generate_filename(document), "0000001.pdf")
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
    # Checks what happens to the original file and its archived counterpart
    # on disk when a Document is created, renamed or deleted.

    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
    def test_create_no_format(self):
        # With no filename format set, both files must remain where they are.
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        for path in (original, archive):
            Path(path).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_create_with_format(self):
        # With a format set, both files must leave their initial location
        # and show up under the formatted name.
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        for path in (original, archive):
            Path(path).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        self.assertFalse(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

        expected_name = "my_doc-0000001.pdf"
        self.assertEqual(
            doc.source_path,
            os.path.join(settings.ORIGINALS_DIR, "none", expected_name),
        )
        self.assertEqual(
            doc.archive_path,
            os.path.join(settings.ARCHIVE_DIR, "none", expected_name),
        )

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_archive_gone(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        # Only the original exists; the archive file is deliberately never
        # created for this scenario.
        Path(original).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        self.assertTrue(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertFalse(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_archive_exists(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()

        # Occupy the archive path the formatted name would resolve to.
        occupied_dir = os.path.join(settings.ARCHIVE_DIR, "none")
        os.makedirs(occupied_dir)
        Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.os.rename")
    def test_move_archive_error(self, rename_mock):

        def fake_rename(src, dst):
            # Fail for any source path containing "archive"; otherwise
            # emulate the move by deleting src and touching dst.
            if "archive" in src:
                raise OSError()
            os.remove(src)
            Path(dst).touch()

        rename_mock.side_effect = fake_rename

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_file_gone(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        # Only the archive exists; the original file is deliberately never
        # created for this scenario.
        Path(archive).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        self.assertFalse(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertFalse(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.os.rename")
    def test_move_file_error(self, rename_mock):

        def fake_rename(src, dst):
            # Fail for any source path containing "original"; otherwise
            # emulate the move by deleting src and touching dst.
            if "original" in src:
                raise OSError()
            os.remove(src)
            Path(dst).touch()

        rename_mock.side_effect = fake_rename

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    def test_archive_deleted(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()

        doc = Document.objects.create(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        # Both files are present before the document is deleted ...
        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

        doc.delete()

        # ... and both must be gone afterwards.
        self.assertFalse(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertFalse(os.path.isfile(doc.source_path))
        self.assertFalse(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_database_error(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()

        doc = Document(
            mime_type="application/pdf",
            title="my_doc",
            filename="0000001.pdf",
            checksum="A",
            archive_checksum="B",
        )

        # Make the handler's Document.objects.filter call raise while saving.
        with mock.patch("documents.signals.handlers.Document.objects.filter") as filter_mock:
            filter_mock.side_effect = DatabaseError()
            doc.save()

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

View File

@ -80,6 +80,12 @@ class RasterisedDocumentParser(DocumentParser):
return None return None
def parse(self, document_path, mime_type): def parse(self, document_path, mime_type):
if settings.OCR_MODE == "skip_noarchive":
text = get_text_from_pdf(document_path)
if text and len(text) > 50:
self.text = text
return
archive_path = os.path.join(self.tempdir, "archive.pdf") archive_path = os.path.join(self.tempdir, "archive.pdf")
ocr_args = { ocr_args = {
@ -96,7 +102,7 @@ class RasterisedDocumentParser(DocumentParser):
if settings.OCR_PAGES > 0: if settings.OCR_PAGES > 0:
ocr_args['pages'] = f"1-{settings.OCR_PAGES}" ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
if settings.OCR_MODE == 'skip': if settings.OCR_MODE in ['skip', 'skip_noarchive']:
ocr_args['skip_text'] = True ocr_args['skip_text'] = True
elif settings.OCR_MODE == 'redo': elif settings.OCR_MODE == 'redo':
ocr_args['redo_ocr'] = True ocr_args['redo_ocr'] = True
@ -184,6 +190,7 @@ def get_text_from_pdf(pdf_file):
try: try:
pdf = pdftotext.PDF(f) pdf = pdftotext.PDF(f)
except pdftotext.Error: except pdftotext.Error:
# might not be a PDF file
return None return None
text = "\n".join(pdf) text = "\n".join(pdf)