reworked PDF parser that uses OCRmyPDF and produces archive files.

This commit is contained in:
Jonas Winkler 2020-11-25 14:50:43 +01:00
parent 95ec520f13
commit 2d559d330d
7 changed files with 374 additions and 186 deletions

View File

@ -23,7 +23,6 @@ langdetect = "*"
pdftotext = "*"
pathvalidate = "*"
pillow = "*"
pyocr = "~=0.7.2"
python-gnupg = "*"
python-dotenv = "*"
python-dateutil = "*"
@ -35,6 +34,7 @@ scikit-learn="~=0.23.2"
whitenoise = "~=5.2.0"
watchdog = "*"
whoosh="~=2.7.4"
ocrmypdf = "*"
[dev-packages]
coveralls = "*"

292
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244"
"sha256": "cf1c008df0080c01273c032aef59bd841e4f27b66beaf3fa459665a7a7a4fcc4"
},
"pipfile-spec": 6,
"requires": {},
@ -42,6 +42,94 @@
],
"version": "==1.17.11"
},
"cffi": {
"hashes": [
"sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e",
"sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d",
"sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a",
"sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec",
"sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362",
"sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668",
"sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c",
"sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b",
"sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654",
"sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06",
"sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698",
"sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2",
"sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c",
"sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7",
"sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009",
"sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03",
"sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b",
"sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909",
"sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53",
"sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35",
"sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26",
"sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b",
"sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb",
"sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293",
"sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd",
"sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d",
"sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3",
"sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5",
"sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d",
"sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca",
"sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d",
"sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775",
"sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375",
"sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b",
"sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b",
"sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f"
],
"version": "==1.14.4"
},
"chardet": {
"hashes": [
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"markers": "python_version >= '3.1'",
"version": "==3.0.4"
},
"coloredlogs": {
"hashes": [
"sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a",
"sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505",
"sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==14.0"
},
"cryptography": {
"hashes": [
"sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538",
"sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f",
"sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d",
"sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77",
"sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b",
"sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33",
"sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e",
"sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb",
"sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e",
"sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b",
"sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7",
"sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297",
"sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d",
"sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7",
"sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b",
"sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7",
"sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4",
"sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8",
"sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b",
"sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851",
"sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13",
"sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b",
"sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3",
"sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.2.1"
},
"dateparser": {
"hashes": [
"sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b",
@ -121,6 +209,14 @@
"index": "pypi",
"version": "==20.0.4"
},
"humanfriendly": {
"hashes": [
"sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12",
"sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==8.2"
},
"imap-tools": {
"hashes": [
"sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246",
@ -129,6 +225,13 @@
"index": "pypi",
"version": "==0.32.0"
},
"img2pdf": {
"hashes": [
"sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5",
"sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9"
],
"version": "==0.4.0"
},
"joblib": {
"hashes": [
"sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
@ -146,6 +249,51 @@
"index": "pypi",
"version": "==1.0.8"
},
"lxml": {
"hashes": [
"sha256:098fb713b31050463751dcc694878e1d39f316b86366fb9fe3fbbe5396ac9fab",
"sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b",
"sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5",
"sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301",
"sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b",
"sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d",
"sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b",
"sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9",
"sha256:288ddf94d9d0488187f578fdcc1868af2a6fe6714444c8259b68a83fa27b76d2",
"sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b",
"sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311",
"sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891",
"sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a",
"sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1",
"sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856",
"sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810",
"sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51",
"sha256:4e006fdb434609956a8f710ffffe650afab414dc43728786ebdbdca48e179b14",
"sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360",
"sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4",
"sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f",
"sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230",
"sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a",
"sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f",
"sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174",
"sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf",
"sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd",
"sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3",
"sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a",
"sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5",
"sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367",
"sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c",
"sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1",
"sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8",
"sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f",
"sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc",
"sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d",
"sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9",
"sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==4.6.1"
},
"numpy": {
"hashes": [
"sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db",
@ -187,6 +335,14 @@
"markers": "python_version >= '3.6'",
"version": "==1.19.4"
},
"ocrmypdf": {
"hashes": [
"sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3",
"sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928"
],
"index": "pypi",
"version": "==11.3.3"
},
"pathtools": {
"hashes": [
"sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0",
@ -202,6 +358,14 @@
"index": "pypi",
"version": "==2.3.0"
},
"pdfminer.six": {
"hashes": [
"sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a",
"sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509"
],
"markers": "python_version >= '3.4'",
"version": "==20201018"
},
"pdftotext": {
"hashes": [
"sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6"
@ -209,6 +373,33 @@
"index": "pypi",
"version": "==2.1.5"
},
"pikepdf": {
"hashes": [
"sha256:0dd42f791f29e7e2ab120103605b9ddd65937c773a72d21341a56873a89e76c9",
"sha256:12a1d243143cf972ce11def50f0bd1f6e630f5e660cdeddb2c7c49db5adad40a",
"sha256:2e1713af11b71e95c2d218c10d68b6f8e813be19c8596c560f3c84617f6d5437",
"sha256:2f90acad26d9939193946eb6ca8363fd3cf44b46b5c1409468906618bccb8113",
"sha256:3c482fe30fd58ff385795605a9233f37f97fb83427c3e829b1a568a2a3b59f60",
"sha256:3ddabfc33a8a7cecba76c1685ce5125fdf239a38d0854d7c2a703490b5783773",
"sha256:61dd3f13b7416111d19bf493ce4e7281f63a1dd22c532200cbbcd65813ea43e4",
"sha256:6ce42b7780835fb52452ccaff3a3ac1b28ae1f9d80faab59c559045d9fcb211d",
"sha256:6dba75782f108ebbf3947fcb29ea0ba7da0482868e53f6602643adc36245201d",
"sha256:716427a5c0372f3cc7dc282c4b49d49d8d5182a3e937739a4c3632151e74d6a4",
"sha256:730ef4013099da7ea722a9b5659260097af6f47ddfa3c2abab4d4493de2591f3",
"sha256:73e14bba4135adfb89ae2f2163369bd788ecf23839acc8d062d832118f07e288",
"sha256:84df07acc8968051da33891af55a3ab1aa55453d83df4ce9b84d821eedc34583",
"sha256:8f739e9c660d71cd479f11f9aa110857cf0d0d9c2472f40bbcbaf02f980355a1",
"sha256:a20ca7adbb9d3da416cf5f6de0ebca53855f9a3b99acdd6ec864c61482894d71",
"sha256:bc58d9486c0959619a2584e558a54d36468c6d1165cd9fe0bfb1ecc3e6b33c6a",
"sha256:c0627930a17b3a5e1a7c9109099535259afc50fe006a05af9c3634de05abd318",
"sha256:de5f445eaaadd7dae56e1043ab8ca5eef49ece302a4e37e1fc6d21b7dcfcfb1b",
"sha256:de6aae7782db33f2cc71c9ba63b7e2ec0e0529843c065eac4e71fcbe043426e2",
"sha256:e2efd844c09f8ce3103a93bfbd54983542a0a63c88bdc0f0cdbb2997f99a147d",
"sha256:fdb481ad1219e8d667625afd2f01b26f98df079e4f66e7e49816ec20c8d8c401"
],
"markers": "python_version < '3.9'",
"version": "==2.1.2"
},
"pillow": {
"hashes": [
"sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a",
@ -244,6 +435,14 @@
"index": "pypi",
"version": "==8.0.1"
},
"pluggy": {
"hashes": [
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==0.13.1"
},
"psycopg2-binary": {
"hashes": [
"sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c",
@ -287,13 +486,13 @@
"index": "pypi",
"version": "==2.8.6"
},
"pyocr": {
"pycparser": {
"hashes": [
"sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179",
"sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301"
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
],
"index": "pypi",
"version": "==0.7.2"
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.20"
},
"python-dateutil": {
"hashes": [
@ -401,6 +600,53 @@
],
"version": "==2020.11.13"
},
"reportlab": {
"hashes": [
"sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540",
"sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c",
"sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030",
"sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f",
"sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5",
"sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e",
"sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89",
"sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2",
"sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f",
"sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969",
"sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53",
"sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d",
"sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d",
"sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457",
"sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec",
"sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f",
"sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778",
"sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92",
"sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6",
"sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077",
"sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673",
"sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b",
"sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48",
"sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95",
"sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94",
"sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589",
"sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9",
"sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8",
"sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d",
"sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32",
"sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b",
"sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a",
"sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2",
"sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967",
"sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5",
"sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437",
"sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8",
"sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab",
"sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e",
"sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a",
"sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8",
"sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08"
],
"version": "==3.5.55"
},
"scikit-learn": {
"hashes": [
"sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e",
@ -464,6 +710,13 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.15.0"
},
"sortedcontainers": {
"hashes": [
"sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f",
"sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1"
],
"version": "==2.3.0"
},
"sqlparse": {
"hashes": [
"sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
@ -480,6 +733,14 @@
"markers": "python_version >= '3.5'",
"version": "==2.1.0"
},
"tqdm": {
"hashes": [
"sha256:3d3f1470d26642e88bd3f73353cb6ff4c51ef7d5d7efef763238f4bc1f7e4e81",
"sha256:5ff3f5232b19fa4c5531641e480b7fad4598819f708a32eb815e6ea41c5fa313"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==4.53.0"
},
"tzlocal": {
"hashes": [
"sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
@ -489,11 +750,11 @@
},
"watchdog": {
"hashes": [
"sha256:034c85530b647486e8c8477410fe79476511282658f2ce496f97106d9e5acfb8",
"sha256:4214e1379d128b0588021880ccaf40317ee156d4603ac388b9adcf29165e0c04"
"sha256:3caefdcc8f06a57fdc5ef2d22aa7c0bfda4f55e71a0bee74cbf3176d97536ef3",
"sha256:e38bffc89b15bafe2a131f0e1c74924cf07dcec020c2e0a26cccd208831fcd43"
],
"index": "pypi",
"version": "==0.10.3"
"version": "==0.10.4"
},
"wcwidth": {
"hashes": [
@ -571,6 +832,7 @@
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
],
"markers": "python_version >= '3.1'",
"version": "==3.0.4"
},
"coverage": {
@ -663,11 +925,11 @@
},
"faker": {
"hashes": [
"sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81",
"sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473"
"sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997",
"sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e"
],
"markers": "python_version >= '3.5'",
"version": "==4.17.0"
"version": "==4.17.1"
},
"filelock": {
"hashes": [
@ -999,11 +1261,11 @@
},
"virtualenv": {
"hashes": [
"sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2",
"sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380"
"sha256:07cff122e9d343140366055f31be4dcd61fd598c69d11cd33a9d9c8df4546dd7",
"sha256:e0aac7525e880a429764cefd3aaaff54afb5d9f25c82627563603f5d7de5a6e5"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==20.1.0"
"version": "==20.2.1"
}
}
}

View File

@ -218,11 +218,37 @@ PAPERLESS_OCR_LANGUAGE=<lang>
Defaults to "eng".
PAPERLESS_OCR_ALWAYS=<bool>
By default Paperless does not OCR a document if the text can be retrieved from
the document directly. Set to true to always OCR documents.
PAPERLESS_OCR_MODE=<mode>
Tell paperless when and how to perform ocr on your documents. Three modes
are available:
Defaults to false.
* ``skip``: Paperless skips all pages and will perform ocr only on pages
where no text is present. This is the safest and fastest option.
* ``redo``: Paperless will OCR all pages of your documents and attempt to
replace any existing text layers with new text. This will be useful for
documents from scanners that already performed OCR with insufficient
results. It will also perform OCR on purely digital documents.
This option may fail on some documents that have features that cannot
be removed, such as forms. In this case, the text from the document is
used instead.
* ``force``: Paperless rasterizes your documents, converting any text
into images and puts the OCRed text on top. This works for all documents,
however, the resulting document may be significantly larger and text
won't appear as sharp when zoomed in.
The default is ``skip``, which only performs OCR when necessary.
PAPERLESS_OCR_OUTPUT_TYPE=<type>
Specify the the type of PDF documents that paperless should produce.
* ``pdf``: Modify the PDF document as little as possible.
* ``pdfa``: Convert PDF documents into PDF/A documents, which is a
subset of the entire PDF specification and meant for storing
documents long term.
If not specified, ``pdfa`` is used. Remember that paperless also keeps
the original input file as well as the archived version.
PAPERLESS_CONSUMER_POLLING=<num>
If paperless won't find documents added to your consume folder, it might

View File

@ -38,7 +38,8 @@
#PAPERLESS_TIME_ZONE=UTC
#PAPERLESS_OCR_PAGES=1
#PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_ALWAYS=false
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
#PAPERLESS_OCR_MODE=skip
#PAPERLESS_CONSUMER_POLLING=10
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
#PAPERLESS_CONVERT_MEMORY_LIMIT=0

View File

@ -107,23 +107,6 @@ def run_convert(input_file,
raise ParseError("Convert failed at {}".format(args))
def run_unpaper(pnm, logging_group=None):
pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
pnm_out)
logger.debug(f"Execute: {' '.join(command_args)}",
extra={'group': logging_group})
if not subprocess.Popen(command_args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL).wait() == 0:
raise ParseError(f"Unpaper failed at {command_args}")
return pnm_out
class ParseError(Exception):
pass

View File

@ -338,9 +338,13 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# OCRmyPDF --output-type options are available.
# TODO: validate this setting.
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
# skip. redo, force
# TODO: validate this.
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")

View File

@ -1,23 +1,14 @@
import itertools
import os
import re
import subprocess
from multiprocessing.pool import ThreadPool
import langdetect
import ocrmypdf
import pdftotext
import pyocr
from PIL import Image
from django.conf import settings
from pyocr import PyocrException
from ocrmypdf import InputFileError
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
run_convert
from .languages import ISO639
class OCRError(Exception):
pass
from documents.parsers import DocumentParser, ParseError, run_convert
class RasterisedDocumentParser(DocumentParser):
@ -29,6 +20,7 @@ class RasterisedDocumentParser(DocumentParser):
def __init__(self, path, logging_group):
super().__init__(path, logging_group)
self._text = None
self._archive_path = None
def get_thumbnail(self):
"""
@ -74,113 +66,67 @@ class RasterisedDocumentParser(DocumentParser):
return out_path
def _is_ocred(self):
# Extract text from PDF using pdftotext
text = get_text_from_pdf(self.document_path)
# We assume, that a PDF with at least 50 characters contains text
# (so no OCR required)
return len(text) > 50
def get_text(self):
if self._text is not None:
if self._text:
return self._text
if not settings.OCR_ALWAYS and self._is_ocred():
self.log("debug", "Skipping OCR, using Text from PDF")
self._text = get_text_from_pdf(self.document_path)
return self._text
archive_path = os.path.join(self.tempdir, "archive.pdf")
images = self._get_greyscale()
ocr_args = {
'input_file': self.document_path,
'output_file': archive_path,
'use_threads': True,
'jobs': settings.THREADS_PER_WORKER,
'language': settings.OCR_LANGUAGE,
'output_type': settings.OCR_OUTPUT_TYPE,
'progress_bar': False,
'clean': True
}
if not images:
raise ParseError("Empty document, nothing to do.")
if settings.OCR_PAGES > 0:
ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
if settings.OCR_MODE == 'skip':
ocr_args['skip_text'] = True
elif settings.OCR_MODE == 'redo':
ocr_args['redo_ocr'] = True
elif settings.OCR_MODE == 'force':
ocr_args['force_ocr'] = True
try:
ocrmypdf.ocr(**ocr_args)
# success! announce that we have an archive document
self._archive_path = archive_path
self._text = get_text_from_pdf(self._archive_path)
sample_page_index = int(len(images) / 2)
self.log(
"debug",
f"Attempting language detection on page "
f"{sample_page_index + 1} of {len(images)}...")
except InputFileError as e:
# This happens with some PDFs when used with the redo_ocr option.
# This is not the end of the world, we'll just use what we already
# have in the document.
self._text = get_text_from_pdf(self.document_path)
# Also, no archived file.
if not self._text:
# However, if we don't have anything, fail:
raise ParseError(e)
sample_page_text = self._ocr([images[sample_page_index]],
settings.OCR_LANGUAGE)[0]
guessed_language = self._guess_language(sample_page_text)
if not guessed_language or guessed_language not in ISO639:
self.log("warning", "Language detection failed.")
ocr_pages = self._complete_ocr_default_language(
images, sample_page_index, sample_page_text)
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
self.log(
"debug",
f"Detected language: {guessed_language} "
f"(default language)")
ocr_pages = self._complete_ocr_default_language(
images, sample_page_index, sample_page_text)
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501
self.log(
"warning",
f"Detected language {guessed_language} is not available "
f"on this system.")
ocr_pages = self._complete_ocr_default_language(
images, sample_page_index, sample_page_text)
else:
self.log("debug", f"Detected language: {guessed_language}")
ocr_pages = self._ocr(images, ISO639[guessed_language])
self.log("debug", "OCR completed.")
self._text = strip_excess_whitespace(" ".join(ocr_pages))
return self._text
except OCRError as e:
except Exception as e:
# Anything else is probably serious.
raise ParseError(e)
def _get_greyscale(self):
"""
Greyscale images are easier for Tesseract to OCR
"""
if not self._text:
# This may happen for files that don't have any text.
self.log(
'warning',
f"Document {self.document_path} does not have any text."
f"This is probably an error or you tried to add an image "
f"without text.")
return ""
# Convert PDF to multiple PNMs
input_file = self.document_path
return self._text
if settings.OCR_PAGES == 1:
input_file += "[0]"
elif settings.OCR_PAGES > 1:
input_file += f"[0-{settings.OCR_PAGES - 1}]"
self.log(
"debug",
f"Converting document {input_file} into greyscale images")
output_files = os.path.join(self.tempdir, "convert-%04d.pnm")
run_convert(density=settings.CONVERT_DENSITY,
depth="8",
type="grayscale",
input_file=input_file,
output_file=output_files,
logging_group=self.logging_group)
# Get a list of converted images
pnms = []
for f in os.listdir(self.tempdir):
if f.endswith(".pnm"):
pnms.append(os.path.join(self.tempdir, f))
self.log("debug", f"Running unpaper on {len(pnms)} pages...")
# Run unpaper in parallel on converted images
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
pnms = pool.map(run_unpaper, pnms)
return sorted(filter(lambda __: os.path.isfile(__), pnms))
def get_archive_path(self):
return self._archive_path
def _guess_language(self, text):
try:
@ -190,30 +136,11 @@ class RasterisedDocumentParser(DocumentParser):
self.log('warning', f"Language detection failed with: {e}")
return None
def _ocr(self, imgs, lang):
self.log(
"debug",
f"Performing OCR on {len(imgs)} page(s) with language {lang}")
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
return r
def _complete_ocr_default_language(self,
images,
sample_page_index,
sample_page):
images_copy = list(images)
del images_copy[sample_page_index]
if images_copy:
self.log('debug', "Continuing ocr with default language.")
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
ocr_pages.insert(sample_page_index, sample_page)
return ocr_pages
else:
return [sample_page]
def strip_excess_whitespace(text):
if not text:
return None
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
no_leading_whitespace = re.sub(
r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
@ -222,29 +149,14 @@ def strip_excess_whitespace(text):
return no_trailing_whitespace
def image_to_string(args):
img, lang = args
ocr = pyocr.get_available_tools()[0]
with Image.open(img) as f:
if ocr.can_detect_orientation():
try:
orientation = ocr.detect_orientation(f, lang=lang)
f = f.rotate(orientation["angle"], expand=1)
except Exception:
# Rotation not possible, ignore
pass
try:
return ocr.image_to_string(f, lang=lang)
except PyocrException as e:
raise OCRError(e)
def get_text_from_pdf(pdf_file):
with open(pdf_file, "rb") as f:
try:
pdf = pdftotext.PDF(f)
except pdftotext.Error:
return ""
return None
return "\n".join(pdf)
text = "\n".join(pdf)
return strip_excess_whitespace(text)