From 2d559d330d72800d5859d5eb8c6eff15c1ea4d0a Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:50:43 +0100 Subject: [PATCH] reworked PDF parser that uses OCRmyPDF and produces archive files. --- Pipfile | 2 +- Pipfile.lock | 292 +++++++++++++++++++++++++++-- docs/configuration.rst | 34 +++- paperless.conf.example | 3 +- src/documents/parsers.py | 17 -- src/paperless/settings.py | 8 +- src/paperless_tesseract/parsers.py | 204 ++++++-------------- 7 files changed, 374 insertions(+), 186 deletions(-) diff --git a/Pipfile b/Pipfile index ad60e0905..079037f15 100644 --- a/Pipfile +++ b/Pipfile @@ -23,7 +23,6 @@ langdetect = "*" pdftotext = "*" pathvalidate = "*" pillow = "*" -pyocr = "~=0.7.2" python-gnupg = "*" python-dotenv = "*" python-dateutil = "*" @@ -35,6 +34,7 @@ scikit-learn="~=0.23.2" whitenoise = "~=5.2.0" watchdog = "*" whoosh="~=2.7.4" +ocrmypdf = "*" [dev-packages] coveralls = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 6ecca3c34..39c35c2d9 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244" + "sha256": "cf1c008df0080c01273c032aef59bd841e4f27b66beaf3fa459665a7a7a4fcc4" }, "pipfile-spec": 6, "requires": {}, @@ -42,6 +42,94 @@ ], "version": "==1.17.11" }, + "cffi": { + "hashes": [ + "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e", + "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d", + "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a", + "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec", + "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362", + "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668", + "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c", + "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b", + 
"sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654", + "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06", + "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698", + "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2", + "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c", + "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7", + "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009", + "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03", + "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b", + "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909", + "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53", + "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35", + "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26", + "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b", + "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb", + "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293", + "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd", + "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d", + "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3", + "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5", + "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d", + "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca", + "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d", + "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775", + "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375", + "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b", 
+ "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b", + "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f" + ], + "version": "==1.14.4" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "markers": "python_version >= '3.1'", + "version": "==3.0.4" + }, + "coloredlogs": { + "hashes": [ + "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a", + "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505", + "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==14.0" + }, + "cryptography": { + "hashes": [ + "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538", + "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f", + "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d", + "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77", + "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b", + "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33", + "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e", + "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb", + "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e", + "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b", + "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7", + "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297", + "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d", + "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7", + 
"sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b", + "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7", + "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4", + "sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8", + "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b", + "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851", + "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13", + "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b", + "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3", + "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.2.1" + }, "dateparser": { "hashes": [ "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", @@ -121,6 +209,14 @@ "index": "pypi", "version": "==20.0.4" }, + "humanfriendly": { + "hashes": [ + "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12", + "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==8.2" + }, "imap-tools": { "hashes": [ "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", @@ -129,6 +225,13 @@ "index": "pypi", "version": "==0.32.0" }, + "img2pdf": { + "hashes": [ + "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5", + "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9" + ], + "version": "==0.4.0" + }, "joblib": { "hashes": [ "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72", @@ -146,6 +249,51 @@ "index": "pypi", "version": "==1.0.8" }, + "lxml": { + "hashes": [ + 
"sha256:098fb713b31050463751dcc694878e1d39f316b86366fb9fe3fbbe5396ac9fab", + "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b", + "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5", + "sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301", + "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b", + "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d", + "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b", + "sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9", + "sha256:288ddf94d9d0488187f578fdcc1868af2a6fe6714444c8259b68a83fa27b76d2", + "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b", + "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311", + "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891", + "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a", + "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1", + "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856", + "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810", + "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51", + "sha256:4e006fdb434609956a8f710ffffe650afab414dc43728786ebdbdca48e179b14", + "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360", + "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4", + "sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f", + "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230", + "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a", + "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f", + "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174", + "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf", 
+ "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd", + "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3", + "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a", + "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5", + "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367", + "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c", + "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1", + "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8", + "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f", + "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc", + "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d", + "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9", + "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==4.6.1" + }, "numpy": { "hashes": [ "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", @@ -187,6 +335,14 @@ "markers": "python_version >= '3.6'", "version": "==1.19.4" }, + "ocrmypdf": { + "hashes": [ + "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3", + "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928" + ], + "index": "pypi", + "version": "==11.3.3" + }, "pathtools": { "hashes": [ "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", @@ -202,6 +358,14 @@ "index": "pypi", "version": "==2.3.0" }, + "pdfminer.six": { + "hashes": [ + "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a", + "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509" + ], + "markers": "python_version >= '3.4'", + "version": "==20201018" + }, "pdftotext": { "hashes": [ 
"sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" @@ -209,6 +373,33 @@ "index": "pypi", "version": "==2.1.5" }, + "pikepdf": { + "hashes": [ + "sha256:0dd42f791f29e7e2ab120103605b9ddd65937c773a72d21341a56873a89e76c9", + "sha256:12a1d243143cf972ce11def50f0bd1f6e630f5e660cdeddb2c7c49db5adad40a", + "sha256:2e1713af11b71e95c2d218c10d68b6f8e813be19c8596c560f3c84617f6d5437", + "sha256:2f90acad26d9939193946eb6ca8363fd3cf44b46b5c1409468906618bccb8113", + "sha256:3c482fe30fd58ff385795605a9233f37f97fb83427c3e829b1a568a2a3b59f60", + "sha256:3ddabfc33a8a7cecba76c1685ce5125fdf239a38d0854d7c2a703490b5783773", + "sha256:61dd3f13b7416111d19bf493ce4e7281f63a1dd22c532200cbbcd65813ea43e4", + "sha256:6ce42b7780835fb52452ccaff3a3ac1b28ae1f9d80faab59c559045d9fcb211d", + "sha256:6dba75782f108ebbf3947fcb29ea0ba7da0482868e53f6602643adc36245201d", + "sha256:716427a5c0372f3cc7dc282c4b49d49d8d5182a3e937739a4c3632151e74d6a4", + "sha256:730ef4013099da7ea722a9b5659260097af6f47ddfa3c2abab4d4493de2591f3", + "sha256:73e14bba4135adfb89ae2f2163369bd788ecf23839acc8d062d832118f07e288", + "sha256:84df07acc8968051da33891af55a3ab1aa55453d83df4ce9b84d821eedc34583", + "sha256:8f739e9c660d71cd479f11f9aa110857cf0d0d9c2472f40bbcbaf02f980355a1", + "sha256:a20ca7adbb9d3da416cf5f6de0ebca53855f9a3b99acdd6ec864c61482894d71", + "sha256:bc58d9486c0959619a2584e558a54d36468c6d1165cd9fe0bfb1ecc3e6b33c6a", + "sha256:c0627930a17b3a5e1a7c9109099535259afc50fe006a05af9c3634de05abd318", + "sha256:de5f445eaaadd7dae56e1043ab8ca5eef49ece302a4e37e1fc6d21b7dcfcfb1b", + "sha256:de6aae7782db33f2cc71c9ba63b7e2ec0e0529843c065eac4e71fcbe043426e2", + "sha256:e2efd844c09f8ce3103a93bfbd54983542a0a63c88bdc0f0cdbb2997f99a147d", + "sha256:fdb481ad1219e8d667625afd2f01b26f98df079e4f66e7e49816ec20c8d8c401" + ], + "markers": "python_version < '3.9'", + "version": "==2.1.2" + }, "pillow": { "hashes": [ "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", @@ -244,6 +435,14 @@ "index": "pypi", 
"version": "==8.0.1" }, + "pluggy": { + "hashes": [ + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.13.1" + }, "psycopg2-binary": { "hashes": [ "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", @@ -287,13 +486,13 @@ "index": "pypi", "version": "==2.8.6" }, - "pyocr": { + "pycparser": { "hashes": [ - "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179", - "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301" + "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", + "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" ], - "index": "pypi", - "version": "==0.7.2" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.20" }, "python-dateutil": { "hashes": [ @@ -401,6 +600,53 @@ ], "version": "==2020.11.13" }, + "reportlab": { + "hashes": [ + "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540", + "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c", + "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030", + "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f", + "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5", + "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e", + "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89", + "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2", + "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f", + "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969", + "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53", + 
"sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d", + "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d", + "sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457", + "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec", + "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f", + "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778", + "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92", + "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6", + "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077", + "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673", + "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b", + "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48", + "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95", + "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94", + "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589", + "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9", + "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8", + "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d", + "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32", + "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b", + "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a", + "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2", + "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967", + "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5", + "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437", + "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8", 
+ "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab", + "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e", + "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a", + "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8", + "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08" + ], + "version": "==3.5.55" + }, "scikit-learn": { "hashes": [ "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", @@ -464,6 +710,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.15.0" }, + "sortedcontainers": { + "hashes": [ + "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f", + "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1" + ], + "version": "==2.3.0" + }, "sqlparse": { "hashes": [ "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", @@ -480,6 +733,14 @@ "markers": "python_version >= '3.5'", "version": "==2.1.0" }, + "tqdm": { + "hashes": [ + "sha256:3d3f1470d26642e88bd3f73353cb6ff4c51ef7d5d7efef763238f4bc1f7e4e81", + "sha256:5ff3f5232b19fa4c5531641e480b7fad4598819f708a32eb815e6ea41c5fa313" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.53.0" + }, "tzlocal": { "hashes": [ "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", @@ -489,11 +750,11 @@ }, "watchdog": { "hashes": [ - "sha256:034c85530b647486e8c8477410fe79476511282658f2ce496f97106d9e5acfb8", - "sha256:4214e1379d128b0588021880ccaf40317ee156d4603ac388b9adcf29165e0c04" + "sha256:3caefdcc8f06a57fdc5ef2d22aa7c0bfda4f55e71a0bee74cbf3176d97536ef3", + "sha256:e38bffc89b15bafe2a131f0e1c74924cf07dcec020c2e0a26cccd208831fcd43" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.10.4" }, "wcwidth": { "hashes": [ @@ -571,6 +832,7 @@ 
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" ], + "markers": "python_version >= '3.1'", "version": "==3.0.4" }, "coverage": { @@ -663,11 +925,11 @@ }, "faker": { "hashes": [ - "sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81", - "sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473" + "sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997", + "sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e" ], "markers": "python_version >= '3.5'", - "version": "==4.17.0" + "version": "==4.17.1" }, "filelock": { "hashes": [ @@ -999,11 +1261,11 @@ }, "virtualenv": { "hashes": [ - "sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2", - "sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380" + "sha256:07cff122e9d343140366055f31be4dcd61fd598c69d11cd33a9d9c8df4546dd7", + "sha256:e0aac7525e880a429764cefd3aaaff54afb5d9f25c82627563603f5d7de5a6e5" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.1.0" + "version": "==20.2.1" } } } diff --git a/docs/configuration.rst b/docs/configuration.rst index c3f01c2ca..ad1c7c117 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -218,11 +218,37 @@ PAPERLESS_OCR_LANGUAGE= Defaults to "eng". -PAPERLESS_OCR_ALWAYS= - By default Paperless does not OCR a document if the text can be retrieved from - the document directly. Set to true to always OCR documents. +PAPERLESS_OCR_MODE= + Tell paperless when and how to perform OCR on your documents. Three modes + are available: - Defaults to false. + * ``skip``: Paperless skips all pages that already contain text and will perform OCR only on pages + where no text is present. This is the safest and fastest option. 
+ * ``redo``: Paperless will OCR all pages of your documents and attempt to + replace any existing text layers with new text. 
This will be useful for + documents from scanners that already performed OCR with insufficient + results. It will also perform OCR on purely digital documents. + + This option may fail on some documents that have features that cannot + be removed, such as forms. In this case, the text from the document is + used instead. + * ``force``: Paperless rasterizes your documents, converting any text + into images and puts the OCRed text on top. This works for all documents, + however, the resulting document may be significantly larger and text + won't appear as sharp when zoomed in. + + The default is ``skip``, which only performs OCR when necessary. + +PAPERLESS_OCR_OUTPUT_TYPE= + Specify the type of PDF documents that paperless should produce. + + * ``pdf``: Modify the PDF document as little as possible. + * ``pdfa``: Convert PDF documents into PDF/A documents, which is a + subset of the entire PDF specification and meant for storing + documents long term. + + If not specified, ``pdfa`` is used. Remember that paperless also keeps + the original input file as well as the archived version. 
PAPERLESS_CONSUMER_POLLING= If paperless won't find documents added to your consume folder, it might diff --git a/paperless.conf.example b/paperless.conf.example index 4962c1567..34e560507 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -38,7 +38,8 @@ #PAPERLESS_TIME_ZONE=UTC #PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_LANGUAGE=eng -#PAPERLESS_OCR_ALWAYS=false +#PAPERLESS_OCR_OUTPUT_TYPE=pdfa +#PAPERLESS_OCR_MODE=skip #PAPERLESS_CONSUMER_POLLING=10 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false #PAPERLESS_CONVERT_MEMORY_LIMIT=0 diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 3ad60dccd..542a5dae9 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -107,23 +107,6 @@ def run_convert(input_file, raise ParseError("Convert failed at {}".format(args)) -def run_unpaper(pnm, logging_group=None): - pnm_out = pnm.replace(".pnm", ".unpaper.pnm") - - command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, - pnm_out) - - logger.debug(f"Execute: {' '.join(command_args)}", - extra={'group': logging_group}) - - if not subprocess.Popen(command_args, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL).wait() == 0: - raise ParseError(f"Unpaper failed at {command_args}") - - return pnm_out - - class ParseError(Exception): pass diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 66f9fee4b..5cede45c4 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -338,9 +338,13 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") +# OCRmyPDF --output-type options are available. +# TODO: validate this setting. +OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") -# OCR all documents? -OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") +# skip, redo, force +# TODO: validate this. 
+OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b8320a4f0..8f694ef56 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,23 +1,14 @@ -import itertools import os import re import subprocess -from multiprocessing.pool import ThreadPool import langdetect +import ocrmypdf import pdftotext -import pyocr -from PIL import Image from django.conf import settings -from pyocr import PyocrException +from ocrmypdf import InputFileError -from documents.parsers import DocumentParser, ParseError, run_unpaper, \ - run_convert -from .languages import ISO639 - - -class OCRError(Exception): - pass +from documents.parsers import DocumentParser, ParseError, run_convert class RasterisedDocumentParser(DocumentParser): @@ -29,6 +20,7 @@ class RasterisedDocumentParser(DocumentParser): def __init__(self, path, logging_group): super().__init__(path, logging_group) self._text = None + self._archive_path = None def get_thumbnail(self): """ @@ -74,113 +66,67 @@ class RasterisedDocumentParser(DocumentParser): return out_path - def _is_ocred(self): - - # Extract text from PDF using pdftotext - text = get_text_from_pdf(self.document_path) - - # We assume, that a PDF with at least 50 characters contains text - # (so no OCR required) - return len(text) > 50 - def get_text(self): - if self._text is not None: + if self._text: return self._text - if not settings.OCR_ALWAYS and self._is_ocred(): - self.log("debug", "Skipping OCR, using Text from PDF") - self._text = get_text_from_pdf(self.document_path) - return self._text + archive_path = os.path.join(self.tempdir, "archive.pdf") - images = self._get_greyscale() + ocr_args = { + 'input_file': self.document_path, + 'output_file': archive_path, + 'use_threads': True, + 'jobs': settings.THREADS_PER_WORKER, + 'language': 
settings.OCR_LANGUAGE, + 'output_type': settings.OCR_OUTPUT_TYPE, + 'progress_bar': False, + 'clean': True + } - if not images: - raise ParseError("Empty document, nothing to do.") + if settings.OCR_PAGES > 0: + ocr_args['pages'] = f"1-{settings.OCR_PAGES}" + + if settings.OCR_MODE == 'skip': + ocr_args['skip_text'] = True + elif settings.OCR_MODE == 'redo': + ocr_args['redo_ocr'] = True + elif settings.OCR_MODE == 'force': + ocr_args['force_ocr'] = True try: + ocrmypdf.ocr(**ocr_args) + # success! announce that we have an archive document + self._archive_path = archive_path + self._text = get_text_from_pdf(self._archive_path) - sample_page_index = int(len(images) / 2) - self.log( - "debug", - f"Attempting language detection on page " - f"{sample_page_index + 1} of {len(images)}...") + except InputFileError as e: + # This happens with some PDFs when used with the redo_ocr option. + # This is not the end of the world, we'll just use what we already + # have in the document. + self._text = get_text_from_pdf(self.document_path) + # Also, no archived file. 
+ if not self._text: + # However, if we don't have anything, fail: + raise ParseError(e) - sample_page_text = self._ocr([images[sample_page_index]], - settings.OCR_LANGUAGE)[0] - guessed_language = self._guess_language(sample_page_text) - - if not guessed_language or guessed_language not in ISO639: - self.log("warning", "Language detection failed.") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - elif ISO639[guessed_language] == settings.OCR_LANGUAGE: - self.log( - "debug", - f"Detected language: {guessed_language} " - f"(default language)") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501 - self.log( - "warning", - f"Detected language {guessed_language} is not available " - f"on this system.") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - else: - self.log("debug", f"Detected language: {guessed_language}") - ocr_pages = self._ocr(images, ISO639[guessed_language]) - - self.log("debug", "OCR completed.") - self._text = strip_excess_whitespace(" ".join(ocr_pages)) - return self._text - - except OCRError as e: + except Exception as e: + # Anything else is probably serious. raise ParseError(e) - def _get_greyscale(self): - """ - Greyscale images are easier for Tesseract to OCR - """ + if not self._text: + # This may happen for files that don't have any text. + self.log( + 'warning', + f"Document {self.document_path} does not have any text." 
+ f"This is probably an error or you tried to add an image " + f"without text.") + return "" - # Convert PDF to multiple PNMs - input_file = self.document_path + return self._text - if settings.OCR_PAGES == 1: - input_file += "[0]" - elif settings.OCR_PAGES > 1: - input_file += f"[0-{settings.OCR_PAGES - 1}]" - - self.log( - "debug", - f"Converting document {input_file} into greyscale images") - - output_files = os.path.join(self.tempdir, "convert-%04d.pnm") - - run_convert(density=settings.CONVERT_DENSITY, - depth="8", - type="grayscale", - input_file=input_file, - output_file=output_files, - logging_group=self.logging_group) - - # Get a list of converted images - pnms = [] - for f in os.listdir(self.tempdir): - if f.endswith(".pnm"): - pnms.append(os.path.join(self.tempdir, f)) - - self.log("debug", f"Running unpaper on {len(pnms)} pages...") - - # Run unpaper in parallel on converted images - with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: - pnms = pool.map(run_unpaper, pnms) - - return sorted(filter(lambda __: os.path.isfile(__), pnms)) + def get_archive_path(self): + return self._archive_path def _guess_language(self, text): try: @@ -190,30 +136,11 @@ class RasterisedDocumentParser(DocumentParser): self.log('warning', f"Language detection failed with: {e}") return None - def _ocr(self, imgs, lang): - self.log( - "debug", - f"Performing OCR on {len(imgs)} page(s) with language {lang}") - with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: - r = pool.map(image_to_string, itertools.product(imgs, [lang])) - return r - - def _complete_ocr_default_language(self, - images, - sample_page_index, - sample_page): - images_copy = list(images) - del images_copy[sample_page_index] - if images_copy: - self.log('debug', "Continuing ocr with default language.") - ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) - ocr_pages.insert(sample_page_index, sample_page) - return ocr_pages - else: - return [sample_page] - def 
strip_excess_whitespace(text): + if not text: + return None + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) no_leading_whitespace = re.sub( r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) @@ -222,29 +149,14 @@ def strip_excess_whitespace(text): return no_trailing_whitespace -def image_to_string(args): - img, lang = args - ocr = pyocr.get_available_tools()[0] - with Image.open(img) as f: - if ocr.can_detect_orientation(): - try: - orientation = ocr.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) - except Exception: - # Rotation not possible, ignore - pass - try: - return ocr.image_to_string(f, lang=lang) - except PyocrException as e: - raise OCRError(e) - - def get_text_from_pdf(pdf_file): with open(pdf_file, "rb") as f: try: pdf = pdftotext.PDF(f) except pdftotext.Error: - return "" + return None - return "\n".join(pdf) + text = "\n".join(pdf) + + return strip_excess_whitespace(text)