Merge branch 'feature-ocrmypdf' into dev
							
								
								
									
										13
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						| @@ -76,16 +76,11 @@ scripts/nuke | ||||
| /static/ | ||||
|  | ||||
| # Stored PDFs | ||||
| /media/documents/originals/* | ||||
| /media/documents/thumbnails/* | ||||
|  | ||||
| /data/classification_model.pickle | ||||
| /data/db.sqlite3 | ||||
| /data/index | ||||
|  | ||||
| /media/ | ||||
| /data/ | ||||
| /paperless.conf | ||||
| /consume | ||||
| /export | ||||
| /consume/ | ||||
| /export/ | ||||
| /src-ui/.vscode | ||||
|  | ||||
| # this is where the compiled frontend is moved to. | ||||
|   | ||||
| @@ -1,5 +1,8 @@ | ||||
| language: python | ||||
|  | ||||
| dist: focal | ||||
| os: linux | ||||
|  | ||||
| jobs: | ||||
|   include: | ||||
|     - name: "Paperless on Python 3.6" | ||||
| @@ -33,7 +36,7 @@ jobs: | ||||
|  | ||||
| before_install: | ||||
|   - sudo apt-get update -qq | ||||
|   - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript | ||||
|   - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript optipng | ||||
|  | ||||
| install: | ||||
|   - pip install --upgrade pipenv | ||||
|   | ||||
							
								
								
									
										2
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						| @@ -26,7 +26,6 @@ langdetect = "*" | ||||
| pdftotext = "*" | ||||
| pathvalidate = "*" | ||||
| pillow = "*" | ||||
| pyocr = "~=0.7.2" | ||||
| python-gnupg = "*" | ||||
| python-dotenv = "*" | ||||
| python-dateutil = "*" | ||||
| @@ -39,6 +38,7 @@ whitenoise = "~=5.2.0" | ||||
| watchdog = "*" | ||||
| whoosh="~=2.7.4" | ||||
| inotifyrecursive = ">=0.3.4" | ||||
| ocrmypdf = "*" | ||||
|  | ||||
| [dev-packages] | ||||
| coveralls = "*" | ||||
|   | ||||
							
								
								
									
										298
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						| @@ -1,7 +1,7 @@ | ||||
| { | ||||
|     "_meta": { | ||||
|         "hash": { | ||||
|             "sha256": "d266e1f67e3090ec68aa8ecba1e8373351daf89ad5a5ab46524d123bcaf29f62" | ||||
|             "sha256": "55c9136777e78d6cd362628cd1fc0c5ff36b437699b92089ce504d598004371d" | ||||
|         }, | ||||
|         "pipfile-spec": 6, | ||||
|         "requires": { | ||||
| @@ -44,6 +44,94 @@ | ||||
|             ], | ||||
|             "version": "==1.17.12" | ||||
|         }, | ||||
|         "cffi": { | ||||
|             "hashes": [ | ||||
|                 "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e", | ||||
|                 "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d", | ||||
|                 "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a", | ||||
|                 "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec", | ||||
|                 "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362", | ||||
|                 "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668", | ||||
|                 "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c", | ||||
|                 "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b", | ||||
|                 "sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654", | ||||
|                 "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06", | ||||
|                 "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698", | ||||
|                 "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2", | ||||
|                 "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c", | ||||
|                 "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7", | ||||
|                 "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009", | ||||
|                 "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03", | ||||
|                 "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b", | ||||
|                 "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909", | ||||
|                 "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53", | ||||
|                 "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35", | ||||
|                 "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26", | ||||
|                 "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b", | ||||
|                 "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb", | ||||
|                 "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293", | ||||
|                 "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd", | ||||
|                 "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d", | ||||
|                 "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3", | ||||
|                 "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5", | ||||
|                 "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d", | ||||
|                 "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca", | ||||
|                 "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d", | ||||
|                 "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775", | ||||
|                 "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375", | ||||
|                 "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b", | ||||
|                 "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b", | ||||
|                 "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f" | ||||
|             ], | ||||
|             "version": "==1.14.4" | ||||
|         }, | ||||
|         "chardet": { | ||||
|             "hashes": [ | ||||
|                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", | ||||
|                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" | ||||
|             ], | ||||
|             "markers": "python_version >= '3.1'", | ||||
|             "version": "==3.0.4" | ||||
|         }, | ||||
|         "coloredlogs": { | ||||
|             "hashes": [ | ||||
|                 "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a", | ||||
|                 "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505", | ||||
|                 "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97" | ||||
|             ], | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||
|             "version": "==14.0" | ||||
|         }, | ||||
|         "cryptography": { | ||||
|             "hashes": [ | ||||
|                 "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538", | ||||
|                 "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f", | ||||
|                 "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d", | ||||
|                 "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77", | ||||
|                 "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b", | ||||
|                 "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33", | ||||
|                 "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e", | ||||
|                 "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb", | ||||
|                 "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e", | ||||
|                 "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b", | ||||
|                 "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7", | ||||
|                 "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297", | ||||
|                 "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d", | ||||
|                 "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7", | ||||
|                 "sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b", | ||||
|                 "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7", | ||||
|                 "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4", | ||||
|                 "sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8", | ||||
|                 "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b", | ||||
|                 "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851", | ||||
|                 "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13", | ||||
|                 "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b", | ||||
|                 "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3", | ||||
|                 "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df" | ||||
|             ], | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||
|             "version": "==3.2.1" | ||||
|         }, | ||||
|         "dateparser": { | ||||
|             "hashes": [ | ||||
|                 "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", | ||||
| @@ -123,6 +211,14 @@ | ||||
|             "index": "pypi", | ||||
|             "version": "==20.0.4" | ||||
|         }, | ||||
|         "humanfriendly": { | ||||
|             "hashes": [ | ||||
|                 "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12", | ||||
|                 "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080" | ||||
|             ], | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||
|             "version": "==8.2" | ||||
|         }, | ||||
|         "imap-tools": { | ||||
|             "hashes": [ | ||||
|                 "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", | ||||
| @@ -131,6 +227,13 @@ | ||||
|             "index": "pypi", | ||||
|             "version": "==0.32.0" | ||||
|         }, | ||||
|         "img2pdf": { | ||||
|             "hashes": [ | ||||
|                 "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5", | ||||
|                 "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9" | ||||
|             ], | ||||
|             "version": "==0.4.0" | ||||
|         }, | ||||
|         "inotify-simple": { | ||||
|             "hashes": [ | ||||
|                 "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128", | ||||
| @@ -164,6 +267,51 @@ | ||||
|             "index": "pypi", | ||||
|             "version": "==1.0.8" | ||||
|         }, | ||||
|         "lxml": { | ||||
|             "hashes": [ | ||||
|                 "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", | ||||
|                 "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", | ||||
|                 "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", | ||||
|                 "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", | ||||
|                 "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", | ||||
|                 "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", | ||||
|                 "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", | ||||
|                 "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", | ||||
|                 "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", | ||||
|                 "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", | ||||
|                 "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", | ||||
|                 "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", | ||||
|                 "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", | ||||
|                 "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", | ||||
|                 "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", | ||||
|                 "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", | ||||
|                 "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", | ||||
|                 "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", | ||||
|                 "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", | ||||
|                 "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", | ||||
|                 "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", | ||||
|                 "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", | ||||
|                 "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", | ||||
|                 "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", | ||||
|                 "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", | ||||
|                 "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", | ||||
|                 "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", | ||||
|                 "sha256:91d6dace31b07ab47eeadd3f4384ded2f77b94b30446410cb2c3e660e047f7a7", | ||||
|                 "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", | ||||
|                 "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", | ||||
|                 "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", | ||||
|                 "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", | ||||
|                 "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", | ||||
|                 "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", | ||||
|                 "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", | ||||
|                 "sha256:e1dbb88a937126ab14d219a000728224702e0ec0fc7ceb7131c53606b7a76772", | ||||
|                 "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", | ||||
|                 "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", | ||||
|                 "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" | ||||
|             ], | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||
|             "version": "==4.6.2" | ||||
|         }, | ||||
|         "numpy": { | ||||
|             "hashes": [ | ||||
|                 "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", | ||||
| @@ -205,6 +353,14 @@ | ||||
|             "markers": "python_version >= '3.6'", | ||||
|             "version": "==1.19.4" | ||||
|         }, | ||||
|         "ocrmypdf": { | ||||
|             "hashes": [ | ||||
|                 "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3", | ||||
|                 "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928" | ||||
|             ], | ||||
|             "index": "pypi", | ||||
|             "version": "==11.3.3" | ||||
|         }, | ||||
|         "pathtools": { | ||||
|             "hashes": [ | ||||
|                 "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", | ||||
| @@ -220,6 +376,14 @@ | ||||
|             "index": "pypi", | ||||
|             "version": "==2.3.0" | ||||
|         }, | ||||
|         "pdfminer.six": { | ||||
|             "hashes": [ | ||||
|                 "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a", | ||||
|                 "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509" | ||||
|             ], | ||||
|             "markers": "python_version >= '3.4'", | ||||
|             "version": "==20201018" | ||||
|         }, | ||||
|         "pdftotext": { | ||||
|             "hashes": [ | ||||
|                 "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" | ||||
| @@ -227,6 +391,33 @@ | ||||
|             "index": "pypi", | ||||
|             "version": "==2.1.5" | ||||
|         }, | ||||
|         "pikepdf": { | ||||
|             "hashes": [ | ||||
|                 "sha256:0829bd5dacd73bb4a37e7575bae523f49603479755563c92ddb55c206700cab1", | ||||
|                 "sha256:0d2b631077cd6af6e4d1b396208020705842610a6f13fab489d5f9c47916baa2", | ||||
|                 "sha256:21c98af08fae4ac9fbcad02b613b6768a4ca300fda4cba867f4a4b6f73c2d04b", | ||||
|                 "sha256:2240372fed30124ddc35b0c15a613f2b687a426ea2f150091e0a0c58cca7a495", | ||||
|                 "sha256:2a97f5f1403e058d217d7f6861cf51fca200c5687bce0d052f5f2fa89b5bfa22", | ||||
|                 "sha256:3faaefca0ae80d19891acec8b0dd5e6235f59f2206d82375eb80d090285e9557", | ||||
|                 "sha256:48ef45b64882901c0d69af3b85d16a19bd0f3e95b43e614fefb53521d8caf36c", | ||||
|                 "sha256:5212fe41f2323fc7356ba67caa39737fe13080562cff37bcbb74a8094076c8d0", | ||||
|                 "sha256:56859c32170663c57bd0658189ce44e180533eebe813853446cd6413810be9eb", | ||||
|                 "sha256:5f8fd1cb3478c5534222018aca24fbbd2bc74460c899bda988ec76722c13caa9", | ||||
|                 "sha256:74300a32c41b3d578772f6933f23a88b19f74484185e71e5225ce2f7ea5aea78", | ||||
|                 "sha256:8cbc946bdd217148f4a9c029fcea62f4ae0f67d5346de4c865f4718cd0ddc37f", | ||||
|                 "sha256:9ceefd30076f732530cf84a1be2ecb2fa9931af932706ded760a6d37c73b96ad", | ||||
|                 "sha256:ad69c170fda41b07a4c6b668a3128e7a759f50d9aebcfcde0ccff1358abe0423", | ||||
|                 "sha256:b715fe182189fb6870fab5b0383bb2fb278c88c46eade346b0f4c1ed8818c09d", | ||||
|                 "sha256:bb01ecf95083ffcb9ad542dc5342ccc1059e46f1395fd966629d36d9cc766b4a", | ||||
|                 "sha256:bd6328547219cf48cefb4e0a1bc54442910594de1c5a5feae847d9ff3c629031", | ||||
|                 "sha256:edb128379bb1dea76b5bdbdacf5657a6e4754bacc2049640762725590d8ed905", | ||||
|                 "sha256:f8e687900557fcd4c51b4e72b9e337fdae9e2c81049d1d80b624bb2e88b5769d", | ||||
|                 "sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52", | ||||
|                 "sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef" | ||||
|             ], | ||||
|             "markers": "python_version < '3.9'", | ||||
|             "version": "==2.2.0" | ||||
|         }, | ||||
|         "pillow": { | ||||
|             "hashes": [ | ||||
|                 "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", | ||||
| @@ -262,6 +453,14 @@ | ||||
|             "index": "pypi", | ||||
|             "version": "==8.0.1" | ||||
|         }, | ||||
|         "pluggy": { | ||||
|             "hashes": [ | ||||
|                 "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", | ||||
|                 "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" | ||||
|             ], | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||
|             "version": "==0.13.1" | ||||
|         }, | ||||
|         "psycopg2-binary": { | ||||
|             "hashes": [ | ||||
|                 "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", | ||||
| @@ -305,13 +504,13 @@ | ||||
|             "index": "pypi", | ||||
|             "version": "==2.8.6" | ||||
|         }, | ||||
|         "pyocr": { | ||||
|         "pycparser": { | ||||
|             "hashes": [ | ||||
|                 "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179", | ||||
|                 "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301" | ||||
|                 "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", | ||||
|                 "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" | ||||
|             ], | ||||
|             "index": "pypi", | ||||
|             "version": "==0.7.2" | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||
|             "version": "==2.20" | ||||
|         }, | ||||
|         "python-dateutil": { | ||||
|             "hashes": [ | ||||
| @@ -419,6 +618,53 @@ | ||||
|             ], | ||||
|             "version": "==2020.11.13" | ||||
|         }, | ||||
|         "reportlab": { | ||||
|             "hashes": [ | ||||
|                 "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540", | ||||
|                 "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c", | ||||
|                 "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030", | ||||
|                 "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f", | ||||
|                 "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5", | ||||
|                 "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e", | ||||
|                 "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89", | ||||
|                 "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2", | ||||
|                 "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f", | ||||
|                 "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969", | ||||
|                 "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53", | ||||
|                 "sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d", | ||||
|                 "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d", | ||||
|                 "sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457", | ||||
|                 "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec", | ||||
|                 "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f", | ||||
|                 "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778", | ||||
|                 "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92", | ||||
|                 "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6", | ||||
|                 "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077", | ||||
|                 "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673", | ||||
|                 "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b", | ||||
|                 "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48", | ||||
|                 "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95", | ||||
|                 "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94", | ||||
|                 "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589", | ||||
|                 "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9", | ||||
|                 "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8", | ||||
|                 "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d", | ||||
|                 "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32", | ||||
|                 "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b", | ||||
|                 "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a", | ||||
|                 "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2", | ||||
|                 "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967", | ||||
|                 "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5", | ||||
|                 "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437", | ||||
|                 "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8", | ||||
|                 "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab", | ||||
|                 "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e", | ||||
|                 "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a", | ||||
|                 "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8", | ||||
|                 "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08" | ||||
|             ], | ||||
|             "version": "==3.5.55" | ||||
|         }, | ||||
|         "scikit-learn": { | ||||
|             "hashes": [ | ||||
|                 "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", | ||||
| @@ -482,6 +728,13 @@ | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||
|             "version": "==1.15.0" | ||||
|         }, | ||||
|         "sortedcontainers": { | ||||
|             "hashes": [ | ||||
|                 "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f", | ||||
|                 "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1" | ||||
|             ], | ||||
|             "version": "==2.3.0" | ||||
|         }, | ||||
|         "sqlparse": { | ||||
|             "hashes": [ | ||||
|                 "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", | ||||
| @@ -498,6 +751,14 @@ | ||||
|             "markers": "python_version >= '3.5'", | ||||
|             "version": "==2.1.0" | ||||
|         }, | ||||
|         "tqdm": { | ||||
|             "hashes": [ | ||||
|                 "sha256:5c0d04e06ccc0da1bd3fa5ae4550effcce42fcad947b4a6cafa77bdc9b09ff22", | ||||
|                 "sha256:9e7b8ab0ecbdbf0595adadd5f0ebbb9e69010e0bd48bbb0c15e550bf2a5292df" | ||||
|             ], | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||
|             "version": "==4.54.0" | ||||
|         }, | ||||
|         "tzlocal": { | ||||
|             "hashes": [ | ||||
|                 "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", | ||||
| @@ -589,6 +850,7 @@ | ||||
|                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", | ||||
|                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" | ||||
|             ], | ||||
|             "markers": "python_version >= '3.1'", | ||||
|             "version": "==3.0.4" | ||||
|         }, | ||||
|         "coverage": { | ||||
| @@ -711,22 +973,6 @@ | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||
|             "version": "==1.2.0" | ||||
|         }, | ||||
|         "importlib-metadata": { | ||||
|             "hashes": [ | ||||
|                 "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132", | ||||
|                 "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c" | ||||
|             ], | ||||
|             "markers": "python_version < '3.8'", | ||||
|             "version": "==2.1.0" | ||||
|         }, | ||||
|         "importlib-resources": { | ||||
|             "hashes": [ | ||||
|                 "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592", | ||||
|                 "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5" | ||||
|             ], | ||||
|             "markers": "python_version < '3.7'", | ||||
|             "version": "==3.3.0" | ||||
|         }, | ||||
|         "iniconfig": { | ||||
|             "hashes": [ | ||||
|                 "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", | ||||
| @@ -1038,14 +1284,6 @@ | ||||
|             ], | ||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||
|             "version": "==20.2.1" | ||||
|         }, | ||||
|         "zipp": { | ||||
|             "hashes": [ | ||||
|                 "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108", | ||||
|                 "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb" | ||||
|             ], | ||||
|             "markers": "python_version < '3.8'", | ||||
|             "version": "==3.4.0" | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -152,6 +152,117 @@ PAPERLESS_AUTO_LOGIN_USERNAME=<username> | ||||
|  | ||||
|     Defaults to none, which disables this feature. | ||||
|  | ||||
| OCR settings | ||||
| ############ | ||||
|  | ||||
| Paperless uses `OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/>`_ for | ||||
| performing OCR on documents and images. Paperless uses sensible defaults for | ||||
| most settings, but all of them can be configured to your needs. | ||||
|  | ||||
|  | ||||
| PAPERLESS_OCR_LANGUAGE=<lang> | ||||
|     Customize the language that paperless will attempt to use when | ||||
|     parsing documents. | ||||
|  | ||||
|     It should be a 3-letter language code consistent with ISO | ||||
|     639: https://www.loc.gov/standards/iso639-2/php/code_list.php | ||||
|  | ||||
|     Set this to the language most of your documents are written in. | ||||
|  | ||||
|     This can be a combination of multiple languages such as ``deu+eng``, | ||||
|     in which case tesseract will use whatever language matches best. | ||||
|     Keep in mind that tesseract uses much more cpu time with multiple | ||||
|     languages enabled. | ||||
|  | ||||
|     Defaults to "eng". | ||||
|  | ||||
| PAPERLESS_OCR_MODE=<mode> | ||||
|     Tell paperless when and how to perform OCR on your documents. Four modes | ||||
|     are available: | ||||
|  | ||||
|     *   ``skip``: Paperless skips all pages and will perform ocr only on pages | ||||
|         where no text is present. This is the safest and fastest option. | ||||
|     *   ``skip_noarchive``: In addition to skip, paperless won't create an | ||||
|         archived version of your documents when it finds any text in them. | ||||
|     *   ``redo``: Paperless will OCR all pages of your documents and attempt to | ||||
|         replace any existing text layers with new text. This will be useful for | ||||
|         documents from scanners that already performed OCR with insufficient | ||||
|         results. It will also perform OCR on purely digital documents. | ||||
|  | ||||
|         This option may fail on some documents that have features that cannot | ||||
|         be removed, such as forms. In this case, the text from the document is | ||||
|         used instead. | ||||
|     *   ``force``: Paperless rasterizes your documents, converting any text | ||||
|         into images and puts the OCRed text on top. This works for all documents, | ||||
|         however, the resulting document may be significantly larger and text | ||||
|         won't appear as sharp when zoomed in. | ||||
|      | ||||
|     The default is ``skip``, which only performs OCR when necessary. | ||||
|  | ||||
| PAPERLESS_OCR_OUTPUT_TYPE=<type> | ||||
|     Specify the type of PDF documents that paperless should produce. | ||||
|      | ||||
|     *   ``pdf``: Modify the PDF document as little as possible. | ||||
|     *   ``pdfa``: Convert PDF documents into PDF/A-2b documents, which is a | ||||
|         subset of the entire PDF specification and meant for storing | ||||
|         documents long term. | ||||
|     *   ``pdfa-1``, ``pdfa-2``, ``pdfa-3`` to specify the exact version of | ||||
|         PDF/A you wish to use. | ||||
|      | ||||
|     If not specified, ``pdfa`` is used. Remember that paperless also keeps | ||||
|     the original input file as well as the archived version. | ||||
|  | ||||
|  | ||||
| PAPERLESS_OCR_PAGES=<num> | ||||
|     Tells paperless to use only the specified number of pages for OCR. Documents | ||||
|     with fewer than the specified number of pages get OCR'ed completely. | ||||
|  | ||||
|     Specifying 1 here will only use the first page. | ||||
|  | ||||
|     When combined with ``PAPERLESS_OCR_MODE=redo`` or ``PAPERLESS_OCR_MODE=force``, | ||||
|     paperless will not modify any text it finds on excluded pages and copy it | ||||
|     verbatim. | ||||
|  | ||||
|     Defaults to 0, which disables this feature and always uses all pages. | ||||
|  | ||||
|  | ||||
| PAPERLESS_OCR_IMAGE_DPI=<num> | ||||
|     Paperless will OCR any images you put into the system and convert them | ||||
|     into PDF documents. This is useful if your scanner produces images. | ||||
|     In order to do so, paperless needs to know the DPI of the image. | ||||
|     Most images from scanners will have this information embedded and | ||||
|     paperless will detect and use that information. In case this fails, it | ||||
|     uses this value as a fallback. | ||||
|  | ||||
|     Set this to the DPI your scanner produces images at. | ||||
|  | ||||
|     Default is none, which causes paperless to fail if no DPI information is | ||||
|     present in an image. | ||||
|  | ||||
|  | ||||
| PAPERLESS_OCR_USER_ARG=<json> | ||||
|     OCRmyPDF offers many more options. Use this parameter to specify any | ||||
|     additional arguments you wish to pass to OCRmyPDF. Since Paperless uses | ||||
|     the API of OCRmyPDF, you have to specify these in a format that can be | ||||
|     passed to the API. See the `OCRmyPDF API reference <https://ocrmypdf.readthedocs.io/en/latest/api.html#reference>`_ | ||||
|     for valid parameters. All command line options are supported, but they | ||||
|     use underscores instead of dashes. | ||||
|  | ||||
|     .. caution:: | ||||
|  | ||||
|         Paperless has been tested to work with the OCR options provided | ||||
|         above. There are many options that are incompatible with each other, | ||||
|         so specifying invalid options may prevent paperless from consuming | ||||
|         any documents. | ||||
|  | ||||
|     Specify arguments as a JSON dictionary. Keep note of lower case booleans | ||||
|     and double quoted parameter names and strings. Examples: | ||||
|  | ||||
|     .. code:: json | ||||
|  | ||||
|         {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}     | ||||
|      | ||||
|      | ||||
| Software tweaks | ||||
| ############### | ||||
|  | ||||
| @@ -193,37 +304,6 @@ PAPERLESS_TIME_ZONE=<timezone> | ||||
|     Defaults to UTC. | ||||
|  | ||||
|  | ||||
|  | ||||
| PAPERLESS_OCR_PAGES=<num> | ||||
|     Tells paperless to use only the specified amount of pages for OCR. Documents | ||||
|     with less than the specified amount of pages get OCR'ed completely. | ||||
|  | ||||
|     Specifying 1 here will only use the first page. | ||||
|  | ||||
|     Defaults to 0, which disables this feature and always uses all pages. | ||||
|  | ||||
|  | ||||
|  | ||||
| PAPERLESS_OCR_LANGUAGE=<lang> | ||||
|     Customize the default language that tesseract will attempt to use when | ||||
|     parsing documents. The default language is used whenever | ||||
|  | ||||
|     * No language could be detected on a document | ||||
|     * No tesseract data files are available for the detected language | ||||
|  | ||||
|     It should be a 3-letter language code consistent with ISO | ||||
|     639: https://www.loc.gov/standards/iso639-2/php/code_list.php | ||||
|  | ||||
|     Set this to the language most of your documents are written in. | ||||
|  | ||||
|     Defaults to "eng". | ||||
|  | ||||
| PAPERLESS_OCR_ALWAYS=<bool> | ||||
|     By default Paperless does not OCR a document if the text can be retrieved from | ||||
|     the document directly. Set to true to always OCR documents. | ||||
|  | ||||
|     Defaults to false. | ||||
|  | ||||
| PAPERLESS_CONSUMER_POLLING=<num> | ||||
|     If paperless won't find documents added to your consume folder, it might | ||||
|     not be able to automatically detect filesystem changes. In that case, | ||||
| @@ -261,18 +341,6 @@ PAPERLESS_CONVERT_TMPDIR=<path> | ||||
|  | ||||
|     Default is none, which disables the temporary directory. | ||||
|  | ||||
| PAPERLESS_CONVERT_DENSITY=<num> | ||||
|     This setting has a high impact on the physical size of tmp page files, | ||||
|     the speed of document conversion, and can affect the accuracy of OCR | ||||
|     results. Individual results can vary and this setting should be tested | ||||
|     thoroughly against the documents you are importing to see if it has any | ||||
|     impacts either negative or positive. | ||||
|     Testing on limited document sets has shown a setting of 200 can cut the | ||||
|     size of tmp files by 1/3, and speed up conversion by up to 4x | ||||
|     with little impact to OCR accuracy. | ||||
|  | ||||
|     Default is 300. | ||||
|  | ||||
| PAPERLESS_OPTIMIZE_THUMBNAILS=<bool> | ||||
|     Use optipng to optimize thumbnails. This usually reduces the size of | ||||
|     thumbnails by about 20%, but uses considerable compute time during | ||||
| @@ -319,8 +387,5 @@ PAPERLESS_CONVERT_BINARY=<path> | ||||
| PAPERLESS_GS_BINARY=<path> | ||||
|     Defaults to "/usr/bin/gs". | ||||
|  | ||||
| PAPERLESS_UNPAPER_BINARY=<path> | ||||
|     Defaults to "/usr/bin/unpaper". | ||||
|  | ||||
| PAPERLESS_OPTIPNG_BINARY=<path> | ||||
|     Defaults to "/usr/bin/optipng". | ||||
|   | ||||
| @@ -31,19 +31,24 @@ | ||||
| #PAPERLESS_STATIC_URL=/static/ | ||||
| #PAPERLESS_AUTO_LOGIN_USERNAME= | ||||
|  | ||||
| # OCR settings | ||||
|  | ||||
| #PAPERLESS_OCR_LANGUAGE=eng | ||||
| #PAPERLESS_OCR_MODE=skip | ||||
| #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | ||||
| #PAPERLESS_OCR_PAGES=1 | ||||
| #PAPERLESS_OCR_IMAGE_DPI=300 | ||||
| #PAPERLESS_OCR_USER_ARG={} | ||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||
| #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | ||||
|  | ||||
| # Software tweaks | ||||
|  | ||||
| #PAPERLESS_TASK_WORKERS=1 | ||||
| #PAPERLESS_THREADS_PER_WORKER=1 | ||||
| #PAPERLESS_TIME_ZONE=UTC | ||||
| #PAPERLESS_OCR_PAGES=1 | ||||
| #PAPERLESS_OCR_LANGUAGE=eng | ||||
| #PAPERLESS_OCR_ALWAYS=false | ||||
| #PAPERLESS_CONSUMER_POLLING=10 | ||||
| #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | ||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||
| #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | ||||
| #PAPERLESS_CONVERT_DENSITY=300 | ||||
| #PAPERLESS_OPTIMIZE_THUMBNAILS=true | ||||
| #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | ||||
| #PAPERLESS_FILENAME_DATE_ORDER=YMD | ||||
| @@ -53,5 +58,4 @@ | ||||
|  | ||||
| #PAPERLESS_CONVERT_BINARY=/usr/bin/convert | ||||
| #PAPERLESS_GS_BINARY=/usr/bin/gs | ||||
| #PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper | ||||
| #PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng | ||||
|   | ||||
| @@ -5,12 +5,26 @@ | ||||
|         </svg> | ||||
|         <span class="d-none d-lg-inline"> Delete</span> | ||||
|     </button> | ||||
|     <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary mr-2"> | ||||
|  | ||||
|     <div class="btn-group mr-2"> | ||||
|  | ||||
|         <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary"> | ||||
|             <svg class="buttonicon" fill="currentColor"> | ||||
|                 <use xlink:href="assets/bootstrap-icons.svg#download" /> | ||||
|             </svg> | ||||
|             <span class="d-none d-lg-inline"> Download</span> | ||||
|         </a> | ||||
|      | ||||
|         <div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version"> | ||||
|           <button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button> | ||||
|           <div class="dropdown-menu" ngbDropdownMenu> | ||||
|             <a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a> | ||||
|           </div> | ||||
|         </div> | ||||
|      | ||||
|       </div> | ||||
|  | ||||
|  | ||||
|     <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()"> | ||||
|         <svg class="buttonicon" fill="currentColor"> | ||||
|             <use xlink:href="assets/bootstrap-icons.svg#x" /> | ||||
|   | ||||
| @@ -4,6 +4,7 @@ import { ActivatedRoute, Router } from '@angular/router'; | ||||
| import { NgbModal } from '@ng-bootstrap/ng-bootstrap'; | ||||
| import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent'; | ||||
| import { PaperlessDocument } from 'src/app/data/paperless-document'; | ||||
| import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata'; | ||||
| import { PaperlessDocumentType } from 'src/app/data/paperless-document-type'; | ||||
| import { DocumentListViewService } from 'src/app/services/document-list-view.service'; | ||||
| import { OpenDocumentsService } from 'src/app/services/open-documents.service'; | ||||
| @@ -23,9 +24,11 @@ export class DocumentDetailComponent implements OnInit { | ||||
|  | ||||
|   documentId: number | ||||
|   document: PaperlessDocument | ||||
|   metadata: PaperlessDocumentMetadata | ||||
|   title: string | ||||
|   previewUrl: string | ||||
|   downloadUrl: string | ||||
|   downloadOriginalUrl: string | ||||
|  | ||||
|   correspondents: PaperlessCorrespondent[] | ||||
|   documentTypes: PaperlessDocumentType[] | ||||
| @@ -62,6 +65,7 @@ export class DocumentDetailComponent implements OnInit { | ||||
|       this.documentId = +paramMap.get('id') | ||||
|       this.previewUrl = this.documentsService.getPreviewUrl(this.documentId) | ||||
|       this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId) | ||||
|       this.downloadOriginalUrl = this.documentsService.getDownloadUrl(this.documentId, true) | ||||
|       if (this.openDocumentService.getOpenDocument(this.documentId)) { | ||||
|         this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId)) | ||||
|       } else { | ||||
| @@ -76,6 +80,9 @@ export class DocumentDetailComponent implements OnInit { | ||||
|  | ||||
|   updateComponent(doc: PaperlessDocument) { | ||||
|     this.document = doc | ||||
|     this.documentsService.getMetadata(doc.id).subscribe(result => { | ||||
|       this.metadata = result | ||||
|     }) | ||||
|     this.title = doc.title | ||||
|     this.documentForm.patchValue(doc) | ||||
|   } | ||||
|   | ||||
							
								
								
									
										11
									
								
								src-ui/src/app/data/paperless-document-metadata.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,11 @@ | ||||
| export interface PaperlessDocumentMetadata { | ||||
|      | ||||
|   paperless__checksum?: string | ||||
|  | ||||
|   paperless__mime_type?: string | ||||
|  | ||||
|   paperless__filename?: string | ||||
|  | ||||
|   paperless__has_archive_version?: boolean | ||||
|  | ||||
| } | ||||
| @@ -1,5 +1,6 @@ | ||||
| import { Injectable } from '@angular/core'; | ||||
| import { PaperlessDocument } from 'src/app/data/paperless-document'; | ||||
| import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata'; | ||||
| import { AbstractPaperlessService } from './abstract-paperless-service'; | ||||
| import { HttpClient } from '@angular/common/http'; | ||||
| import { Observable } from 'rxjs'; | ||||
| @@ -50,20 +51,32 @@ export class DocumentService extends AbstractPaperlessService<PaperlessDocument> | ||||
|     return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)) | ||||
|   } | ||||
|  | ||||
|   getPreviewUrl(id: number): string { | ||||
|     return this.getResourceUrl(id, 'preview') | ||||
|   getPreviewUrl(id: number, original: boolean = false): string { | ||||
|     let url = this.getResourceUrl(id, 'preview') | ||||
|     if (original) { | ||||
|       url += "?original=true" | ||||
|     } | ||||
|     return url | ||||
|   } | ||||
|  | ||||
|   getThumbUrl(id: number): string { | ||||
|     return this.getResourceUrl(id, 'thumb') | ||||
|   } | ||||
|  | ||||
|   getDownloadUrl(id: number): string { | ||||
|     return this.getResourceUrl(id, 'download') | ||||
|   getDownloadUrl(id: number, original: boolean = false): string { | ||||
|     let url = this.getResourceUrl(id, 'download') | ||||
|     if (original) { | ||||
|       url += "?original=true" | ||||
|     } | ||||
|     return url | ||||
|   } | ||||
|  | ||||
|   uploadDocument(formData) { | ||||
|     return this.http.post(this.getResourceUrl(null, 'post_document'), formData) | ||||
|   } | ||||
|  | ||||
|   getMetadata(id: number): Observable<PaperlessDocumentMetadata> { | ||||
|     return this.http.get<PaperlessDocumentMetadata>(this.getResourceUrl(id, 'metadata')) | ||||
|   } | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -6,6 +6,7 @@ import os | ||||
| import magic | ||||
| from django.conf import settings | ||||
| from django.db import transaction | ||||
| from django.db.models import Q | ||||
| from django.utils import timezone | ||||
|  | ||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||
| @@ -13,7 +14,7 @@ from .file_handling import create_source_path_directory | ||||
| from .loggers import LoggingMixin | ||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||
| from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||
|     get_supported_file_extensions | ||||
|     get_supported_file_extensions, parse_date | ||||
| from .signals import ( | ||||
|     document_consumption_finished, | ||||
|     document_consumption_started | ||||
| @@ -58,7 +59,7 @@ class Consumer(LoggingMixin): | ||||
|     def pre_check_duplicate(self): | ||||
|         with open(self.path, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|         if Document.objects.filter(checksum=checksum).exists(): | ||||
|         if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501 | ||||
|             if settings.CONSUMER_DELETE_DUPLICATES: | ||||
|                 os.unlink(self.path) | ||||
|             raise ConsumerError( | ||||
| @@ -69,6 +70,7 @@ class Consumer(LoggingMixin): | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||
|  | ||||
|     def try_consume_file(self, | ||||
|                          path, | ||||
| @@ -124,7 +126,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         # This doesn't parse the document yet, but gives us a parser. | ||||
|  | ||||
|         document_parser = parser_class(self.path, self.logging_group) | ||||
|         document_parser = parser_class(self.logging_group) | ||||
|  | ||||
|         # However, this already created working directories which we have to | ||||
|         # clean up. | ||||
| @@ -132,13 +134,24 @@ class Consumer(LoggingMixin): | ||||
|         # Parse the document. This may take some time. | ||||
|  | ||||
|         try: | ||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||
|             thumbnail = document_parser.get_optimised_thumbnail() | ||||
|             self.log("debug", "Parsing {}...".format(self.filename)) | ||||
|             document_parser.parse(self.path, mime_type) | ||||
|  | ||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||
|             thumbnail = document_parser.get_optimised_thumbnail( | ||||
|                 self.path, mime_type) | ||||
|  | ||||
|             text = document_parser.get_text() | ||||
|             date = document_parser.get_date() | ||||
|             if not date: | ||||
|                 date = parse_date(self.filename, text) | ||||
|             archive_path = document_parser.get_archive_path() | ||||
|  | ||||
|         except ParseError as e: | ||||
|             document_parser.cleanup() | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"Error while consuming document {self.filename}: {e}") | ||||
|             raise ConsumerError(e) | ||||
|  | ||||
|         # Prepare the document classifier. | ||||
| @@ -180,9 +193,24 @@ class Consumer(LoggingMixin): | ||||
|                 # After everything is in the database, copy the files into | ||||
|                 # place. If this fails, we'll also rollback the transaction. | ||||
|  | ||||
|                 # TODO: not required, since this is done by the file handling | ||||
|                 #  logic | ||||
|                 create_source_path_directory(document.source_path) | ||||
|                 self._write(document, self.path, document.source_path) | ||||
|                 self._write(document, thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             self.path, document.source_path) | ||||
|  | ||||
|                 self._write(document.storage_type, | ||||
|                             thumbnail, document.thumbnail_path) | ||||
|  | ||||
|                 if archive_path and os.path.isfile(archive_path): | ||||
|                     self._write(document.storage_type, | ||||
|                                 archive_path, document.archive_path) | ||||
|  | ||||
|                     with open(archive_path, 'rb') as f: | ||||
|                         document.archive_checksum = hashlib.md5( | ||||
|                             f.read()).hexdigest() | ||||
|                         document.save() | ||||
|  | ||||
|                 # After performing all database operations and moving files | ||||
|                 # into place, tell paperless where the file is. | ||||
| @@ -195,6 +223,11 @@ class Consumer(LoggingMixin): | ||||
|                 self.log("debug", "Deleting file {}".format(self.path)) | ||||
|                 os.unlink(self.path) | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 "error", | ||||
|                 f"The following error occurred while consuming " | ||||
|                 f"{self.filename}: {e}" | ||||
|             ) | ||||
|             raise ConsumerError(e) | ||||
|         finally: | ||||
|             document_parser.cleanup() | ||||
| @@ -259,7 +292,7 @@ class Consumer(LoggingMixin): | ||||
|             for tag_id in self.override_tag_ids: | ||||
|                 document.tags.add(Tag.objects.get(pk=tag_id)) | ||||
|  | ||||
|     def _write(self, document, source, target): | ||||
|     def _write(self, storage_type, source, target): | ||||
|         with open(source, "rb") as read_file: | ||||
|             with open(target, "wb") as write_file: | ||||
|                 write_file.write(read_file.read()) | ||||
|   | ||||
| @@ -10,10 +10,13 @@ def create_source_path_directory(source_path): | ||||
|     os.makedirs(os.path.dirname(source_path), exist_ok=True) | ||||
|  | ||||
|  | ||||
| def delete_empty_directories(directory): | ||||
| def delete_empty_directories(directory, root): | ||||
|     if not os.path.isdir(directory): | ||||
|         return | ||||
|  | ||||
|     # Go up in the directory hierarchy and try to delete all directories | ||||
|     directory = os.path.normpath(directory) | ||||
|     root = os.path.normpath(settings.ORIGINALS_DIR) | ||||
|     root = os.path.normpath(root) | ||||
|  | ||||
|     if not directory.startswith(root + os.path.sep): | ||||
|         # don't do anything outside our originals folder. | ||||
| @@ -101,3 +104,8 @@ def generate_filename(doc): | ||||
|         filename += ".gpg" | ||||
|  | ||||
|     return filename | ||||
|  | ||||
|  | ||||
| def archive_name_from_filename(filename): | ||||
|  | ||||
|     return os.path.splitext(filename)[0] + ".pdf" | ||||
|   | ||||
							
								
								
									
										89
									
								
								src/documents/management/commands/document_archiver.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,89 @@ | ||||
| import hashlib | ||||
| import multiprocessing | ||||
|  | ||||
| import logging | ||||
| import os | ||||
| import shutil | ||||
| import uuid | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.core.management.base import BaseCommand | ||||
| from whoosh.writing import AsyncWriter | ||||
|  | ||||
| from documents.models import Document | ||||
| from ... import index | ||||
| from ...mixins import Renderable | ||||
| from ...parsers import get_parser_class_for_mime_type | ||||
|  | ||||
|  | ||||
| def handle_document(document): | ||||
|     mime_type = document.mime_type | ||||
|  | ||||
|     parser_class = get_parser_class_for_mime_type(mime_type) | ||||
|  | ||||
|     parser = parser_class(logging_group=uuid.uuid4()) | ||||
|     parser.parse(document.source_path, mime_type) | ||||
|     if parser.get_archive_path(): | ||||
|         shutil.copy(parser.get_archive_path(), document.archive_path) | ||||
|         with document.archive_file as f: | ||||
|             document.archive_checksum = hashlib.md5(f.read()).hexdigest() | ||||
|     else: | ||||
|         logging.getLogger(__name__).warning( | ||||
|             f"Parser {parser} did not produce an archived document " | ||||
|             f"for {document.file_name}" | ||||
|         ) | ||||
|  | ||||
|     if parser.get_text(): | ||||
|         document.content = parser.get_text() | ||||
|     document.save() | ||||
|  | ||||
|     parser.cleanup() | ||||
|  | ||||
|  | ||||
| class Command(Renderable, BaseCommand): | ||||
|  | ||||
|     help = """ | ||||
|         Using the available document parsers, generates an archived PDF | ||||
|         version for every document that does not have one yet. With -f, | ||||
|         existing archived versions are regenerated as well. | ||||
|     """.replace("    ", "") | ||||
|  | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         self.verbosity = 0 | ||||
|         BaseCommand.__init__(self, *args, **kwargs) | ||||
|  | ||||
|     def add_arguments(self, parser): | ||||
|         parser.add_argument( | ||||
|             "-f", "--overwrite", | ||||
|             default=False, | ||||
|             action="store_true", | ||||
|             help="Recreates the archived document for documents that already " | ||||
|                  "have an archived version." | ||||
|         ) | ||||
|  | ||||
|     def handle(self, *args, **options): | ||||
|  | ||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||
|  | ||||
|         overwrite = options["overwrite"] | ||||
|  | ||||
|         documents = Document.objects.all() | ||||
|  | ||||
|         documents_to_process = filter( | ||||
|             lambda d: overwrite or not os.path.exists(d.archive_path), | ||||
|             documents | ||||
|         ) | ||||
|  | ||||
|         with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: | ||||
|             list( | ||||
|                 pool.imap( | ||||
|                     handle_document, | ||||
|                     list(documents_to_process) | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|         ix = index.open_index() | ||||
|         with AsyncWriter(ix) as writer: | ||||
|             for d in documents_to_process: | ||||
|                 index.update_document(writer, d) | ||||
| @@ -7,7 +7,8 @@ from django.core import serializers | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
|  | ||||
| from documents.models import Document, Correspondent, Tag, DocumentType | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||
|     EXPORTER_ARCHIVE_NAME | ||||
| from paperless.db import GnuPG | ||||
| from ...mixins import Renderable | ||||
|  | ||||
| @@ -54,7 +55,6 @@ class Command(Renderable, BaseCommand): | ||||
|             document = document_map[document_dict["pk"]] | ||||
|  | ||||
|             unique_filename = f"{document.pk:07}_{document.file_name}" | ||||
|  | ||||
|             file_target = os.path.join(self.target, unique_filename) | ||||
|  | ||||
|             thumbnail_name = unique_filename + "-thumbnail.png" | ||||
| @@ -63,6 +63,14 @@ class Command(Renderable, BaseCommand): | ||||
|             document_dict[EXPORTER_FILE_NAME] = unique_filename | ||||
|             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name | ||||
|  | ||||
|             if os.path.exists(document.archive_path): | ||||
|                 archive_name = \ | ||||
|                     f"{document.pk:07}_archive_{document.archive_file_name}" | ||||
|                 archive_target = os.path.join(self.target, archive_name) | ||||
|                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name | ||||
|             else: | ||||
|                 archive_target = None | ||||
|  | ||||
|             print(f"Exporting: {file_target}") | ||||
|  | ||||
|             t = int(time.mktime(document.created.timetuple())) | ||||
| @@ -76,11 +84,18 @@ class Command(Renderable, BaseCommand): | ||||
|                     f.write(GnuPG.decrypted(document.thumbnail_file)) | ||||
|                     os.utime(thumbnail_target, times=(t, t)) | ||||
|  | ||||
|                 if archive_target: | ||||
|                     with open(archive_target, "wb") as f: | ||||
|                         f.write(GnuPG.decrypted(document.archive_path)) | ||||
|                         os.utime(archive_target, times=(t, t)) | ||||
|             else: | ||||
|  | ||||
|                 shutil.copy(document.source_path, file_target) | ||||
|                 shutil.copy(document.thumbnail_path, thumbnail_target) | ||||
|  | ||||
|                 if archive_target: | ||||
|                     shutil.copy(document.archive_path, archive_target) | ||||
|  | ||||
|         manifest += json.loads( | ||||
|             serializers.serialize("json", Correspondent.objects.all())) | ||||
|  | ||||
|   | ||||
| @@ -7,8 +7,8 @@ from django.core.management import call_command | ||||
| from django.core.management.base import BaseCommand, CommandError | ||||
|  | ||||
| from documents.models import Document | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | ||||
| from paperless.db import GnuPG | ||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||
|     EXPORTER_ARCHIVE_NAME | ||||
| from ...file_handling import generate_filename, create_source_path_directory | ||||
| from ...mixins import Renderable | ||||
|  | ||||
| @@ -79,23 +79,41 @@ class Command(Renderable, BaseCommand): | ||||
|                     'appear to be in the source directory.'.format(doc_file) | ||||
|                 ) | ||||
|  | ||||
|             if EXPORTER_ARCHIVE_NAME in record: | ||||
|                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||
|                 if not os.path.exists(os.path.join(self.source, archive_file)): | ||||
|                     raise CommandError( | ||||
|                         f"The manifest file refers to {archive_file} which " | ||||
|                         f"does not appear to be in the source directory." | ||||
|                     ) | ||||
|  | ||||
|     def _import_files_from_manifest(self): | ||||
|  | ||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||
|         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||
|  | ||||
|         for record in self.manifest: | ||||
|  | ||||
|             if not record["model"] == "documents.document": | ||||
|                 continue | ||||
|  | ||||
|             doc_file = record[EXPORTER_FILE_NAME] | ||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||
|             document = Document.objects.get(pk=record["pk"]) | ||||
|  | ||||
|             doc_file = record[EXPORTER_FILE_NAME] | ||||
|             document_path = os.path.join(self.source, doc_file) | ||||
|  | ||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||
|             thumbnail_path = os.path.join(self.source, thumb_file) | ||||
|  | ||||
|             document.storage_type = storage_type | ||||
|             if EXPORTER_ARCHIVE_NAME in record: | ||||
|                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||
|                 archive_path = os.path.join(self.source, archive_file) | ||||
|             else: | ||||
|                 archive_path = None | ||||
|  | ||||
|             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|             document.filename = generate_filename(document) | ||||
|  | ||||
|             if os.path.isfile(document.source_path): | ||||
| @@ -106,5 +124,7 @@ class Command(Renderable, BaseCommand): | ||||
|             print(f"Moving {document_path} to {document.source_path}") | ||||
|             shutil.copy(document_path, document.source_path) | ||||
|             shutil.copy(thumbnail_path, document.thumbnail_path) | ||||
|             if archive_path: | ||||
|                 shutil.copy(archive_path, document.archive_path) | ||||
|  | ||||
|             document.save() | ||||
|   | ||||
							
								
								
									
										23
									
								
								src/documents/migrations/1005_checksums.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,23 @@ | ||||
| # Generated by Django 3.1.3 on 2020-11-29 00:48 | ||||
|  | ||||
| from django.db import migrations, models | ||||
|  | ||||
|  | ||||
| class Migration(migrations.Migration): | ||||
|  | ||||
|     dependencies = [ | ||||
|         ('documents', '1004_sanity_check_schedule'), | ||||
|     ] | ||||
|  | ||||
|     operations = [ | ||||
|         migrations.AddField( | ||||
|             model_name='document', | ||||
|             name='archive_checksum', | ||||
|             field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True), | ||||
|         ), | ||||
|         migrations.AlterField( | ||||
|             model_name='document', | ||||
|             name='checksum', | ||||
|             field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True), | ||||
|         ), | ||||
|     ] | ||||
| @@ -11,6 +11,7 @@ from django.db import models | ||||
| from django.utils import timezone | ||||
| from django.utils.text import slugify | ||||
|  | ||||
| from documents.file_handling import archive_name_from_filename | ||||
| from documents.parsers import get_default_file_extension | ||||
|  | ||||
|  | ||||
| @@ -158,9 +159,15 @@ class Document(models.Model): | ||||
|         max_length=32, | ||||
|         editable=False, | ||||
|         unique=True, | ||||
|         help_text="The checksum of the original document (before it was " | ||||
|                   "encrypted).  We use this to prevent duplicate document " | ||||
|                   "imports." | ||||
|         help_text="The checksum of the original document." | ||||
|     ) | ||||
|  | ||||
|     archive_checksum = models.CharField( | ||||
|         max_length=32, | ||||
|         editable=False, | ||||
|         blank=True, | ||||
|         null=True, | ||||
|         help_text="The checksum of the archived document." | ||||
|     ) | ||||
|  | ||||
|     created = models.DateTimeField( | ||||
| @@ -225,10 +232,30 @@ class Document(models.Model): | ||||
|     def source_file(self): | ||||
|         return open(self.source_path, "rb") | ||||
|  | ||||
|     @property | ||||
|     def archive_path(self): | ||||
|         if self.filename: | ||||
|             fname = archive_name_from_filename(self.filename) | ||||
|         else: | ||||
|             fname = "{:07}.pdf".format(self.pk) | ||||
|  | ||||
|         return os.path.join( | ||||
|             settings.ARCHIVE_DIR, | ||||
|             fname | ||||
|         ) | ||||
|  | ||||
|     @property | ||||
|     def archive_file(self): | ||||
|         return open(self.archive_path, "rb") | ||||
|  | ||||
|     @property | ||||
|     def file_name(self): | ||||
|         return slugify(str(self)) + self.file_type | ||||
|  | ||||
|     @property | ||||
|     def archive_file_name(self): | ||||
|         return slugify(str(self)) + ".pdf" | ||||
|  | ||||
|     @property | ||||
|     def file_type(self): | ||||
|         return get_default_file_extension(self.mime_type) | ||||
|   | ||||
| @@ -131,73 +131,7 @@ def run_convert(input_file, | ||||
|         raise ParseError("Convert failed at {}".format(args)) | ||||
|  | ||||
|  | ||||
| def run_unpaper(pnm, logging_group=None): | ||||
|     pnm_out = pnm.replace(".pnm", ".unpaper.pnm") | ||||
|  | ||||
|     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, | ||||
|                     pnm_out) | ||||
|  | ||||
|     logger.debug(f"Execute: {' '.join(command_args)}", | ||||
|                  extra={'group': logging_group}) | ||||
|  | ||||
|     if not subprocess.Popen(command_args, | ||||
|                             stdout=subprocess.DEVNULL, | ||||
|                             stderr=subprocess.DEVNULL).wait() == 0: | ||||
|         raise ParseError(f"Unpaper failed at {command_args}") | ||||
|  | ||||
|     return pnm_out | ||||
|  | ||||
|  | ||||
| class ParseError(Exception): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class DocumentParser(LoggingMixin): | ||||
|     """ | ||||
|     Subclass this to make your own parser.  Have a look at | ||||
|     `paperless_tesseract.parsers` for inspiration. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group): | ||||
|         super().__init__() | ||||
|         self.logging_group = logging_group | ||||
|         self.document_path = path | ||||
|         self.tempdir = tempfile.mkdtemp( | ||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|         """ | ||||
|         Returns the path to a file we can use as a thumbnail for this document. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def optimise_thumbnail(self, in_path): | ||||
|  | ||||
|         if settings.OPTIMIZE_THUMBNAILS: | ||||
|             out_path = os.path.join(self.tempdir, "optipng.png") | ||||
|  | ||||
|             args = (settings.OPTIPNG_BINARY, | ||||
|                     "-silent", "-o5", in_path, "-out", out_path) | ||||
|  | ||||
|             self.log('debug', f"Execute: {' '.join(args)}") | ||||
|  | ||||
|             if not subprocess.Popen(args).wait() == 0: | ||||
|                 raise ParseError("Optipng failed at {}".format(args)) | ||||
|  | ||||
|             return out_path | ||||
|         else: | ||||
|             return in_path | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|         return self.optimise_thumbnail(self.get_thumbnail()) | ||||
|  | ||||
|     def get_text(self): | ||||
|         """ | ||||
|         Returns the text from the document and only the text. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def get_date(self): | ||||
| def parse_date(filename, text): | ||||
|     """ | ||||
|     Returns the date of the document. | ||||
|     """ | ||||
| @@ -217,15 +151,12 @@ class DocumentParser(LoggingMixin): | ||||
|         ) | ||||
|  | ||||
|     date = None | ||||
|         date_string = None | ||||
|  | ||||
|     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||
|         title = os.path.basename(self.document_path) | ||||
|  | ||||
|     # if filename date parsing is enabled, search there first: | ||||
|     if settings.FILENAME_DATE_ORDER: | ||||
|             self.log("info", "Checking document title for date") | ||||
|             for m in re.finditer(DATE_REGEX, title): | ||||
|         for m in re.finditer(DATE_REGEX, filename): | ||||
|             date_string = m.group(0) | ||||
|  | ||||
|             try: | ||||
| @@ -235,21 +166,8 @@ class DocumentParser(LoggingMixin): | ||||
|                 continue | ||||
|  | ||||
|             if date is not None and next_year > date.year > 1900: | ||||
|                     self.log( | ||||
|                         "info", | ||||
|                         "Detected document date {} based on string {} " | ||||
|                         "from document title" | ||||
|                         "".format(date.isoformat(), date_string) | ||||
|                     ) | ||||
|                 return date | ||||
|  | ||||
|         try: | ||||
|             # getting text after checking filename will save time if only | ||||
|             # looking at the filename instead of the whole text | ||||
|             text = self.get_text() | ||||
|         except ParseError: | ||||
|             return None | ||||
|  | ||||
|     # Iterate through all regex matches in text and try to parse the date | ||||
|     for m in re.finditer(DATE_REGEX, text): | ||||
|         date_string = m.group(0) | ||||
| @@ -265,19 +183,64 @@ class DocumentParser(LoggingMixin): | ||||
|         else: | ||||
|             date = None | ||||
|  | ||||
|         if date is not None: | ||||
|             self.log( | ||||
|                 "info", | ||||
|                 "Detected document date {} based on string {}".format( | ||||
|                     date.isoformat(), | ||||
|                     date_string | ||||
|                 ) | ||||
|             ) | ||||
|         else: | ||||
|             self.log("info", "Unable to detect date for document") | ||||
|  | ||||
|     return date | ||||
|  | ||||
|  | ||||
| class ParseError(Exception): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class DocumentParser(LoggingMixin): | ||||
|     """ | ||||
|     Subclass this to make your own parser.  Have a look at | ||||
|     `paperless_tesseract.parsers` for inspiration. | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, logging_group): | ||||
|         super().__init__() | ||||
|         self.logging_group = logging_group | ||||
|         self.tempdir = tempfile.mkdtemp( | ||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||
|  | ||||
|         self.archive_path = None | ||||
|         self.text = None | ||||
|         self.date = None | ||||
|  | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def get_archive_path(self): | ||||
|         return self.archive_path | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         Returns the path to a file we can use as a thumbnail for this document. | ||||
|         """ | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         thumbnail = self.get_thumbnail(document_path, mime_type) | ||||
|         if settings.OPTIMIZE_THUMBNAILS: | ||||
|             out_path = os.path.join(self.tempdir, "thumb_optipng.png") | ||||
|  | ||||
|             args = (settings.OPTIPNG_BINARY, | ||||
|                     "-silent", "-o5", thumbnail, "-out", out_path) | ||||
|  | ||||
|             self.log('debug', f"Execute: {' '.join(args)}") | ||||
|  | ||||
|             if not subprocess.Popen(args).wait() == 0: | ||||
|                 raise ParseError("Optipng failed at {}".format(args)) | ||||
|  | ||||
|             return out_path | ||||
|         else: | ||||
|             return thumbnail | ||||
|  | ||||
|     def get_text(self): | ||||
|         return self.text | ||||
|  | ||||
|     def get_date(self): | ||||
|         return self.date | ||||
|  | ||||
|     def cleanup(self): | ||||
|         self.log("debug", "Deleting directory {}".format(self.tempdir)) | ||||
|         shutil.rmtree(self.tempdir) | ||||
|   | ||||
| @@ -67,20 +67,35 @@ def check_sanity(): | ||||
|                 f"Original of document {doc.pk} does not exist.")) | ||||
|         else: | ||||
|             present_files.remove(os.path.normpath(doc.source_path)) | ||||
|             checksum = None | ||||
|             try: | ||||
|                 with doc.source_file as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|             except OSError as e: | ||||
|                 messages.append(SanityError( | ||||
|                     f"Cannot read original file of document {doc.pk}: {e}")) | ||||
|  | ||||
|             if checksum and not checksum == doc.checksum: | ||||
|             else: | ||||
|                 if not checksum == doc.checksum: | ||||
|                     messages.append(SanityError( | ||||
|                         f"Checksum mismatch of document {doc.pk}. " | ||||
|                         f"Stored: {doc.checksum}, actual: {checksum}." | ||||
|                     )) | ||||
|  | ||||
|         if os.path.isfile(doc.archive_path): | ||||
|             present_files.remove(os.path.normpath(doc.archive_path)) | ||||
|             try: | ||||
|                 with doc.archive_file as f: | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|             except OSError as e: | ||||
|                 messages.append(SanityError( | ||||
|                     f"Cannot read archive file of document {doc.pk}: {e}" | ||||
|                 )) | ||||
|             else: | ||||
|                 if not checksum == doc.archive_checksum: | ||||
|                     messages.append(SanityError( | ||||
|                         f"Checksum mismatch of archive {doc.pk}. " | ||||
|                         f"Stored: {doc.archive_checksum}, actual: {checksum}." | ||||
|                     )) | ||||
|  | ||||
|         if not doc.content: | ||||
|             messages.append(SanityWarning( | ||||
|                 f"Document {doc.pk} has no content." | ||||
|   | ||||
| @@ -2,3 +2,4 @@ | ||||
| # for exporting/importing commands | ||||
| EXPORTER_FILE_NAME = "__exported_file_name__" | ||||
| EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__" | ||||
| EXPORTER_ARCHIVE_NAME = "__exported_archive_name__" | ||||
|   | ||||
| @@ -13,7 +13,7 @@ from rest_framework.reverse import reverse | ||||
|  | ||||
| from .. import index, matching | ||||
| from ..file_handling import delete_empty_directories, generate_filename, \ | ||||
|     create_source_path_directory | ||||
|     create_source_path_directory, archive_name_from_filename | ||||
| from ..models import Document, Tag | ||||
|  | ||||
|  | ||||
| @@ -169,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs): | ||||
|  | ||||
| @receiver(models.signals.post_delete, sender=Document) | ||||
| def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||
|     for f in (instance.source_path, instance.thumbnail_path): | ||||
|     for f in (instance.source_path, | ||||
|               instance.archive_path, | ||||
|               instance.thumbnail_path): | ||||
|         if os.path.isfile(f): | ||||
|             try: | ||||
|                 os.unlink(f) | ||||
|         except FileNotFoundError: | ||||
|             pass  # The file's already gone, so we're cool with it. | ||||
|                 logging.getLogger(__name__).debug( | ||||
|                     f"Deleted file {f}.") | ||||
|             except OSError as e: | ||||
|                 logging.getLogger(__name__).warning( | ||||
|                     f"While deleting document {instance.file_name}, the file " | ||||
|                     f"{f} could not be deleted: {e}" | ||||
|                 ) | ||||
|  | ||||
|     delete_empty_directories(os.path.dirname(instance.source_path)) | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.source_path), | ||||
|         root=settings.ORIGINALS_DIR | ||||
|     ) | ||||
|  | ||||
|     delete_empty_directories( | ||||
|         os.path.dirname(instance.archive_path), | ||||
|         root=settings.ARCHIVE_DIR | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def validate_move(instance, old_path, new_path): | ||||
|     if not os.path.isfile(old_path): | ||||
|         # Can't do anything if the old file does not exist anymore. | ||||
|         logging.getLogger(__name__).fatal( | ||||
|             f"Document {str(instance)}: File {old_path} has gone.") | ||||
|         return False | ||||
|  | ||||
|     if os.path.isfile(new_path): | ||||
|         # Can't do anything if the new file already exists. Skip updating file. | ||||
|         logging.getLogger(__name__).warning( | ||||
|             f"Document {str(instance)}: Cannot rename file " | ||||
|             f"since target path {new_path} already exists.") | ||||
|         return False | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| @receiver(models.signals.m2m_changed, sender=Document.tags.through) | ||||
| @@ -183,55 +216,90 @@ def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||
| def update_filename_and_move_files(sender, instance, **kwargs): | ||||
|  | ||||
|     if not instance.filename: | ||||
|         # Can't update the filename if there is not filename to begin with | ||||
|         # This happens after the consumer creates a new document. | ||||
|         # The PK needs to be set first by saving the document once. When this | ||||
|         # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be | ||||
|         # renamed anyway. In all other cases, instance.filename will be set. | ||||
|         # Can't update the filename if there is no filename to begin with | ||||
|         # This happens when the consumer creates a new document. | ||||
|         # The document is modified and saved multiple times, and only after | ||||
|         # everything is done (i.e., the generated filename is final), | ||||
|         # filename will be set to the location where the consumer has put | ||||
|         # the file. | ||||
|         # | ||||
|         # This will in turn cause this logic to move the file where it belongs. | ||||
|         return | ||||
|  | ||||
|     old_filename = instance.filename | ||||
|     old_path = instance.source_path | ||||
|     new_filename = generate_filename(instance) | ||||
|  | ||||
|     if new_filename == instance.filename: | ||||
|         # Don't do anything if its the same. | ||||
|         return | ||||
|  | ||||
|     new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|     old_source_path = instance.source_path | ||||
|     new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||
|  | ||||
|     if not os.path.isfile(old_path): | ||||
|         # Can't do anything if the old file does not exist anymore. | ||||
|         logging.getLogger(__name__).fatal( | ||||
|             f"Document {str(instance)}: File {old_path} has gone.") | ||||
|     if not validate_move(instance, old_source_path, new_source_path): | ||||
|         return | ||||
|  | ||||
|     if os.path.isfile(new_path): | ||||
|         # Can't do anything if the new file already exists. Skip updating file. | ||||
|         logging.getLogger(__name__).warning( | ||||
|             f"Document {str(instance)}: Cannot rename file " | ||||
|             f"since target path {new_path} already exists.") | ||||
|     # archive files are optional, archive checksum tells us if we have one, | ||||
|     # since this is None for documents without archived files. | ||||
|     if instance.archive_checksum: | ||||
|         new_archive_filename = archive_name_from_filename(new_filename) | ||||
|         old_archive_path = instance.archive_path | ||||
|         new_archive_path = os.path.join(settings.ARCHIVE_DIR, | ||||
|                                         new_archive_filename) | ||||
|  | ||||
|         if not validate_move(instance, old_archive_path, new_archive_path): | ||||
|             return | ||||
|  | ||||
|     create_source_path_directory(new_path) | ||||
|         create_source_path_directory(new_archive_path) | ||||
|     else: | ||||
|         old_archive_path = None | ||||
|         new_archive_path = None | ||||
|  | ||||
|     create_source_path_directory(new_source_path) | ||||
|  | ||||
|     try: | ||||
|         os.rename(old_path, new_path) | ||||
|         os.rename(old_source_path, new_source_path) | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(old_archive_path, new_archive_path) | ||||
|         instance.filename = new_filename | ||||
|         # Don't save here to prevent infinite recursion. | ||||
|         Document.objects.filter(pk=instance.pk).update(filename=new_filename) | ||||
|  | ||||
|         logging.getLogger(__name__).debug( | ||||
|             f"Moved file {old_path} to {new_path}.") | ||||
|             f"Moved file {old_source_path} to {new_source_path}.") | ||||
|  | ||||
|         logging.getLogger(__name__).debug( | ||||
|             f"Moved file {old_archive_path} to {new_archive_path}.") | ||||
|  | ||||
|     except OSError as e: | ||||
|         instance.filename = old_filename | ||||
|         # this happens when we can't move a file. If that's the case for the | ||||
|         # archive file, we try our best to revert the changes. | ||||
|         try: | ||||
|             os.rename(new_source_path, old_source_path) | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         except: | ||||
|             # This is fine, since: | ||||
|             # A: if we managed to move source from A to B, we will also manage | ||||
|             #  to move it from B to A. If not, we have a serious issue | ||||
|             #  that's going to get caught by the sanity checker. | ||||
|             #  all files remain in place and will never be overwritten, | ||||
|             #  so this is not the end of the world. | ||||
|             # B: if moving the original file failed, nothing has changed anyway. | ||||
|             pass | ||||
|     except DatabaseError as e: | ||||
|         os.rename(new_path, old_path) | ||||
|         os.rename(new_source_path, old_source_path) | ||||
|         if instance.archive_checksum: | ||||
|             os.rename(new_archive_path, old_archive_path) | ||||
|         instance.filename = old_filename | ||||
|  | ||||
|     if not os.path.isfile(old_path): | ||||
|         delete_empty_directories(os.path.dirname(old_path)) | ||||
|     if not os.path.isfile(old_source_path): | ||||
|         delete_empty_directories(os.path.dirname(old_source_path), | ||||
|                                  root=settings.ORIGINALS_DIR) | ||||
|  | ||||
|     if old_archive_path and not os.path.isfile(old_archive_path): | ||||
|         delete_empty_directories(os.path.dirname(old_archive_path), | ||||
|                                  root=settings.ARCHIVE_DIR) | ||||
|  | ||||
|  | ||||
| def set_log_entry(sender, document=None, logging_group=None, **kwargs): | ||||
|   | ||||
| @@ -12,7 +12,9 @@ from documents.sanity_checker import SanityFailedError | ||||
|  | ||||
|  | ||||
| def index_optimize(): | ||||
|     index.open_index().optimize() | ||||
|     ix = index.open_index() | ||||
|     writer = AsyncWriter(ix) | ||||
|     writer.commit(optimize=True) | ||||
|  | ||||
|  | ||||
| def index_reindex(): | ||||
|   | ||||
| Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/archive/0000001.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -100,6 +100,44 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_thumbnail) | ||||
|  | ||||
|     def test_download_with_archive(self): | ||||
|  | ||||
|         _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir) | ||||
|  | ||||
|         content = b"This is a test" | ||||
|         content_archive = b"This is the same test but archived" | ||||
|  | ||||
|         with open(filename, "wb") as f: | ||||
|             f.write(content) | ||||
|  | ||||
|         filename = os.path.basename(filename) | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=filename, | ||||
|                                       mime_type="application/pdf") | ||||
|  | ||||
|         with open(doc.archive_path, "wb") as f: | ||||
|             f.write(content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/preview/'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content_archive) | ||||
|  | ||||
|         response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk)) | ||||
|  | ||||
|         self.assertEqual(response.status_code, 200) | ||||
|         self.assertEqual(response.content, content) | ||||
|  | ||||
|     def test_document_actions_not_existing_file(self): | ||||
|  | ||||
|         doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") | ||||
|   | ||||
| @@ -1,5 +1,6 @@ | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| import tempfile | ||||
| from unittest import mock | ||||
| from unittest.mock import MagicMock | ||||
| @@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase): | ||||
|  | ||||
| class DummyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(DummyParser, self).__init__(path, logging_group) | ||||
|     def __init__(self, logging_group, scratch_dir, archive_path): | ||||
|         super(DummyParser, self).__init__(logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|         self.archive_path = archive_path | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|         return "The Text" | ||||
|     def parse(self, document_path, mime_type): | ||||
|         self.text = "The Text" | ||||
|  | ||||
|  | ||||
| class FaultyParser(DocumentParser): | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         # not important during tests | ||||
|         raise NotImplementedError() | ||||
|  | ||||
|     def __init__(self, path, logging_group, scratch_dir): | ||||
|         super(FaultyParser, self).__init__(path, logging_group) | ||||
|     def __init__(self, logging_group, scratch_dir): | ||||
|         super(FaultyParser, self).__init__(logging_group) | ||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||
|  | ||||
|     def get_optimised_thumbnail(self): | ||||
|     def get_optimised_thumbnail(self, document_path, mime_type): | ||||
|         return self.fake_thumb | ||||
|  | ||||
|     def get_text(self): | ||||
|     def parse(self, document_path, mime_type): | ||||
|         raise ParseError("Does not compute.") | ||||
|  | ||||
|  | ||||
| @@ -410,11 +412,11 @@ def fake_magic_from_file(file, mime=False): | ||||
| @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | ||||
| class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_dummy_parser(self, path, logging_group): | ||||
|         return DummyParser(path, logging_group, self.dirs.scratch_dir) | ||||
|     def make_dummy_parser(self, logging_group): | ||||
|         return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file()) | ||||
|  | ||||
|     def make_faulty_parser(self, path, logging_group): | ||||
|         return FaultyParser(path, logging_group, self.dirs.scratch_dir) | ||||
|     def make_faulty_parser(self, logging_group): | ||||
|         return FaultyParser(logging_group, self.dirs.scratch_dir) | ||||
|  | ||||
|     def setUp(self): | ||||
|         super(TestConsumer, self).setUp() | ||||
| @@ -432,8 +434,16 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|         self.consumer = Consumer() | ||||
|  | ||||
|     def get_test_file(self): | ||||
|         fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir) | ||||
|         return f | ||||
|         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf") | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|         return dst | ||||
|  | ||||
|     def get_test_archive_file(self): | ||||
|         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf") | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|         return dst | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||
|     def testNormalOperation(self): | ||||
| @@ -455,6 +465,13 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|             document.thumbnail_path | ||||
|         )) | ||||
|  | ||||
|         self.assertTrue(os.path.isfile( | ||||
|             document.archive_path | ||||
|         )) | ||||
|  | ||||
|         self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1") | ||||
|         self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(filename)) | ||||
|  | ||||
|     def testOverrideFilename(self): | ||||
| @@ -502,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates(self): | ||||
|     def testDuplicates1(self): | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         try: | ||||
| @@ -513,6 +530,21 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates2(self): | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|         try: | ||||
|             self.consumer.try_consume_file(self.get_test_archive_file()) | ||||
|         except ConsumerError as e: | ||||
|             self.assertTrue(str(e).endswith("It is a duplicate.")) | ||||
|             return | ||||
|  | ||||
|         self.fail("Should throw exception") | ||||
|  | ||||
|     def testDuplicates3(self): | ||||
|         self.consumer.try_consume_file(self.get_test_archive_file()) | ||||
|         self.consumer.try_consume_file(self.get_test_file()) | ||||
|  | ||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||
|     def testNoParsers(self, m): | ||||
|         m.return_value = [] | ||||
|   | ||||
							
								
								
									
										140
									
								
								src/documents/tests/test_date_parsing.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,140 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from unittest import mock | ||||
| from uuid import uuid4 | ||||
|  | ||||
| from dateutil import tz | ||||
| from django.conf import settings | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.parsers import parse_date | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples") | ||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|  | ||||
|     def setUp(self): | ||||
|         os.makedirs(self.SCRATCH, exist_ok=True) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.SCRATCH) | ||||
|  | ||||
|     def test_date_format_1(self): | ||||
|         text = "lorem ipsum 130218 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_2(self): | ||||
|         text = "lorem ipsum 2018 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_3(self): | ||||
|         text = "lorem ipsum 20180213 lorem ipsum" | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_4(self): | ||||
|         text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_5(self): | ||||
|         text = ( | ||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||
|             "ipsum" | ||||
|         ) | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_6(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual(parse_date("", text), None) | ||||
|  | ||||
|     def test_date_format_7(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "März 2019\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2019, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_date_format_8(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum\n" | ||||
|             "März 2020" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             parse_date("", text), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_9(self): | ||||
|         text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "27. Nullmonth 2020\n" | ||||
|             "März 2020\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             parse_date("", text), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     def test_crazy_date_past(self, *args): | ||||
|         self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) | ||||
|  | ||||
|     def test_crazy_date_future(self, *args): | ||||
|         self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) | ||||
|  | ||||
|     def test_crazy_date_with_spaces(self, *args): | ||||
|         self.assertIsNone(parse_date("", "20 408000l 2475")) | ||||
|  | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) | ||||
| @@ -1,12 +1,29 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..models import Document, Correspondent | ||||
|  | ||||
|  | ||||
| class TestDocument(TestCase): | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|         self.originals_dir = tempfile.mkdtemp() | ||||
|         self.thumb_dir = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings( | ||||
|             ORIGINALS_DIR=self.originals_dir, | ||||
|             THUMBNAIL_DIR=self.thumb_dir, | ||||
|         ).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.originals_dir) | ||||
|         shutil.rmtree(self.thumb_dir) | ||||
|  | ||||
|     def test_file_deletion(self): | ||||
|         document = Document.objects.create( | ||||
|             correspondent=Correspondent.objects.create(name="Test0"), | ||||
| @@ -19,6 +36,9 @@ class TestDocument(TestCase): | ||||
|         file_path = document.source_path | ||||
|         thumb_path = document.thumbnail_path | ||||
|  | ||||
|         Path(file_path).touch() | ||||
|         Path(thumb_path).touch() | ||||
|  | ||||
|         with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: | ||||
|             document.delete() | ||||
|             mock_unlink.assert_any_call(file_path) | ||||
|   | ||||
| @@ -2,32 +2,17 @@ import os | ||||
| import shutil | ||||
| from pathlib import Path | ||||
| from unittest import mock | ||||
| from uuid import uuid4 | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.db import DatabaseError | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from .utils import DirectoriesMixin | ||||
| from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | ||||
| from ..models import Document, Correspondent | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|     deletion_list = [] | ||||
|  | ||||
|     def add_to_deletion_list(self, dirname): | ||||
|         self.deletion_list.append(dirname) | ||||
|  | ||||
|     def setUp(self): | ||||
|         folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|         os.makedirs(folder + "/documents/originals") | ||||
|         override_settings(MEDIA_ROOT=folder).enable() | ||||
|         override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable() | ||||
|         self.add_to_deletion_list(folder) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         for dirname in self.deletion_list: | ||||
|             shutil.rmtree(dirname, ignore_errors=True) | ||||
| class TestFileHandling(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="") | ||||
|     def test_generate_source_filename(self): | ||||
| @@ -104,7 +89,7 @@ class TestDate(TestCase): | ||||
|         document.save() | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|  | ||||
|         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) | ||||
| @@ -140,7 +125,7 @@ class TestDate(TestCase): | ||||
|  | ||||
|             # Check proper handling of files | ||||
|             self.assertTrue(os.path.isfile(document.source_path)) | ||||
|             self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|             self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||
|             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||
| @@ -196,8 +181,8 @@ class TestDate(TestCase): | ||||
|         document.save() | ||||
|  | ||||
|         # Check proper handling of files | ||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True) | ||||
|         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True) | ||||
|         self.assertTrue(os.path.isfile(important_file)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") | ||||
| @@ -315,13 +300,12 @@ class TestDate(TestCase): | ||||
|         # Create our working directory | ||||
|         tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") | ||||
|         os.makedirs(tmp) | ||||
|         self.add_to_deletion_list(tmp) | ||||
|  | ||||
|         os.makedirs(os.path.join(tmp, "notempty")) | ||||
|         Path(os.path.join(tmp, "notempty", "file")).touch() | ||||
|         os.makedirs(os.path.join(tmp, "notempty", "empty")) | ||||
|  | ||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty")) | ||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR) | ||||
|         self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) | ||||
|         self.assertEqual(os.path.isfile( | ||||
|             os.path.join(tmp, "notempty", "file")), True) | ||||
| @@ -345,3 +329,159 @@ class TestDate(TestCase): | ||||
|         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
|         self.assertEqual(generate_filename(document), "0000001.pdf") | ||||
|  | ||||
|  | ||||
| class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||
|     def test_create_no_format(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_create_with_format(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|         self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf")) | ||||
|         self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_archive_gone(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         #Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_archive_exists(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none")) | ||||
|         Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.os.rename") | ||||
|     def test_move_archive_error(self, m): | ||||
|  | ||||
|         def fake_rename(src, dst): | ||||
|             if "archive" in src: | ||||
|                 raise OSError() | ||||
|             else: | ||||
|                 os.remove(src) | ||||
|                 Path(dst).touch() | ||||
|  | ||||
|         m.side_effect = fake_rename | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_move_file_gone(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         #Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertFalse(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     @mock.patch("documents.signals.handlers.os.rename") | ||||
|     def test_move_file_error(self, m): | ||||
|  | ||||
|         def fake_rename(src, dst): | ||||
|             if "original" in src: | ||||
|                 raise OSError() | ||||
|             else: | ||||
|                 os.remove(src) | ||||
|                 Path(dst).touch() | ||||
|  | ||||
|         m.side_effect = fake_rename | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     def test_archive_deleted(self): | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|         doc.delete() | ||||
|  | ||||
|         self.assertFalse(os.path.isfile(original)) | ||||
|         self.assertFalse(os.path.isfile(archive)) | ||||
|         self.assertFalse(os.path.isfile(doc.source_path)) | ||||
|         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||
|  | ||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||
|     def test_database_error(self): | ||||
|  | ||||
|         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||
|         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||
|         Path(original).touch() | ||||
|         Path(archive).touch() | ||||
|         doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||
|         with mock.patch("documents.signals.handlers.Document.objects.filter") as m: | ||||
|             m.side_effect = DatabaseError() | ||||
|             doc.save() | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(original)) | ||||
|         self.assertTrue(os.path.isfile(archive)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|   | ||||
							
								
								
									
										42
									
								
								src/documents/tests/test_management_archiver.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,42 @@ | ||||
| import filecmp | ||||
| import os | ||||
| import shutil | ||||
|  | ||||
| from django.core.management import call_command | ||||
| from django.test import TestCase | ||||
|  | ||||
| from documents.management.commands.document_archiver import handle_document | ||||
| from documents.models import Document | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
|  | ||||
|  | ||||
| sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||
|  | ||||
|  | ||||
| class TestArchiver(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def make_models(self): | ||||
|         self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf") | ||||
|         #self.d2 = Document.objects.create(checksum="B", title="B", content="second document") | ||||
|         #self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document") | ||||
|  | ||||
|     def test_archiver(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|  | ||||
|         call_command('document_archiver') | ||||
|  | ||||
|     def test_handle_document(self): | ||||
|  | ||||
|         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||
|         self.make_models() | ||||
|  | ||||
|         handle_document(self.d1) | ||||
|  | ||||
|         doc = Document.objects.get(id=self.d1.id) | ||||
|  | ||||
|         self.assertIsNotNone(doc.checksum) | ||||
|         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||
|         self.assertTrue(os.path.isfile(doc.source_path)) | ||||
|         self.assertTrue(filecmp.cmp(sample_file, doc.source_path)) | ||||
| @@ -23,10 +23,7 @@ class TestExporter(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") | ||||
|  | ||||
|         with open(file, "rb") as f: | ||||
|             checksum = hashlib.md5(f.read()).hexdigest() | ||||
|  | ||||
|         Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") | ||||
|         Document.objects.create(checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") | ||||
|         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||
|         Tag.objects.create(name="t") | ||||
|         DocumentType.objects.create(name="dt") | ||||
| @@ -51,6 +48,14 @@ class TestExporter(DirectoriesMixin, TestCase): | ||||
|                     checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                 self.assertEqual(checksum, element['fields']['checksum']) | ||||
|  | ||||
|                 if document_exporter.EXPORTER_ARCHIVE_NAME in element: | ||||
|                     fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME]) | ||||
|                     self.assertTrue(os.path.exists(fname)) | ||||
|  | ||||
|                     with open(fname, "rb") as f: | ||||
|                         checksum = hashlib.md5(f.read()).hexdigest() | ||||
|                     self.assertEqual(checksum, element['fields']['archive_checksum']) | ||||
|  | ||||
|         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") | ||||
|  | ||||
|         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) | ||||
|   | ||||
| @@ -1,11 +1,13 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from tempfile import TemporaryDirectory | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | ||||
|     get_parser_class_for_mime_type | ||||
|     get_parser_class_for_mime_type, DocumentParser | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | ||||
| from paperless_text.parsers import TextDocumentParser | ||||
|  | ||||
| @@ -66,6 +68,38 @@ class TestParserDiscovery(TestCase): | ||||
|             ) | ||||
|  | ||||
|  | ||||
| def fake_get_thumbnail(self, path, mimetype): | ||||
|     return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") | ||||
|  | ||||
|  | ||||
| class TestBaseParser(TestCase): | ||||
|  | ||||
|     def setUp(self) -> None: | ||||
|  | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|         override_settings( | ||||
|             SCRATCH_DIR=self.scratch | ||||
|         ).enable() | ||||
|  | ||||
|     def tearDown(self) -> None: | ||||
|         shutil.rmtree(self.scratch) | ||||
|  | ||||
|     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||
|     @override_settings(OPTIMIZE_THUMBNAILS=True) | ||||
|     def test_get_optimised_thumbnail(self): | ||||
|         parser = DocumentParser(None) | ||||
|  | ||||
|         parser.get_optimised_thumbnail("any", "not important") | ||||
|  | ||||
|     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||
|     @override_settings(OPTIMIZE_THUMBNAILS=False) | ||||
|     def test_get_optimised_thumb_disabled(self): | ||||
|         parser = DocumentParser(None) | ||||
|  | ||||
|         path = parser.get_optimised_thumbnail("any", "not important") | ||||
|         self.assertEqual(path, fake_get_thumbnail(None, None, None)) | ||||
|  | ||||
|  | ||||
| class TestParserAvailability(TestCase): | ||||
|  | ||||
|     def test_file_extensions(self): | ||||
|   | ||||
| @@ -17,10 +17,12 @@ def setup_directories(): | ||||
|     dirs.index_dir = os.path.join(dirs.data_dir, "index") | ||||
|     dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals") | ||||
|     dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails") | ||||
|     dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive") | ||||
|  | ||||
|     os.makedirs(dirs.index_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.originals_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.thumbnail_dir, exist_ok=True) | ||||
|     os.makedirs(dirs.archive_dir, exist_ok=True) | ||||
|  | ||||
|     override_settings( | ||||
|         DATA_DIR=dirs.data_dir, | ||||
| @@ -28,6 +30,7 @@ def setup_directories(): | ||||
|         MEDIA_ROOT=dirs.media_dir, | ||||
|         ORIGINALS_DIR=dirs.originals_dir, | ||||
|         THUMBNAIL_DIR=dirs.thumbnail_dir, | ||||
|         ARCHIVE_DIR=dirs.archive_dir, | ||||
|         CONSUMPTION_DIR=dirs.consumption_dir, | ||||
|         INDEX_DIR=dirs.index_dir, | ||||
|         MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| import os | ||||
|  | ||||
| from django.db.models import Count, Max | ||||
| from django.http import HttpResponse, HttpResponseBadRequest, Http404 | ||||
| from django.views.decorators.cache import cache_control | ||||
| @@ -126,17 +128,30 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|         index.remove_document_from_index(self.get_object()) | ||||
|         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) | ||||
|  | ||||
|     def file_response(self, pk, disposition): | ||||
|     @staticmethod | ||||
|     def original_requested(request): | ||||
|         return ( | ||||
|             'original' in request.query_params and | ||||
|             request.query_params['original'] == 'true' | ||||
|         ) | ||||
|  | ||||
|     def file_response(self, pk, request, disposition): | ||||
|         doc = Document.objects.get(id=pk) | ||||
|  | ||||
|         if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: | ||||
|             file_handle = doc.source_file | ||||
|         if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501 | ||||
|             file_handle = doc.archive_file | ||||
|             filename = doc.archive_file_name | ||||
|             mime_type = 'application/pdf' | ||||
|         else: | ||||
|             file_handle = GnuPG.decrypted(doc.source_file) | ||||
|             file_handle = doc.source_file | ||||
|             filename = doc.file_name | ||||
|             mime_type = doc.mime_type | ||||
|  | ||||
|         response = HttpResponse(file_handle, content_type=doc.mime_type) | ||||
|         if doc.storage_type == Document.STORAGE_TYPE_GPG: | ||||
|             file_handle = GnuPG.decrypted(file_handle) | ||||
|  | ||||
|         response = HttpResponse(file_handle, content_type=mime_type) | ||||
|         response["Content-Disposition"] = '{}; filename="{}"'.format( | ||||
|             disposition, doc.file_name) | ||||
|             disposition, filename) | ||||
|         return response | ||||
|  | ||||
|     @action(methods=['post'], detail=False) | ||||
| @@ -157,6 +172,8 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|                 "paperless__checksum": doc.checksum, | ||||
|                 "paperless__mime_type": doc.mime_type, | ||||
|                 "paperless__filename": doc.filename, | ||||
|                 "paperless__has_archive_version": | ||||
|                     os.path.isfile(doc.archive_path) | ||||
|             }) | ||||
|         except Document.DoesNotExist: | ||||
|             raise Http404() | ||||
| @@ -164,7 +181,8 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def preview(self, request, pk=None): | ||||
|         try: | ||||
|             response = self.file_response(pk, "inline") | ||||
|             response = self.file_response( | ||||
|                 pk, request, "inline") | ||||
|             return response | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
| @@ -181,7 +199,8 @@ class DocumentViewSet(RetrieveModelMixin, | ||||
|     @action(methods=['get'], detail=True) | ||||
|     def download(self, request, pk=None): | ||||
|         try: | ||||
|             return self.file_response(pk, "attachment") | ||||
|             return self.file_response( | ||||
|                 pk, request, "attachment") | ||||
|         except (FileNotFoundError, Document.DoesNotExist): | ||||
|             raise Http404() | ||||
|  | ||||
|   | ||||
| @@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs): | ||||
|     binaries = ( | ||||
|         settings.CONVERT_BINARY, | ||||
|         settings.OPTIPNG_BINARY, | ||||
|         settings.UNPAPER_BINARY, | ||||
|         "tesseract" | ||||
|     ) | ||||
|  | ||||
|   | ||||
| @@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta | ||||
|  | ||||
| MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) | ||||
| ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") | ||||
| ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") | ||||
| THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | ||||
|  | ||||
| DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | ||||
| @@ -348,9 +349,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | ||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||
| OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||
|  | ||||
| # OCRmyPDF --output-type options are available. | ||||
| # TODO: validate this setting. | ||||
| OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | ||||
|  | ||||
| # OCR all documents? | ||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") | ||||
| # skip. redo, force | ||||
| # TODO: validate this. | ||||
| OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||
|  | ||||
| OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||
|  | ||||
| OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||
|  | ||||
| # GNUPG needs a home directory for some reason | ||||
| GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
| @@ -359,11 +368,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||
| CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") | ||||
| CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||
| CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||
| CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300)) | ||||
|  | ||||
| GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") | ||||
|  | ||||
| OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | ||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") | ||||
|  | ||||
|  | ||||
| # Pre-2.x versions of Paperless stored your documents locally with GPG | ||||
|   | ||||
| @@ -14,12 +14,21 @@ def get_tesseract_langs(): | ||||
|  | ||||
| @register() | ||||
| def check_default_language_available(app_configs, **kwargs): | ||||
|     langs = get_tesseract_langs() | ||||
|     installed_langs = get_tesseract_langs() | ||||
|  | ||||
|     if settings.OCR_LANGUAGE not in langs: | ||||
|     if not settings.OCR_LANGUAGE: | ||||
|         return [Warning( | ||||
|             "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " | ||||
|             "This means that tesseract will fallback to english." | ||||
|         )] | ||||
|  | ||||
|     specified_langs = settings.OCR_LANGUAGE.split("+") | ||||
|  | ||||
|     for lang in specified_langs: | ||||
|         if lang not in installed_langs: | ||||
|             return [Error( | ||||
|             f"The default ocr language {settings.OCR_LANGUAGE} is " | ||||
|                 f"The selected ocr language {lang} is " | ||||
|                 f"not installed. Paperless cannot OCR your documents " | ||||
|                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] | ||||
|     else: | ||||
|  | ||||
|     return [] | ||||
|   | ||||
| @@ -1,23 +1,15 @@ | ||||
| import itertools | ||||
| import json | ||||
| import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import ThreadPool | ||||
|  | ||||
| import langdetect | ||||
| import ocrmypdf | ||||
| import pdftotext | ||||
| import pyocr | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
| from pyocr import PyocrException | ||||
| from ocrmypdf import InputFileError | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, run_unpaper, \ | ||||
|     run_convert | ||||
| from .languages import ISO639 | ||||
|  | ||||
|  | ||||
| class OCRError(Exception): | ||||
|     pass | ||||
| from documents.parsers import DocumentParser, ParseError, run_convert | ||||
|  | ||||
|  | ||||
| class RasterisedDocumentParser(DocumentParser): | ||||
| @@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group): | ||||
|         super().__init__(path, logging_group) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|         """ | ||||
| @@ -44,7 +32,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                         alpha="remove", | ||||
|                         strip=True, | ||||
|                         trim=True, | ||||
|                         input_file="{}[0]".format(self.document_path), | ||||
|                         input_file="{}[0]".format(document_path), | ||||
|                         output_file=out_path, | ||||
|                         logging_group=self.logging_group) | ||||
|         except ParseError: | ||||
| @@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                    "-q", | ||||
|                    "-sDEVICE=pngalpha", | ||||
|                    "-o", gs_out_path, | ||||
|                    self.document_path] | ||||
|                    document_path] | ||||
|             if not subprocess.Popen(cmd).wait() == 0: | ||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||
|             # then run convert on the output from gs | ||||
| @@ -74,169 +62,126 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def _is_ocred(self): | ||||
|  | ||||
|         # Extract text from PDF using pdftotext | ||||
|         text = get_text_from_pdf(self.document_path) | ||||
|  | ||||
|         # We assume, that a PDF with at least 50 characters contains text | ||||
|         # (so no OCR required) | ||||
|         return len(text) > 50 | ||||
|  | ||||
|     def get_text(self): | ||||
|  | ||||
|         if self._text is not None: | ||||
|             return self._text | ||||
|  | ||||
|         if not settings.OCR_ALWAYS and self._is_ocred(): | ||||
|             self.log("debug", "Skipping OCR, using Text from PDF") | ||||
|             self._text = get_text_from_pdf(self.document_path) | ||||
|             return self._text | ||||
|  | ||||
|         images = self._get_greyscale() | ||||
|  | ||||
|         if not images: | ||||
|             raise ParseError("Empty document, nothing to do.") | ||||
|     def is_image(self, mime_type): | ||||
|         return mime_type in [ | ||||
|             "image/png", | ||||
|             "image/jpeg" | ||||
|         ] | ||||
|  | ||||
|     def get_dpi(self, image): | ||||
|         try: | ||||
|  | ||||
|             sample_page_index = int(len(images) / 2) | ||||
|             self.log( | ||||
|                 "debug", | ||||
|                 f"Attempting language detection on page " | ||||
|                 f"{sample_page_index + 1} of {len(images)}...") | ||||
|  | ||||
|             sample_page_text = self._ocr([images[sample_page_index]], | ||||
|                                          settings.OCR_LANGUAGE)[0] | ||||
|             guessed_language = self._guess_language(sample_page_text) | ||||
|  | ||||
|             if not guessed_language or guessed_language not in ISO639: | ||||
|                 self.log("warning", "Language detection failed.") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             elif ISO639[guessed_language] == settings.OCR_LANGUAGE: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Detected language: {guessed_language} " | ||||
|                     f"(default language)") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501 | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"Detected language {guessed_language} is not available " | ||||
|                     f"on this system.") | ||||
|                 ocr_pages = self._complete_ocr_default_language( | ||||
|                     images, sample_page_index, sample_page_text) | ||||
|  | ||||
|             else: | ||||
|                 self.log("debug", f"Detected language: {guessed_language}") | ||||
|                 ocr_pages = self._ocr(images, ISO639[guessed_language]) | ||||
|  | ||||
|             self.log("debug", "OCR completed.") | ||||
|             self._text = strip_excess_whitespace(" ".join(ocr_pages)) | ||||
|             return self._text | ||||
|  | ||||
|         except OCRError as e: | ||||
|             raise ParseError(e) | ||||
|  | ||||
|     def _get_greyscale(self): | ||||
|         """ | ||||
|         Greyscale images are easier for Tesseract to OCR | ||||
|         """ | ||||
|  | ||||
|         # Convert PDF to multiple PNMs | ||||
|         input_file = self.document_path | ||||
|  | ||||
|         if settings.OCR_PAGES == 1: | ||||
|             input_file += "[0]" | ||||
|         elif settings.OCR_PAGES > 1: | ||||
|             input_file += f"[0-{settings.OCR_PAGES - 1}]" | ||||
|  | ||||
|         self.log( | ||||
|             "debug", | ||||
|             f"Converting document {input_file} into greyscale images") | ||||
|  | ||||
|         output_files = os.path.join(self.tempdir, "convert-%04d.pnm") | ||||
|  | ||||
|         run_convert(density=settings.CONVERT_DENSITY, | ||||
|                     depth="8", | ||||
|                     type="grayscale", | ||||
|                     input_file=input_file, | ||||
|                     output_file=output_files, | ||||
|                     logging_group=self.logging_group) | ||||
|  | ||||
|         # Get a list of converted images | ||||
|         pnms = [] | ||||
|         for f in os.listdir(self.tempdir): | ||||
|             if f.endswith(".pnm"): | ||||
|                 pnms.append(os.path.join(self.tempdir, f)) | ||||
|  | ||||
|         self.log("debug", f"Running unpaper on {len(pnms)} pages...") | ||||
|  | ||||
|         # Run unpaper in parallel on converted images | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             pnms = pool.map(run_unpaper, pnms) | ||||
|  | ||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) | ||||
|  | ||||
|     def _guess_language(self, text): | ||||
|         try: | ||||
|             guess = langdetect.detect(text) | ||||
|             return guess | ||||
|             with Image.open(image) as im: | ||||
|                 x, y = im.info['dpi'] | ||||
|                 return x | ||||
|         except Exception as e: | ||||
|             self.log('warning', f"Language detection failed with: {e}") | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while getting DPI from image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def _ocr(self, imgs, lang): | ||||
|     def parse(self, document_path, mime_type): | ||||
|         if settings.OCR_MODE == "skip_noarchive": | ||||
|             text = get_text_from_pdf(document_path) | ||||
|             if text and len(text) > 50: | ||||
|                 self.text = text | ||||
|                 return | ||||
|  | ||||
|         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||
|  | ||||
|         ocr_args = { | ||||
|             'input_file': document_path, | ||||
|             'output_file': archive_path, | ||||
|             'use_threads': True, | ||||
|             'jobs': settings.THREADS_PER_WORKER, | ||||
|             'language': settings.OCR_LANGUAGE, | ||||
|             'output_type': settings.OCR_OUTPUT_TYPE, | ||||
|             'progress_bar': False, | ||||
|             'clean': True | ||||
|         } | ||||
|  | ||||
|         if settings.OCR_PAGES > 0: | ||||
|             ocr_args['pages'] = f"1-{settings.OCR_PAGES}" | ||||
|  | ||||
|         if settings.OCR_MODE in ['skip', 'skip_noarchive']: | ||||
|             ocr_args['skip_text'] = True | ||||
|         elif settings.OCR_MODE == 'redo': | ||||
|             ocr_args['redo_ocr'] = True | ||||
|         elif settings.OCR_MODE == 'force': | ||||
|             ocr_args['force_ocr'] = True | ||||
|  | ||||
|         if self.is_image(mime_type): | ||||
|             dpi = self.get_dpi(document_path) | ||||
|             if dpi: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|             f"Performing OCR on {len(imgs)} page(s) with language {lang}") | ||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: | ||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) | ||||
|             return r | ||||
|  | ||||
|     def _complete_ocr_default_language(self, | ||||
|                                        images, | ||||
|                                        sample_page_index, | ||||
|                                        sample_page): | ||||
|         images_copy = list(images) | ||||
|         del images_copy[sample_page_index] | ||||
|         if images_copy: | ||||
|             self.log('debug', "Continuing ocr with default language.") | ||||
|             ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) | ||||
|             ocr_pages.insert(sample_page_index, sample_page) | ||||
|             return ocr_pages | ||||
|                     f"Detected DPI for image {document_path}: {dpi}" | ||||
|                 ) | ||||
|                 ocr_args['image_dpi'] = dpi | ||||
|             elif settings.OCR_IMAGE_DPI: | ||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||
|             else: | ||||
|             return [sample_page] | ||||
|                 raise ParseError( | ||||
|                     f"Cannot produce archive PDF for image {document_path}, " | ||||
|                     f"no DPI information is present in this image and " | ||||
|                     f"OCR_IMAGE_DPI is not set.") | ||||
|  | ||||
|         if settings.OCR_USER_ARGS: | ||||
|             try: | ||||
|                 user_args = json.loads(settings.OCR_USER_ARGS) | ||||
|                 ocr_args = {**ocr_args, **user_args} | ||||
|             except Exception as e: | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " | ||||
|                     f"they will not be used: {e}") | ||||
|  | ||||
|         # This forces tesseract to use one core per page. | ||||
|         os.environ['OMP_THREAD_LIMIT'] = "1" | ||||
|  | ||||
|         try: | ||||
|             self.log("debug", | ||||
|                      f"Calling OCRmyPDF with {str(ocr_args)}") | ||||
|             ocrmypdf.ocr(**ocr_args) | ||||
|             # success! announce results | ||||
|             self.archive_path = archive_path | ||||
|             self.text = get_text_from_pdf(archive_path) | ||||
|  | ||||
|         except InputFileError as e: | ||||
|             # This happens with some PDFs when used with the redo_ocr option. | ||||
|             # This is not the end of the world, we'll just use what we already | ||||
|             # have in the document. | ||||
|             self.text = get_text_from_pdf(document_path) | ||||
|             # Also, no archived file. | ||||
|             if not self.text: | ||||
|                 # However, if we don't have anything, fail: | ||||
|                 raise ParseError(e) | ||||
|  | ||||
|         except Exception as e: | ||||
|             # Anything else is probably serious. | ||||
|             raise ParseError(e) | ||||
|  | ||||
|         if not self.text: | ||||
|             # This may happen for files that don't have any text. | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Document {document_path} does not have any text." | ||||
|                 f"This is probably an error or you tried to add an image " | ||||
|                 f"without text.") | ||||
|             self.text = "" | ||||
|  | ||||
|  | ||||
| def strip_excess_whitespace(text): | ||||
|     if not text: | ||||
|         return None | ||||
|  | ||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||
|     no_leading_whitespace = re.sub( | ||||
|         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) | ||||
|     no_trailing_whitespace = re.sub( | ||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||
|     return no_trailing_whitespace | ||||
|  | ||||
|  | ||||
| def image_to_string(args): | ||||
|     img, lang = args | ||||
|     ocr = pyocr.get_available_tools()[0] | ||||
|     with Image.open(img) as f: | ||||
|         if ocr.can_detect_orientation(): | ||||
|             try: | ||||
|                 orientation = ocr.detect_orientation(f, lang=lang) | ||||
|                 f = f.rotate(orientation["angle"], expand=1) | ||||
|             except Exception: | ||||
|                 # Rotation not possible, ignore | ||||
|                 pass | ||||
|         try: | ||||
|             return ocr.image_to_string(f, lang=lang) | ||||
|         except PyocrException as e: | ||||
|             raise OCRError(e) | ||||
|     # TODO: this needs a rework | ||||
|     return no_trailing_whitespace.strip() | ||||
|  | ||||
|  | ||||
| def get_text_from_pdf(pdf_file): | ||||
| @@ -245,6 +190,9 @@ def get_text_from_pdf(pdf_file): | ||||
|         try: | ||||
|             pdf = pdftotext.PDF(f) | ||||
|         except pdftotext.Error: | ||||
|             return "" | ||||
|             # might not be a PDF file | ||||
|             return None | ||||
|  | ||||
|     return "\n".join(pdf) | ||||
|     text = "\n".join(pdf) | ||||
|  | ||||
|     return strip_excess_whitespace(text) | ||||
|   | ||||
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/multi-page-digital.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/multi-page-images.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/no-text-alpha.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple-alpha.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 8.2 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple-no-dpi.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 6.8 KiB | 
| Before Width: | Height: | Size: 7.7 KiB After Width: | Height: | Size: 7.2 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/with-form.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -1,193 +0,0 @@ | ||||
| import datetime | ||||
| import os | ||||
| import shutil | ||||
| from unittest import mock | ||||
| from uuid import uuid4 | ||||
|  | ||||
| from dateutil import tz | ||||
| from django.conf import settings | ||||
| from django.test import TestCase, override_settings | ||||
|  | ||||
| from ..parsers import RasterisedDocumentParser | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||
|  | ||||
|     def setUp(self): | ||||
|         os.makedirs(self.SCRATCH, exist_ok=True) | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.SCRATCH) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_1(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 130218 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_2(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 2018 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_3(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 20180213 lorem ipsum" | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_4(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_5(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||
|             "ipsum" | ||||
|         ) | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2018, 2, 13, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_6(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual(document.get_date(), None) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_7(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "März 2019\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         date = document.get_date() | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             datetime.datetime( | ||||
|                 2019, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_8(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "Wohnort\n" | ||||
|             "3100\n" | ||||
|             "IBAN\n" | ||||
|             "AT87 4534\n" | ||||
|             "1234\n" | ||||
|             "1234 5678\n" | ||||
|             "BIC\n" | ||||
|             "lorem ipsum\n" | ||||
|             "März 2020" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             document.get_date(), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_date_format_9(self): | ||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") | ||||
|         document = RasterisedDocumentParser(input_file, None) | ||||
|         document._text = ( | ||||
|             "lorem ipsum\n" | ||||
|             "27. Nullmonth 2020\n" | ||||
|             "März 2020\n" | ||||
|             "lorem ipsum" | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             document.get_date(), | ||||
|             datetime.datetime( | ||||
|                 2020, 3, 1, 0, 0, | ||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="01-07-0590 00:00:00" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_past(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="01-07-2350 00:00:00" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_future(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="20 408000l 2475" | ||||
|     ) | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_crazy_date_with_spaces(self, *args): | ||||
|         document = RasterisedDocumentParser("/dev/null", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
|  | ||||
|     @mock.patch( | ||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", | ||||
|         return_value="No date in here" | ||||
|     ) | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     @override_settings(SCRATCH_DIR=SCRATCH) | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None) | ||||
|         document.get_text() | ||||
|         self.assertIsNone(document.get_date()) | ||||
| @@ -1,76 +0,0 @@ | ||||
| import os | ||||
| from unittest import mock, skipIf | ||||
|  | ||||
| import pyocr | ||||
| from django.test import TestCase | ||||
| from pyocr.libtesseract.tesseract_raw import \ | ||||
|     TesseractError as OtherTesseractError | ||||
|  | ||||
| from ..parsers import image_to_string, strip_excess_whitespace | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise OtherTesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         return "This is test text" | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| class TestOCR(TestCase): | ||||
|  | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) | ||||
|  | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") | ||||
|     @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
|     def test_image_to_string_with_text_free_page(self): | ||||
|         """ | ||||
|         This test is sort of silly, since it's really just reproducing an odd | ||||
|         exception thrown by pyocr when it encounters a page with no text. | ||||
|         Actually running this test against an installation of Tesseract results | ||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I | ||||
|         don't care to dig.  Regardless, if you run the consumer normally, | ||||
|         text-free pages are now handled correctly so long as we work around | ||||
|         this weird exception. | ||||
|         """ | ||||
|         image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) | ||||
| @@ -1,46 +1,17 @@ | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| import uuid | ||||
| from typing import ContextManager | ||||
| from unittest import mock | ||||
|  | ||||
| from django.test import TestCase, override_settings | ||||
| from pyocr.error import TesseractError | ||||
|  | ||||
| from documents.parsers import ParseError, run_convert | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | ||||
| from documents.tests.utils import DirectoriesMixin | ||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace | ||||
|  | ||||
| image_to_string_calls = [] | ||||
|  | ||||
|  | ||||
| class FakeTesseract(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def can_detect_orientation(): | ||||
|         return True | ||||
|  | ||||
|     @staticmethod | ||||
|     def detect_orientation(file_handle, lang): | ||||
|         raise TesseractError("arbitrary status", "message") | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_languages(): | ||||
|         return ['eng', 'deu'] | ||||
|  | ||||
|     @staticmethod | ||||
|     def image_to_string(file_handle, lang): | ||||
|         image_to_string_calls.append((file_handle.name, lang)) | ||||
|         return file_handle.read() | ||||
|  | ||||
|  | ||||
| class FakePyOcr(object): | ||||
|  | ||||
|     @staticmethod | ||||
|     def get_available_tools(): | ||||
|         return [FakeTesseract] | ||||
|  | ||||
|  | ||||
| def fake_convert(input_file, output_file, **kwargs): | ||||
|     with open(input_file) as f: | ||||
|         lines = f.readlines() | ||||
| @@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs): | ||||
|             f2.write(line.strip()) | ||||
|  | ||||
|  | ||||
| def fake_unpaper(pnm): | ||||
|     output = pnm + ".unpaper.pnm" | ||||
|     shutil.copy(pnm, output) | ||||
|     return output | ||||
|  | ||||
|  | ||||
| class FakeImageFile(ContextManager): | ||||
|     def __init__(self, fname): | ||||
|         self.fname = fname | ||||
| @@ -67,142 +32,50 @@ class FakeImageFile(ContextManager): | ||||
|         return os.path.basename(self.fname) | ||||
|  | ||||
|  | ||||
| fake_image = FakeImageFile | ||||
|  | ||||
|  | ||||
| @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | ||||
| @mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) | ||||
| @mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) | ||||
| @mock.patch("paperless_tesseract.parsers.Image.open", open) | ||||
| class TestRasterisedDocumentParser(TestCase): | ||||
| class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|     def assertContainsStrings(self, content, strings): | ||||
|         # Asserts that all strings appear in content, in the given order. | ||||
|         indices = [content.index(s) for s in strings] | ||||
|         self.assertListEqual(indices, sorted(indices)) | ||||
|  | ||||
|         global image_to_string_calls | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|     ] | ||||
|  | ||||
|         image_to_string_calls = [] | ||||
|  | ||||
|         override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch) | ||||
|  | ||||
|     def get_input_file(self, pages): | ||||
|         _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) | ||||
|         with open(fname, "w") as f: | ||||
|             f.writelines([f"line {p}\n" for p in range(pages)]) | ||||
|         return fname | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_simple_language_match(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_2_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") | ||||
|     def test_parse_text_3_pages(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) | ||||
|     def test_parse_text_lang_detect_failed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") | ||||
|     def test_parse_text_lang_not_installed(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2 line 3") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_text_lang_mismatch(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) | ||||
|         text = parser.get_text() | ||||
|         self.assertEqual(text, "line 0 line 1 line 2") | ||||
|  | ||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") | ||||
|     def test_parse_empty_doc(self): | ||||
|         parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) | ||||
|         try: | ||||
|             parser.get_text() | ||||
|         except ParseError as e: | ||||
|             self.assertEqual("Empty document, nothing to do.", str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|  | ||||
| class TestAuxilliaryFunctions(TestCase): | ||||
|  | ||||
|     def setUp(self): | ||||
|         self.scratch = tempfile.mkdtemp() | ||||
|  | ||||
|         override_settings(SCRATCH_DIR=self.scratch).enable() | ||||
|  | ||||
|     def tearDown(self): | ||||
|         shutil.rmtree(self.scratch) | ||||
|     def test_strip_excess_whitespace(self): | ||||
|         for source, result in self.text_cases: | ||||
|             actual_result = strip_excess_whitespace(source) | ||||
|             self.assertEqual( | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def test_get_text_from_pdf(self): | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf')) | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')) | ||||
|  | ||||
|         self.assertEqual(text.strip(), "This is a test document.") | ||||
|  | ||||
|     def test_get_text_from_pdf_error(self): | ||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) | ||||
|  | ||||
|         self.assertEqual(text.strip(), "") | ||||
|  | ||||
|     def test_image_to_string(self): | ||||
|         text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) | ||||
|  | ||||
|         self.assertEqual(text, "This is a test document.") | ||||
|  | ||||
|     def test_image_to_string_language_unavailable(self): | ||||
|         try: | ||||
|             image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) | ||||
|         except OCRError as e: | ||||
|             self.assertTrue("Failed loading language" in str(e)) | ||||
|         else: | ||||
|             self.fail("Should raise exception") | ||||
|  | ||||
|     @override_settings(OCR_ALWAYS=False) | ||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") | ||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") | ||||
|     def test_is_ocred(self, m2, m): | ||||
|         parser = RasterisedDocumentParser("", uuid.uuid4()) | ||||
|         m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " \ | ||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " | ||||
|         parser.get_text() | ||||
|         self.assertEqual(m.call_count, 2) | ||||
|         self.assertEqual(m2.call_count, 0) | ||||
|         self.assertContainsStrings(text.strip(), ["This is a test document."]) | ||||
|  | ||||
|     def test_thumbnail(self): | ||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||
|         parser.get_thumbnail() | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         # dont really know how to test it, just call it and assert that it does not raise anything. | ||||
|  | ||||
|     @mock.patch("paperless_tesseract.parsers.run_convert") | ||||
| @@ -216,6 +89,161 @@ class TestAuxilliaryFunctions(TestCase): | ||||
|  | ||||
|         m.side_effect = call_convert | ||||
|  | ||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) | ||||
|         parser.get_thumbnail() | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         # dont really know how to test it, just call it and assert that it does not raise anything. | ||||
|  | ||||
|     def test_get_dpi(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) | ||||
|         self.assertEqual(dpi, None) | ||||
|  | ||||
|         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png")) | ||||
|         self.assertEqual(dpi, 72) | ||||
|  | ||||
|     def test_simple_digital(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||
|  | ||||
|     def test_with_form(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     def test_with_form_error(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None) | ||||
|     def test_with_form_error_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @override_settings(OCR_MODE="force") | ||||
|     def test_with_form_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|  | ||||
|     def test_image_simple(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||
|  | ||||
|     def test_image_simple_alpha_fail(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|  | ||||
|     def test_image_no_dpi_fail(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
|     @override_settings(OCR_IMAGE_DPI=72) | ||||
|     def test_image_no_dpi_default(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."]) | ||||
|  | ||||
|     def test_multi_page(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="skip") | ||||
|     def test_multi_page_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="force") | ||||
|     def test_multi_page_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OOCR_MODE="skip") | ||||
|     def test_multi_page_analog_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_analog_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||
|     def test_multi_page_analog_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) | ||||
|         self.assertFalse("page 2" in parser.get_text().lower()) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_withtext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         self.assertTrue(os.path.join(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|   | ||||
| @@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser): | ||||
|     This parser directly parses a text document (.txt, .md, or .csv) | ||||
|     """ | ||||
|  | ||||
|     def __init__(self, path, logging_group): | ||||
|         super().__init__(path, logging_group) | ||||
|         self._text = None | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|     def get_thumbnail(self, document_path, mime_type): | ||||
|         """ | ||||
|         The thumbnail of a text file is just a 500px wide image of the text | ||||
|         rendered onto a letter-sized page. | ||||
| @@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser): | ||||
|             ) | ||||
|  | ||||
|         def read_text(): | ||||
|             with open(self.document_path, 'r') as src: | ||||
|             with open(document_path, 'r') as src: | ||||
|                 lines = [line.strip() for line in src.readlines()] | ||||
|                 text = "\n".join([line for line in lines[:n_lines]]) | ||||
|                 return text.replace('"', "'") | ||||
| @@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser): | ||||
|  | ||||
|         return out_path | ||||
|  | ||||
|     def get_text(self): | ||||
|  | ||||
|         if self._text is not None: | ||||
|             return self._text | ||||
|  | ||||
|         with open(self.document_path, 'r') as f: | ||||
|             self._text = f.read() | ||||
|  | ||||
|         return self._text | ||||
|     def parse(self, document_path, mime_type): | ||||
|         with open(document_path, 'r') as f: | ||||
|             self.text = f.read() | ||||
|  | ||||
|  | ||||
| def run_command(*args): | ||||
|   | ||||
 jonaswinkler
					jonaswinkler