Merge branch 'feature-ocrmypdf' into dev
							
								
								
									
										13
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						| @@ -76,16 +76,11 @@ scripts/nuke | |||||||
| /static/ | /static/ | ||||||
|  |  | ||||||
| # Stored PDFs | # Stored PDFs | ||||||
| /media/documents/originals/* | /media/ | ||||||
| /media/documents/thumbnails/* | /data/ | ||||||
|  |  | ||||||
| /data/classification_model.pickle |  | ||||||
| /data/db.sqlite3 |  | ||||||
| /data/index |  | ||||||
|  |  | ||||||
| /paperless.conf | /paperless.conf | ||||||
| /consume | /consume/ | ||||||
| /export | /export/ | ||||||
| /src-ui/.vscode | /src-ui/.vscode | ||||||
|  |  | ||||||
| # this is where the compiled frontend is moved to. | # this is where the compiled frontend is moved to. | ||||||
|   | |||||||
| @@ -1,5 +1,8 @@ | |||||||
| language: python | language: python | ||||||
|  |  | ||||||
|  | dist: focal | ||||||
|  | os: linux | ||||||
|  |  | ||||||
| jobs: | jobs: | ||||||
|   include: |   include: | ||||||
|     - name: "Paperless on Python 3.6" |     - name: "Paperless on Python 3.6" | ||||||
| @@ -33,7 +36,7 @@ jobs: | |||||||
|  |  | ||||||
| before_install: | before_install: | ||||||
|   - sudo apt-get update -qq |   - sudo apt-get update -qq | ||||||
|   - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript |   - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript optipng | ||||||
|  |  | ||||||
| install: | install: | ||||||
|   - pip install --upgrade pipenv |   - pip install --upgrade pipenv | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						| @@ -26,7 +26,6 @@ langdetect = "*" | |||||||
| pdftotext = "*" | pdftotext = "*" | ||||||
| pathvalidate = "*" | pathvalidate = "*" | ||||||
| pillow = "*" | pillow = "*" | ||||||
| pyocr = "~=0.7.2" |  | ||||||
| python-gnupg = "*" | python-gnupg = "*" | ||||||
| python-dotenv = "*" | python-dotenv = "*" | ||||||
| python-dateutil = "*" | python-dateutil = "*" | ||||||
| @@ -39,6 +38,7 @@ whitenoise = "~=5.2.0" | |||||||
| watchdog = "*" | watchdog = "*" | ||||||
| whoosh="~=2.7.4" | whoosh="~=2.7.4" | ||||||
| inotifyrecursive = ">=0.3.4" | inotifyrecursive = ">=0.3.4" | ||||||
|  | ocrmypdf = "*" | ||||||
|  |  | ||||||
| [dev-packages] | [dev-packages] | ||||||
| coveralls = "*" | coveralls = "*" | ||||||
|   | |||||||
							
								
								
									
										298
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						| @@ -1,7 +1,7 @@ | |||||||
| { | { | ||||||
|     "_meta": { |     "_meta": { | ||||||
|         "hash": { |         "hash": { | ||||||
|             "sha256": "d266e1f67e3090ec68aa8ecba1e8373351daf89ad5a5ab46524d123bcaf29f62" |             "sha256": "55c9136777e78d6cd362628cd1fc0c5ff36b437699b92089ce504d598004371d" | ||||||
|         }, |         }, | ||||||
|         "pipfile-spec": 6, |         "pipfile-spec": 6, | ||||||
|         "requires": { |         "requires": { | ||||||
| @@ -44,6 +44,94 @@ | |||||||
|             ], |             ], | ||||||
|             "version": "==1.17.12" |             "version": "==1.17.12" | ||||||
|         }, |         }, | ||||||
|  |         "cffi": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e", | ||||||
|  |                 "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d", | ||||||
|  |                 "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a", | ||||||
|  |                 "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec", | ||||||
|  |                 "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362", | ||||||
|  |                 "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668", | ||||||
|  |                 "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c", | ||||||
|  |                 "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b", | ||||||
|  |                 "sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654", | ||||||
|  |                 "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06", | ||||||
|  |                 "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698", | ||||||
|  |                 "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2", | ||||||
|  |                 "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c", | ||||||
|  |                 "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7", | ||||||
|  |                 "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009", | ||||||
|  |                 "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03", | ||||||
|  |                 "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b", | ||||||
|  |                 "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909", | ||||||
|  |                 "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53", | ||||||
|  |                 "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35", | ||||||
|  |                 "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26", | ||||||
|  |                 "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b", | ||||||
|  |                 "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb", | ||||||
|  |                 "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293", | ||||||
|  |                 "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd", | ||||||
|  |                 "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d", | ||||||
|  |                 "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3", | ||||||
|  |                 "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5", | ||||||
|  |                 "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d", | ||||||
|  |                 "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca", | ||||||
|  |                 "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d", | ||||||
|  |                 "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775", | ||||||
|  |                 "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375", | ||||||
|  |                 "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b", | ||||||
|  |                 "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b", | ||||||
|  |                 "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f" | ||||||
|  |             ], | ||||||
|  |             "version": "==1.14.4" | ||||||
|  |         }, | ||||||
|  |         "chardet": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", | ||||||
|  |                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '3.1'", | ||||||
|  |             "version": "==3.0.4" | ||||||
|  |         }, | ||||||
|  |         "coloredlogs": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a", | ||||||
|  |                 "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505", | ||||||
|  |                 "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==14.0" | ||||||
|  |         }, | ||||||
|  |         "cryptography": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538", | ||||||
|  |                 "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f", | ||||||
|  |                 "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d", | ||||||
|  |                 "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77", | ||||||
|  |                 "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b", | ||||||
|  |                 "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33", | ||||||
|  |                 "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e", | ||||||
|  |                 "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb", | ||||||
|  |                 "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e", | ||||||
|  |                 "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b", | ||||||
|  |                 "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7", | ||||||
|  |                 "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297", | ||||||
|  |                 "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d", | ||||||
|  |                 "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7", | ||||||
|  |                 "sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b", | ||||||
|  |                 "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7", | ||||||
|  |                 "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4", | ||||||
|  |                 "sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8", | ||||||
|  |                 "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b", | ||||||
|  |                 "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851", | ||||||
|  |                 "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13", | ||||||
|  |                 "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b", | ||||||
|  |                 "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3", | ||||||
|  |                 "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==3.2.1" | ||||||
|  |         }, | ||||||
|         "dateparser": { |         "dateparser": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", |                 "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", | ||||||
| @@ -123,6 +211,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==20.0.4" |             "version": "==20.0.4" | ||||||
|         }, |         }, | ||||||
|  |         "humanfriendly": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12", | ||||||
|  |                 "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==8.2" | ||||||
|  |         }, | ||||||
|         "imap-tools": { |         "imap-tools": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", |                 "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", | ||||||
| @@ -131,6 +227,13 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==0.32.0" |             "version": "==0.32.0" | ||||||
|         }, |         }, | ||||||
|  |         "img2pdf": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5", | ||||||
|  |                 "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9" | ||||||
|  |             ], | ||||||
|  |             "version": "==0.4.0" | ||||||
|  |         }, | ||||||
|         "inotify-simple": { |         "inotify-simple": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128", |                 "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128", | ||||||
| @@ -164,6 +267,51 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==1.0.8" |             "version": "==1.0.8" | ||||||
|         }, |         }, | ||||||
|  |         "lxml": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", | ||||||
|  |                 "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", | ||||||
|  |                 "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", | ||||||
|  |                 "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", | ||||||
|  |                 "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", | ||||||
|  |                 "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", | ||||||
|  |                 "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", | ||||||
|  |                 "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", | ||||||
|  |                 "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", | ||||||
|  |                 "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", | ||||||
|  |                 "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", | ||||||
|  |                 "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", | ||||||
|  |                 "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", | ||||||
|  |                 "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", | ||||||
|  |                 "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", | ||||||
|  |                 "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", | ||||||
|  |                 "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", | ||||||
|  |                 "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", | ||||||
|  |                 "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", | ||||||
|  |                 "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", | ||||||
|  |                 "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", | ||||||
|  |                 "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", | ||||||
|  |                 "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", | ||||||
|  |                 "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", | ||||||
|  |                 "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", | ||||||
|  |                 "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", | ||||||
|  |                 "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", | ||||||
|  |                 "sha256:91d6dace31b07ab47eeadd3f4384ded2f77b94b30446410cb2c3e660e047f7a7", | ||||||
|  |                 "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", | ||||||
|  |                 "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", | ||||||
|  |                 "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", | ||||||
|  |                 "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", | ||||||
|  |                 "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", | ||||||
|  |                 "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", | ||||||
|  |                 "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", | ||||||
|  |                 "sha256:e1dbb88a937126ab14d219a000728224702e0ec0fc7ceb7131c53606b7a76772", | ||||||
|  |                 "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", | ||||||
|  |                 "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", | ||||||
|  |                 "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==4.6.2" | ||||||
|  |         }, | ||||||
|         "numpy": { |         "numpy": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", |                 "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", | ||||||
| @@ -205,6 +353,14 @@ | |||||||
|             "markers": "python_version >= '3.6'", |             "markers": "python_version >= '3.6'", | ||||||
|             "version": "==1.19.4" |             "version": "==1.19.4" | ||||||
|         }, |         }, | ||||||
|  |         "ocrmypdf": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3", | ||||||
|  |                 "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928" | ||||||
|  |             ], | ||||||
|  |             "index": "pypi", | ||||||
|  |             "version": "==11.3.3" | ||||||
|  |         }, | ||||||
|         "pathtools": { |         "pathtools": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", |                 "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", | ||||||
| @@ -220,6 +376,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.3.0" |             "version": "==2.3.0" | ||||||
|         }, |         }, | ||||||
|  |         "pdfminer.six": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a", | ||||||
|  |                 "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '3.4'", | ||||||
|  |             "version": "==20201018" | ||||||
|  |         }, | ||||||
|         "pdftotext": { |         "pdftotext": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" |                 "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" | ||||||
| @@ -227,6 +391,33 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.1.5" |             "version": "==2.1.5" | ||||||
|         }, |         }, | ||||||
|  |         "pikepdf": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:0829bd5dacd73bb4a37e7575bae523f49603479755563c92ddb55c206700cab1", | ||||||
|  |                 "sha256:0d2b631077cd6af6e4d1b396208020705842610a6f13fab489d5f9c47916baa2", | ||||||
|  |                 "sha256:21c98af08fae4ac9fbcad02b613b6768a4ca300fda4cba867f4a4b6f73c2d04b", | ||||||
|  |                 "sha256:2240372fed30124ddc35b0c15a613f2b687a426ea2f150091e0a0c58cca7a495", | ||||||
|  |                 "sha256:2a97f5f1403e058d217d7f6861cf51fca200c5687bce0d052f5f2fa89b5bfa22", | ||||||
|  |                 "sha256:3faaefca0ae80d19891acec8b0dd5e6235f59f2206d82375eb80d090285e9557", | ||||||
|  |                 "sha256:48ef45b64882901c0d69af3b85d16a19bd0f3e95b43e614fefb53521d8caf36c", | ||||||
|  |                 "sha256:5212fe41f2323fc7356ba67caa39737fe13080562cff37bcbb74a8094076c8d0", | ||||||
|  |                 "sha256:56859c32170663c57bd0658189ce44e180533eebe813853446cd6413810be9eb", | ||||||
|  |                 "sha256:5f8fd1cb3478c5534222018aca24fbbd2bc74460c899bda988ec76722c13caa9", | ||||||
|  |                 "sha256:74300a32c41b3d578772f6933f23a88b19f74484185e71e5225ce2f7ea5aea78", | ||||||
|  |                 "sha256:8cbc946bdd217148f4a9c029fcea62f4ae0f67d5346de4c865f4718cd0ddc37f", | ||||||
|  |                 "sha256:9ceefd30076f732530cf84a1be2ecb2fa9931af932706ded760a6d37c73b96ad", | ||||||
|  |                 "sha256:ad69c170fda41b07a4c6b668a3128e7a759f50d9aebcfcde0ccff1358abe0423", | ||||||
|  |                 "sha256:b715fe182189fb6870fab5b0383bb2fb278c88c46eade346b0f4c1ed8818c09d", | ||||||
|  |                 "sha256:bb01ecf95083ffcb9ad542dc5342ccc1059e46f1395fd966629d36d9cc766b4a", | ||||||
|  |                 "sha256:bd6328547219cf48cefb4e0a1bc54442910594de1c5a5feae847d9ff3c629031", | ||||||
|  |                 "sha256:edb128379bb1dea76b5bdbdacf5657a6e4754bacc2049640762725590d8ed905", | ||||||
|  |                 "sha256:f8e687900557fcd4c51b4e72b9e337fdae9e2c81049d1d80b624bb2e88b5769d", | ||||||
|  |                 "sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52", | ||||||
|  |                 "sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version < '3.9'", | ||||||
|  |             "version": "==2.2.0" | ||||||
|  |         }, | ||||||
|         "pillow": { |         "pillow": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", |                 "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", | ||||||
| @@ -262,6 +453,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==8.0.1" |             "version": "==8.0.1" | ||||||
|         }, |         }, | ||||||
|  |         "pluggy": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", | ||||||
|  |                 "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|  |             "version": "==0.13.1" | ||||||
|  |         }, | ||||||
|         "psycopg2-binary": { |         "psycopg2-binary": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", |                 "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", | ||||||
| @@ -305,13 +504,13 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.8.6" |             "version": "==2.8.6" | ||||||
|         }, |         }, | ||||||
|         "pyocr": { |         "pycparser": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179", |                 "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", | ||||||
|                 "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301" |                 "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" | ||||||
|             ], |             ], | ||||||
|             "index": "pypi", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==0.7.2" |             "version": "==2.20" | ||||||
|         }, |         }, | ||||||
|         "python-dateutil": { |         "python-dateutil": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
| @@ -419,6 +618,53 @@ | |||||||
|             ], |             ], | ||||||
|             "version": "==2020.11.13" |             "version": "==2020.11.13" | ||||||
|         }, |         }, | ||||||
|  |         "reportlab": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540", | ||||||
|  |                 "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c", | ||||||
|  |                 "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030", | ||||||
|  |                 "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f", | ||||||
|  |                 "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5", | ||||||
|  |                 "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e", | ||||||
|  |                 "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89", | ||||||
|  |                 "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2", | ||||||
|  |                 "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f", | ||||||
|  |                 "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969", | ||||||
|  |                 "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53", | ||||||
|  |                 "sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d", | ||||||
|  |                 "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d", | ||||||
|  |                 "sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457", | ||||||
|  |                 "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec", | ||||||
|  |                 "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f", | ||||||
|  |                 "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778", | ||||||
|  |                 "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92", | ||||||
|  |                 "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6", | ||||||
|  |                 "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077", | ||||||
|  |                 "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673", | ||||||
|  |                 "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b", | ||||||
|  |                 "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48", | ||||||
|  |                 "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95", | ||||||
|  |                 "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94", | ||||||
|  |                 "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589", | ||||||
|  |                 "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9", | ||||||
|  |                 "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8", | ||||||
|  |                 "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d", | ||||||
|  |                 "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32", | ||||||
|  |                 "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b", | ||||||
|  |                 "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a", | ||||||
|  |                 "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2", | ||||||
|  |                 "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967", | ||||||
|  |                 "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5", | ||||||
|  |                 "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437", | ||||||
|  |                 "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8", | ||||||
|  |                 "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab", | ||||||
|  |                 "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e", | ||||||
|  |                 "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a", | ||||||
|  |                 "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8", | ||||||
|  |                 "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08" | ||||||
|  |             ], | ||||||
|  |             "version": "==3.5.55" | ||||||
|  |         }, | ||||||
|         "scikit-learn": { |         "scikit-learn": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", |                 "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", | ||||||
| @@ -482,6 +728,13 @@ | |||||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==1.15.0" |             "version": "==1.15.0" | ||||||
|         }, |         }, | ||||||
|  |         "sortedcontainers": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f", | ||||||
|  |                 "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1" | ||||||
|  |             ], | ||||||
|  |             "version": "==2.3.0" | ||||||
|  |         }, | ||||||
|         "sqlparse": { |         "sqlparse": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", |                 "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", | ||||||
| @@ -498,6 +751,14 @@ | |||||||
|             "markers": "python_version >= '3.5'", |             "markers": "python_version >= '3.5'", | ||||||
|             "version": "==2.1.0" |             "version": "==2.1.0" | ||||||
|         }, |         }, | ||||||
|  |         "tqdm": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:5c0d04e06ccc0da1bd3fa5ae4550effcce42fcad947b4a6cafa77bdc9b09ff22", | ||||||
|  |                 "sha256:9e7b8ab0ecbdbf0595adadd5f0ebbb9e69010e0bd48bbb0c15e550bf2a5292df" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|  |             "version": "==4.54.0" | ||||||
|  |         }, | ||||||
|         "tzlocal": { |         "tzlocal": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", |                 "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", | ||||||
| @@ -589,6 +850,7 @@ | |||||||
|                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", |                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", | ||||||
|                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" |                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" | ||||||
|             ], |             ], | ||||||
|  |             "markers": "python_version >= '3.1'", | ||||||
|             "version": "==3.0.4" |             "version": "==3.0.4" | ||||||
|         }, |         }, | ||||||
|         "coverage": { |         "coverage": { | ||||||
| @@ -711,22 +973,6 @@ | |||||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==1.2.0" |             "version": "==1.2.0" | ||||||
|         }, |         }, | ||||||
|         "importlib-metadata": { |  | ||||||
|             "hashes": [ |  | ||||||
|                 "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132", |  | ||||||
|                 "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c" |  | ||||||
|             ], |  | ||||||
|             "markers": "python_version < '3.8'", |  | ||||||
|             "version": "==2.1.0" |  | ||||||
|         }, |  | ||||||
|         "importlib-resources": { |  | ||||||
|             "hashes": [ |  | ||||||
|                 "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592", |  | ||||||
|                 "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5" |  | ||||||
|             ], |  | ||||||
|             "markers": "python_version < '3.7'", |  | ||||||
|             "version": "==3.3.0" |  | ||||||
|         }, |  | ||||||
|         "iniconfig": { |         "iniconfig": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", |                 "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", | ||||||
| @@ -1038,14 +1284,6 @@ | |||||||
|             ], |             ], | ||||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==20.2.1" |             "version": "==20.2.1" | ||||||
|         }, |  | ||||||
|         "zipp": { |  | ||||||
|             "hashes": [ |  | ||||||
|                 "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108", |  | ||||||
|                 "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb" |  | ||||||
|             ], |  | ||||||
|             "markers": "python_version < '3.8'", |  | ||||||
|             "version": "==3.4.0" |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -152,6 +152,117 @@ PAPERLESS_AUTO_LOGIN_USERNAME=<username> | |||||||
|  |  | ||||||
|     Defaults to none, which disables this feature. |     Defaults to none, which disables this feature. | ||||||
|  |  | ||||||
|  | OCR settings | ||||||
|  | ############ | ||||||
|  |  | ||||||
|  | Paperless uses `OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/>`_ for | ||||||
|  | performing OCR on documents and images. Paperless uses sensible defaults for | ||||||
|  | most settings, but all of them can be configured to your needs. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_LANGUAGE=<lang> | ||||||
|  |     Customize the language that paperless will attempt to use when | ||||||
|  |     parsing documents. | ||||||
|  |  | ||||||
|  |     It should be a 3-letter language code consistent with ISO | ||||||
|  |     639: https://www.loc.gov/standards/iso639-2/php/code_list.php | ||||||
|  |  | ||||||
|  |     Set this to the language most of your documents are written in. | ||||||
|  |  | ||||||
|  |     This can be a combination of multiple languages such as ``deu+eng``, | ||||||
|  |     in which case tesseract will use whatever language matches best. | ||||||
|  |     Keep in mind that tesseract uses much more cpu time with multiple | ||||||
|  |     languages enabled. | ||||||
|  |  | ||||||
|  |     Defaults to "eng". | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_MODE=<mode> | ||||||
|  |     Tell paperless when and how to perform OCR on your documents. Four modes | ||||||
|  |     are available: | ||||||
|  |  | ||||||
|  |     *   ``skip``: Paperless skips all pages and will perform OCR only on pages | ||||||
|  |         where no text is present. This is the safest and fastest option. | ||||||
|  |     *   ``skip_noarchive``: In addition to skip, paperless won't create an | ||||||
|  |         archived version of your documents when it finds any text in them. | ||||||
|  |     *   ``redo``: Paperless will OCR all pages of your documents and attempt to | ||||||
|  |         replace any existing text layers with new text. This will be useful for | ||||||
|  |         documents from scanners that already performed OCR with insufficient | ||||||
|  |         results. It will also perform OCR on purely digital documents. | ||||||
|  |  | ||||||
|  |         This option may fail on some documents that have features that cannot | ||||||
|  |         be removed, such as forms. In this case, the text from the document is | ||||||
|  |         used instead. | ||||||
|  |     *   ``force``: Paperless rasterizes your documents, converting any text | ||||||
|  |         into images and puts the OCRed text on top. This works for all documents, | ||||||
|  |         however, the resulting document may be significantly larger and text | ||||||
|  |         won't appear as sharp when zoomed in. | ||||||
|  |      | ||||||
|  |     The default is ``skip``, which only performs OCR when necessary. | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_OUTPUT_TYPE=<type> | ||||||
|  |     Specify the type of PDF documents that paperless should produce. | ||||||
|  |      | ||||||
|  |     *   ``pdf``: Modify the PDF document as little as possible. | ||||||
|  |     *   ``pdfa``: Convert PDF documents into PDF/A-2b documents, which is a | ||||||
|  |         subset of the entire PDF specification and meant for storing | ||||||
|  |         documents long term. | ||||||
|  |     *   ``pdfa-1``, ``pdfa-2``, ``pdfa-3`` to specify the exact version of | ||||||
|  |         PDF/A you wish to use. | ||||||
|  |      | ||||||
|  |     If not specified, ``pdfa`` is used. Remember that paperless also keeps | ||||||
|  |     the original input file as well as the archived version. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_PAGES=<num> | ||||||
|  |     Tells paperless to use only the specified number of pages for OCR. Documents | ||||||
|  |     with fewer than the specified number of pages get OCR'ed completely. | ||||||
|  |  | ||||||
|  |     Specifying 1 here will only use the first page. | ||||||
|  |  | ||||||
|  |     When combined with ``PAPERLESS_OCR_MODE=redo`` or ``PAPERLESS_OCR_MODE=force``, | ||||||
|  |     paperless will not modify any text it finds on excluded pages and will | ||||||
|  |     copy it verbatim. | ||||||
|  |  | ||||||
|  |     Defaults to 0, which disables this feature and always uses all pages. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_IMAGE_DPI=<num> | ||||||
|  |     Paperless will OCR any images you put into the system and convert them | ||||||
|  |     into PDF documents. This is useful if your scanner produces images. | ||||||
|  |     In order to do so, paperless needs to know the DPI of the image. | ||||||
|  |     Most images from scanners will have this information embedded and | ||||||
|  |     paperless will detect and use that information. In case this fails, it | ||||||
|  |     uses this value as a fallback. | ||||||
|  |  | ||||||
|  |     Set this to the DPI your scanner produces images at. | ||||||
|  |  | ||||||
|  |     Default is none, which causes paperless to fail if no DPI information is | ||||||
|  |     present in an image. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_USER_ARG=<json> | ||||||
|  |     OCRmyPDF offers many more options. Use this parameter to specify any | ||||||
|  |     additional arguments you wish to pass to OCRmyPDF. Since Paperless uses | ||||||
|  |     the API of OCRmyPDF, you have to specify these in a format that can be | ||||||
|  |     passed to the API. See https://ocrmypdf.readthedocs.io/en/latest/api.html#reference | ||||||
|  |     for valid parameters. All command line options are supported, but they | ||||||
|  |     use underscores instead of dashes. | ||||||
|  |  | ||||||
|  |     .. caution:: | ||||||
|  |  | ||||||
|  |         Paperless has been tested to work with the OCR options provided | ||||||
|  |         above. There are many options that are incompatible with each other, | ||||||
|  |         so specifying invalid options may prevent paperless from consuming | ||||||
|  |         any documents. | ||||||
|  |  | ||||||
|  |     Specify arguments as a JSON dictionary. Keep note of lower case booleans | ||||||
|  |     and double quoted parameter names and strings. Examples: | ||||||
|  |  | ||||||
|  |     .. code:: json | ||||||
|  |  | ||||||
|  |         {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}     | ||||||
|  |      | ||||||
|  |      | ||||||
| Software tweaks | Software tweaks | ||||||
| ############### | ############### | ||||||
|  |  | ||||||
| @@ -193,37 +304,6 @@ PAPERLESS_TIME_ZONE=<timezone> | |||||||
|     Defaults to UTC. |     Defaults to UTC. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| PAPERLESS_OCR_PAGES=<num> |  | ||||||
|     Tells paperless to use only the specified amount of pages for OCR. Documents |  | ||||||
|     with less than the specified amount of pages get OCR'ed completely. |  | ||||||
|  |  | ||||||
|     Specifying 1 here will only use the first page. |  | ||||||
|  |  | ||||||
|     Defaults to 0, which disables this feature and always uses all pages. |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| PAPERLESS_OCR_LANGUAGE=<lang> |  | ||||||
|     Customize the default language that tesseract will attempt to use when |  | ||||||
|     parsing documents. The default language is used whenever |  | ||||||
|  |  | ||||||
|     * No language could be detected on a document |  | ||||||
|     * No tesseract data files are available for the detected language |  | ||||||
|  |  | ||||||
|     It should be a 3-letter language code consistent with ISO |  | ||||||
|     639: https://www.loc.gov/standards/iso639-2/php/code_list.php |  | ||||||
|  |  | ||||||
|     Set this to the language most of your documents are written in. |  | ||||||
|  |  | ||||||
|     Defaults to "eng". |  | ||||||
|  |  | ||||||
| PAPERLESS_OCR_ALWAYS=<bool> |  | ||||||
|     By default Paperless does not OCR a document if the text can be retrieved from |  | ||||||
|     the document directly. Set to true to always OCR documents. |  | ||||||
|  |  | ||||||
|     Defaults to false. |  | ||||||
|  |  | ||||||
| PAPERLESS_CONSUMER_POLLING=<num> | PAPERLESS_CONSUMER_POLLING=<num> | ||||||
|     If paperless won't find documents added to your consume folder, it might |     If paperless won't find documents added to your consume folder, it might | ||||||
|     not be able to automatically detect filesystem changes. In that case, |     not be able to automatically detect filesystem changes. In that case, | ||||||
| @@ -261,18 +341,6 @@ PAPERLESS_CONVERT_TMPDIR=<path> | |||||||
|  |  | ||||||
|     Default is none, which disables the temporary directory. |     Default is none, which disables the temporary directory. | ||||||
|  |  | ||||||
| PAPERLESS_CONVERT_DENSITY=<num> |  | ||||||
|     This setting has a high impact on the physical size of tmp page files, |  | ||||||
|     the speed of document conversion, and can affect the accuracy of OCR |  | ||||||
|     results. Individual results can vary and this setting should be tested |  | ||||||
|     thoroughly against the documents you are importing to see if it has any |  | ||||||
|     impacts either negative or positive. |  | ||||||
|     Testing on limited document sets has shown a setting of 200 can cut the |  | ||||||
|     size of tmp files by 1/3, and speed up conversion by up to 4x |  | ||||||
|     with little impact to OCR accuracy. |  | ||||||
|  |  | ||||||
|     Default is 300. |  | ||||||
|  |  | ||||||
| PAPERLESS_OPTIMIZE_THUMBNAILS=<bool> | PAPERLESS_OPTIMIZE_THUMBNAILS=<bool> | ||||||
|     Use optipng to optimize thumbnails. This usually reduces the size of |     Use optipng to optimize thumbnails. This usually reduces the size of | ||||||
|     thumbnails by about 20%, but uses considerable compute time during |     thumbnails by about 20%, but uses considerable compute time during | ||||||
| @@ -319,8 +387,5 @@ PAPERLESS_CONVERT_BINARY=<path> | |||||||
| PAPERLESS_GS_BINARY=<path> | PAPERLESS_GS_BINARY=<path> | ||||||
|     Defaults to "/usr/bin/gs". |     Defaults to "/usr/bin/gs". | ||||||
|  |  | ||||||
| PAPERLESS_UNPAPER_BINARY=<path> |  | ||||||
|     Defaults to "/usr/bin/unpaper". |  | ||||||
|  |  | ||||||
| PAPERLESS_OPTIPNG_BINARY=<path> | PAPERLESS_OPTIPNG_BINARY=<path> | ||||||
|     Defaults to "/usr/bin/optipng". |     Defaults to "/usr/bin/optipng". | ||||||
|   | |||||||
| @@ -31,19 +31,24 @@ | |||||||
| #PAPERLESS_STATIC_URL=/static/ | #PAPERLESS_STATIC_URL=/static/ | ||||||
| #PAPERLESS_AUTO_LOGIN_USERNAME= | #PAPERLESS_AUTO_LOGIN_USERNAME= | ||||||
|  |  | ||||||
|  | # OCR settings | ||||||
|  |  | ||||||
|  | #PAPERLESS_OCR_LANGUAGE=eng | ||||||
|  | #PAPERLESS_OCR_MODE=skip | ||||||
|  | #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | ||||||
|  | #PAPERLESS_OCR_PAGES=1 | ||||||
|  | #PAPERLESS_OCR_IMAGE_DPI=300 | ||||||
|  | #PAPERLESS_OCR_USER_ARG={} | ||||||
|  | #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||||
|  | #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | ||||||
|  |  | ||||||
| # Software tweaks | # Software tweaks | ||||||
|  |  | ||||||
| #PAPERLESS_TASK_WORKERS=1 | #PAPERLESS_TASK_WORKERS=1 | ||||||
| #PAPERLESS_THREADS_PER_WORKER=1 | #PAPERLESS_THREADS_PER_WORKER=1 | ||||||
| #PAPERLESS_TIME_ZONE=UTC | #PAPERLESS_TIME_ZONE=UTC | ||||||
| #PAPERLESS_OCR_PAGES=1 |  | ||||||
| #PAPERLESS_OCR_LANGUAGE=eng |  | ||||||
| #PAPERLESS_OCR_ALWAYS=false |  | ||||||
| #PAPERLESS_CONSUMER_POLLING=10 | #PAPERLESS_CONSUMER_POLLING=10 | ||||||
| #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | ||||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 |  | ||||||
| #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless |  | ||||||
| #PAPERLESS_CONVERT_DENSITY=300 |  | ||||||
| #PAPERLESS_OPTIMIZE_THUMBNAILS=true | #PAPERLESS_OPTIMIZE_THUMBNAILS=true | ||||||
| #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | ||||||
| #PAPERLESS_FILENAME_DATE_ORDER=YMD | #PAPERLESS_FILENAME_DATE_ORDER=YMD | ||||||
| @@ -53,5 +58,4 @@ | |||||||
|  |  | ||||||
| #PAPERLESS_CONVERT_BINARY=/usr/bin/convert | #PAPERLESS_CONVERT_BINARY=/usr/bin/convert | ||||||
| #PAPERLESS_GS_BINARY=/usr/bin/gs | #PAPERLESS_GS_BINARY=/usr/bin/gs | ||||||
| #PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper |  | ||||||
| #PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng | #PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng | ||||||
|   | |||||||
| @@ -5,12 +5,26 @@ | |||||||
|         </svg> |         </svg> | ||||||
|         <span class="d-none d-lg-inline"> Delete</span> |         <span class="d-none d-lg-inline"> Delete</span> | ||||||
|     </button> |     </button> | ||||||
|     <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary mr-2"> |  | ||||||
|  |     <div class="btn-group mr-2"> | ||||||
|  |  | ||||||
|  |         <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary"> | ||||||
|             <svg class="buttonicon" fill="currentColor"> |             <svg class="buttonicon" fill="currentColor"> | ||||||
|                 <use xlink:href="assets/bootstrap-icons.svg#download" /> |                 <use xlink:href="assets/bootstrap-icons.svg#download" /> | ||||||
|             </svg> |             </svg> | ||||||
|             <span class="d-none d-lg-inline"> Download</span> |             <span class="d-none d-lg-inline"> Download</span> | ||||||
|         </a> |         </a> | ||||||
|  |      | ||||||
|  |         <div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version"> | ||||||
|  |           <button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button> | ||||||
|  |           <div class="dropdown-menu" ngbDropdownMenu> | ||||||
|  |             <a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a> | ||||||
|  |           </div> | ||||||
|  |         </div> | ||||||
|  |      | ||||||
|  |       </div> | ||||||
|  |  | ||||||
|  |  | ||||||
|     <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()"> |     <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()"> | ||||||
|         <svg class="buttonicon" fill="currentColor"> |         <svg class="buttonicon" fill="currentColor"> | ||||||
|             <use xlink:href="assets/bootstrap-icons.svg#x" /> |             <use xlink:href="assets/bootstrap-icons.svg#x" /> | ||||||
|   | |||||||
| @@ -4,6 +4,7 @@ import { ActivatedRoute, Router } from '@angular/router'; | |||||||
| import { NgbModal } from '@ng-bootstrap/ng-bootstrap'; | import { NgbModal } from '@ng-bootstrap/ng-bootstrap'; | ||||||
| import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent'; | import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent'; | ||||||
| import { PaperlessDocument } from 'src/app/data/paperless-document'; | import { PaperlessDocument } from 'src/app/data/paperless-document'; | ||||||
|  | import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata'; | ||||||
| import { PaperlessDocumentType } from 'src/app/data/paperless-document-type'; | import { PaperlessDocumentType } from 'src/app/data/paperless-document-type'; | ||||||
| import { DocumentListViewService } from 'src/app/services/document-list-view.service'; | import { DocumentListViewService } from 'src/app/services/document-list-view.service'; | ||||||
| import { OpenDocumentsService } from 'src/app/services/open-documents.service'; | import { OpenDocumentsService } from 'src/app/services/open-documents.service'; | ||||||
| @@ -23,9 +24,11 @@ export class DocumentDetailComponent implements OnInit { | |||||||
|  |  | ||||||
|   documentId: number |   documentId: number | ||||||
|   document: PaperlessDocument |   document: PaperlessDocument | ||||||
|  |   metadata: PaperlessDocumentMetadata | ||||||
|   title: string |   title: string | ||||||
|   previewUrl: string |   previewUrl: string | ||||||
|   downloadUrl: string |   downloadUrl: string | ||||||
|  |   downloadOriginalUrl: string | ||||||
|  |  | ||||||
|   correspondents: PaperlessCorrespondent[] |   correspondents: PaperlessCorrespondent[] | ||||||
|   documentTypes: PaperlessDocumentType[] |   documentTypes: PaperlessDocumentType[] | ||||||
| @@ -62,6 +65,7 @@ export class DocumentDetailComponent implements OnInit { | |||||||
|       this.documentId = +paramMap.get('id') |       this.documentId = +paramMap.get('id') | ||||||
|       this.previewUrl = this.documentsService.getPreviewUrl(this.documentId) |       this.previewUrl = this.documentsService.getPreviewUrl(this.documentId) | ||||||
|       this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId) |       this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId) | ||||||
|  |       this.downloadOriginalUrl = this.documentsService.getDownloadUrl(this.documentId, true) | ||||||
|       if (this.openDocumentService.getOpenDocument(this.documentId)) { |       if (this.openDocumentService.getOpenDocument(this.documentId)) { | ||||||
|         this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId)) |         this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId)) | ||||||
|       } else { |       } else { | ||||||
| @@ -76,6 +80,9 @@ export class DocumentDetailComponent implements OnInit { | |||||||
|  |  | ||||||
|   updateComponent(doc: PaperlessDocument) { |   updateComponent(doc: PaperlessDocument) { | ||||||
|     this.document = doc |     this.document = doc | ||||||
|  |     this.documentsService.getMetadata(doc.id).subscribe(result => { | ||||||
|  |       this.metadata = result | ||||||
|  |     }) | ||||||
|     this.title = doc.title |     this.title = doc.title | ||||||
|     this.documentForm.patchValue(doc) |     this.documentForm.patchValue(doc) | ||||||
|   } |   } | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								src-ui/src/app/data/paperless-document-metadata.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,11 @@ | |||||||
|  | export interface PaperlessDocumentMetadata { | ||||||
|  |      | ||||||
|  |   paperless__checksum?: string | ||||||
|  |  | ||||||
|  |   paperless__mime_type?: string | ||||||
|  |  | ||||||
|  |   paperless__filename?: string | ||||||
|  |  | ||||||
|  |   paperless__has_archive_version?: boolean | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -1,5 +1,6 @@ | |||||||
| import { Injectable } from '@angular/core'; | import { Injectable } from '@angular/core'; | ||||||
| import { PaperlessDocument } from 'src/app/data/paperless-document'; | import { PaperlessDocument } from 'src/app/data/paperless-document'; | ||||||
|  | import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata'; | ||||||
| import { AbstractPaperlessService } from './abstract-paperless-service'; | import { AbstractPaperlessService } from './abstract-paperless-service'; | ||||||
| import { HttpClient } from '@angular/common/http'; | import { HttpClient } from '@angular/common/http'; | ||||||
| import { Observable } from 'rxjs'; | import { Observable } from 'rxjs'; | ||||||
| @@ -50,20 +51,32 @@ export class DocumentService extends AbstractPaperlessService<PaperlessDocument> | |||||||
|     return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)) |     return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   getPreviewUrl(id: number): string { |   getPreviewUrl(id: number, original: boolean = false): string { | ||||||
|     return this.getResourceUrl(id, 'preview') |     let url = this.getResourceUrl(id, 'preview') | ||||||
|  |     if (original) { | ||||||
|  |       url += "?original=true" | ||||||
|  |     } | ||||||
|  |     return url | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   getThumbUrl(id: number): string { |   getThumbUrl(id: number): string { | ||||||
|     return this.getResourceUrl(id, 'thumb') |     return this.getResourceUrl(id, 'thumb') | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   getDownloadUrl(id: number): string { |   getDownloadUrl(id: number, original: boolean = false): string { | ||||||
|     return this.getResourceUrl(id, 'download') |     let url = this.getResourceUrl(id, 'download') | ||||||
|  |     if (original) { | ||||||
|  |       url += "?original=true" | ||||||
|  |     } | ||||||
|  |     return url | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   uploadDocument(formData) { |   uploadDocument(formData) { | ||||||
|     return this.http.post(this.getResourceUrl(null, 'post_document'), formData) |     return this.http.post(this.getResourceUrl(null, 'post_document'), formData) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   getMetadata(id: number): Observable<PaperlessDocumentMetadata> { | ||||||
|  |     return this.http.get<PaperlessDocumentMetadata>(this.getResourceUrl(id, 'metadata')) | ||||||
|  |   } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ import os | |||||||
| import magic | import magic | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.db import transaction | from django.db import transaction | ||||||
|  | from django.db.models import Q | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
|  |  | ||||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||||
| @@ -13,7 +14,7 @@ from .file_handling import create_source_path_directory | |||||||
| from .loggers import LoggingMixin | from .loggers import LoggingMixin | ||||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||||
| from .parsers import ParseError, get_parser_class_for_mime_type, \ | from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||||
|     get_supported_file_extensions |     get_supported_file_extensions, parse_date | ||||||
| from .signals import ( | from .signals import ( | ||||||
|     document_consumption_finished, |     document_consumption_finished, | ||||||
|     document_consumption_started |     document_consumption_started | ||||||
| @@ -58,7 +59,7 @@ class Consumer(LoggingMixin): | |||||||
|     def pre_check_duplicate(self): |     def pre_check_duplicate(self): | ||||||
|         with open(self.path, "rb") as f: |         with open(self.path, "rb") as f: | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |             checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|         if Document.objects.filter(checksum=checksum).exists(): |         if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501 | ||||||
|             if settings.CONSUMER_DELETE_DUPLICATES: |             if settings.CONSUMER_DELETE_DUPLICATES: | ||||||
|                 os.unlink(self.path) |                 os.unlink(self.path) | ||||||
|             raise ConsumerError( |             raise ConsumerError( | ||||||
| @@ -69,6 +70,7 @@ class Consumer(LoggingMixin): | |||||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) |         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) |         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) |         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||||
|  |         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||||
|  |  | ||||||
|     def try_consume_file(self, |     def try_consume_file(self, | ||||||
|                          path, |                          path, | ||||||
| @@ -124,7 +126,7 @@ class Consumer(LoggingMixin): | |||||||
|  |  | ||||||
|         # This doesn't parse the document yet, but gives us a parser. |         # This doesn't parse the document yet, but gives us a parser. | ||||||
|  |  | ||||||
|         document_parser = parser_class(self.path, self.logging_group) |         document_parser = parser_class(self.logging_group) | ||||||
|  |  | ||||||
|         # However, this already created working directories which we have to |         # However, this already created working directories which we have to | ||||||
|         # clean up. |         # clean up. | ||||||
| @@ -132,13 +134,24 @@ class Consumer(LoggingMixin): | |||||||
|         # Parse the document. This may take some time. |         # Parse the document. This may take some time. | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") |  | ||||||
|             thumbnail = document_parser.get_optimised_thumbnail() |  | ||||||
|             self.log("debug", "Parsing {}...".format(self.filename)) |             self.log("debug", "Parsing {}...".format(self.filename)) | ||||||
|  |             document_parser.parse(self.path, mime_type) | ||||||
|  |  | ||||||
|  |             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||||
|  |             thumbnail = document_parser.get_optimised_thumbnail( | ||||||
|  |                 self.path, mime_type) | ||||||
|  |  | ||||||
|             text = document_parser.get_text() |             text = document_parser.get_text() | ||||||
|             date = document_parser.get_date() |             date = document_parser.get_date() | ||||||
|  |             if not date: | ||||||
|  |                 date = parse_date(self.filename, text) | ||||||
|  |             archive_path = document_parser.get_archive_path() | ||||||
|  |  | ||||||
|         except ParseError as e: |         except ParseError as e: | ||||||
|             document_parser.cleanup() |             document_parser.cleanup() | ||||||
|  |             self.log( | ||||||
|  |                 "error", | ||||||
|  |                 f"Error while consuming document {self.filename}: {e}") | ||||||
|             raise ConsumerError(e) |             raise ConsumerError(e) | ||||||
|  |  | ||||||
|         # Prepare the document classifier. |         # Prepare the document classifier. | ||||||
| @@ -180,9 +193,24 @@ class Consumer(LoggingMixin): | |||||||
|                 # After everything is in the database, copy the files into |                 # After everything is in the database, copy the files into | ||||||
|                 # place. If this fails, we'll also rollback the transaction. |                 # place. If this fails, we'll also rollback the transaction. | ||||||
|  |  | ||||||
|  |                 # TODO: not required, since this is done by the file handling | ||||||
|  |                 #  logic | ||||||
|                 create_source_path_directory(document.source_path) |                 create_source_path_directory(document.source_path) | ||||||
|                 self._write(document, self.path, document.source_path) |  | ||||||
|                 self._write(document, thumbnail, document.thumbnail_path) |                 self._write(document.storage_type, | ||||||
|  |                             self.path, document.source_path) | ||||||
|  |  | ||||||
|  |                 self._write(document.storage_type, | ||||||
|  |                             thumbnail, document.thumbnail_path) | ||||||
|  |  | ||||||
|  |                 if archive_path and os.path.isfile(archive_path): | ||||||
|  |                     self._write(document.storage_type, | ||||||
|  |                                 archive_path, document.archive_path) | ||||||
|  |  | ||||||
|  |                     with open(archive_path, 'rb') as f: | ||||||
|  |                         document.archive_checksum = hashlib.md5( | ||||||
|  |                             f.read()).hexdigest() | ||||||
|  |                         document.save() | ||||||
|  |  | ||||||
|                 # Afte performing all database operations and moving files |                 # Afte performing all database operations and moving files | ||||||
|                 # into place, tell paperless where the file is. |                 # into place, tell paperless where the file is. | ||||||
| @@ -195,6 +223,11 @@ class Consumer(LoggingMixin): | |||||||
|                 self.log("debug", "Deleting file {}".format(self.path)) |                 self.log("debug", "Deleting file {}".format(self.path)) | ||||||
|                 os.unlink(self.path) |                 os.unlink(self.path) | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|  |             self.log( | ||||||
|  |                 "error", | ||||||
|  |                 f"The following error occured while consuming " | ||||||
|  |                 f"{self.filename}: {e}" | ||||||
|  |             ) | ||||||
|             raise ConsumerError(e) |             raise ConsumerError(e) | ||||||
|         finally: |         finally: | ||||||
|             document_parser.cleanup() |             document_parser.cleanup() | ||||||
| @@ -259,7 +292,7 @@ class Consumer(LoggingMixin): | |||||||
|             for tag_id in self.override_tag_ids: |             for tag_id in self.override_tag_ids: | ||||||
|                 document.tags.add(Tag.objects.get(pk=tag_id)) |                 document.tags.add(Tag.objects.get(pk=tag_id)) | ||||||
|  |  | ||||||
|     def _write(self, document, source, target): |     def _write(self, storage_type, source, target): | ||||||
|         with open(source, "rb") as read_file: |         with open(source, "rb") as read_file: | ||||||
|             with open(target, "wb") as write_file: |             with open(target, "wb") as write_file: | ||||||
|                 write_file.write(read_file.read()) |                 write_file.write(read_file.read()) | ||||||
|   | |||||||
| @@ -10,10 +10,13 @@ def create_source_path_directory(source_path): | |||||||
|     os.makedirs(os.path.dirname(source_path), exist_ok=True) |     os.makedirs(os.path.dirname(source_path), exist_ok=True) | ||||||
|  |  | ||||||
|  |  | ||||||
| def delete_empty_directories(directory): | def delete_empty_directories(directory, root): | ||||||
|  |     if not os.path.isdir(directory): | ||||||
|  |         return | ||||||
|  |  | ||||||
|     # Go up in the directory hierarchy and try to delete all directories |     # Go up in the directory hierarchy and try to delete all directories | ||||||
|     directory = os.path.normpath(directory) |     directory = os.path.normpath(directory) | ||||||
|     root = os.path.normpath(settings.ORIGINALS_DIR) |     root = os.path.normpath(root) | ||||||
|  |  | ||||||
|     if not directory.startswith(root + os.path.sep): |     if not directory.startswith(root + os.path.sep): | ||||||
|         # don't do anything outside our originals folder. |         # don't do anything outside our originals folder. | ||||||
| @@ -101,3 +104,8 @@ def generate_filename(doc): | |||||||
|         filename += ".gpg" |         filename += ".gpg" | ||||||
|  |  | ||||||
|     return filename |     return filename | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def archive_name_from_filename(filename): | ||||||
|  |  | ||||||
|  |     return os.path.splitext(filename)[0] + ".pdf" | ||||||
|   | |||||||
							
								
								
									
										89
									
								
								src/documents/management/commands/document_archiver.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,89 @@ | |||||||
|  | import hashlib | ||||||
|  | import multiprocessing | ||||||
|  |  | ||||||
|  | import logging | ||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  | import uuid | ||||||
|  |  | ||||||
|  | from django.conf import settings | ||||||
|  | from django.core.management.base import BaseCommand | ||||||
|  | from whoosh.writing import AsyncWriter | ||||||
|  |  | ||||||
|  | from documents.models import Document | ||||||
|  | from ... import index | ||||||
|  | from ...mixins import Renderable | ||||||
|  | from ...parsers import get_parser_class_for_mime_type | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def handle_document(document): | ||||||
|  |     mime_type = document.mime_type | ||||||
|  |  | ||||||
|  |     parser_class = get_parser_class_for_mime_type(mime_type) | ||||||
|  |  | ||||||
|  |     parser = parser_class(logging_group=uuid.uuid4()) | ||||||
|  |     parser.parse(document.source_path, mime_type) | ||||||
|  |     if parser.get_archive_path(): | ||||||
|  |         shutil.copy(parser.get_archive_path(), document.archive_path) | ||||||
|  |         with document.archive_file as f: | ||||||
|  |             document.archive_checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|  |     else: | ||||||
|  |         logging.getLogger(__name__).warning( | ||||||
|  |             f"Parser {parser} did not produce an archived document " | ||||||
|  |             f"for {document.file_name}" | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     if parser.get_text(): | ||||||
|  |         document.content = parser.get_text() | ||||||
|  |     document.save() | ||||||
|  |  | ||||||
|  |     parser.cleanup() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Command(Renderable, BaseCommand): | ||||||
|  |  | ||||||
|  |     help = """ | ||||||
|  |         Using the current classification model, assigns correspondents, tags | ||||||
|  |         and document types to all documents, effectively allowing you to | ||||||
|  |         back-tag all previously indexed documents with metadata created (or | ||||||
|  |         modified) after their initial import. | ||||||
|  |     """.replace("    ", "") | ||||||
|  |  | ||||||
|  |     def __init__(self, *args, **kwargs): | ||||||
|  |         self.verbosity = 0 | ||||||
|  |         BaseCommand.__init__(self, *args, **kwargs) | ||||||
|  |  | ||||||
|  |     def add_arguments(self, parser): | ||||||
|  |         parser.add_argument( | ||||||
|  |             "-f", "--overwrite", | ||||||
|  |             default=False, | ||||||
|  |             action="store_true", | ||||||
|  |             help="Recreates the archived document for documents that already " | ||||||
|  |                  "have an archived version." | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def handle(self, *args, **options): | ||||||
|  |  | ||||||
|  |         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||||
|  |  | ||||||
|  |         overwrite = options["overwrite"] | ||||||
|  |  | ||||||
|  |         documents = Document.objects.all() | ||||||
|  |  | ||||||
|  |         documents_to_process = filter( | ||||||
|  |             lambda d: overwrite or not os.path.exists(d.archive_path), | ||||||
|  |             documents | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: | ||||||
|  |             list( | ||||||
|  |                 pool.imap( | ||||||
|  |                     handle_document, | ||||||
|  |                     list(documents_to_process) | ||||||
|  |                 ) | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |         ix = index.open_index() | ||||||
|  |         with AsyncWriter(ix) as writer: | ||||||
|  |             for d in documents_to_process: | ||||||
|  |                 index.update_document(writer, d) | ||||||
| @@ -7,7 +7,8 @@ from django.core import serializers | |||||||
| from django.core.management.base import BaseCommand, CommandError | from django.core.management.base import BaseCommand, CommandError | ||||||
|  |  | ||||||
| from documents.models import Document, Correspondent, Tag, DocumentType | from documents.models import Document, Correspondent, Tag, DocumentType | ||||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||||
|  |     EXPORTER_ARCHIVE_NAME | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
| from ...mixins import Renderable | from ...mixins import Renderable | ||||||
|  |  | ||||||
| @@ -54,7 +55,6 @@ class Command(Renderable, BaseCommand): | |||||||
|             document = document_map[document_dict["pk"]] |             document = document_map[document_dict["pk"]] | ||||||
|  |  | ||||||
|             unique_filename = f"{document.pk:07}_{document.file_name}" |             unique_filename = f"{document.pk:07}_{document.file_name}" | ||||||
|  |  | ||||||
|             file_target = os.path.join(self.target, unique_filename) |             file_target = os.path.join(self.target, unique_filename) | ||||||
|  |  | ||||||
|             thumbnail_name = unique_filename + "-thumbnail.png" |             thumbnail_name = unique_filename + "-thumbnail.png" | ||||||
| @@ -63,6 +63,14 @@ class Command(Renderable, BaseCommand): | |||||||
|             document_dict[EXPORTER_FILE_NAME] = unique_filename |             document_dict[EXPORTER_FILE_NAME] = unique_filename | ||||||
|             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name |             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name | ||||||
|  |  | ||||||
|  |             if os.path.exists(document.archive_path): | ||||||
|  |                 archive_name = \ | ||||||
|  |                     f"{document.pk:07}_archive_{document.archive_file_name}" | ||||||
|  |                 archive_target = os.path.join(self.target, archive_name) | ||||||
|  |                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name | ||||||
|  |             else: | ||||||
|  |                 archive_target = None | ||||||
|  |  | ||||||
|             print(f"Exporting: {file_target}") |             print(f"Exporting: {file_target}") | ||||||
|  |  | ||||||
|             t = int(time.mktime(document.created.timetuple())) |             t = int(time.mktime(document.created.timetuple())) | ||||||
| @@ -76,11 +84,18 @@ class Command(Renderable, BaseCommand): | |||||||
|                     f.write(GnuPG.decrypted(document.thumbnail_file)) |                     f.write(GnuPG.decrypted(document.thumbnail_file)) | ||||||
|                     os.utime(thumbnail_target, times=(t, t)) |                     os.utime(thumbnail_target, times=(t, t)) | ||||||
|  |  | ||||||
|  |                 if archive_target: | ||||||
|  |                     with open(archive_target, "wb") as f: | ||||||
|  |                         f.write(GnuPG.decrypted(document.archive_path)) | ||||||
|  |                         os.utime(archive_target, times=(t, t)) | ||||||
|             else: |             else: | ||||||
|  |  | ||||||
|                 shutil.copy(document.source_path, file_target) |                 shutil.copy(document.source_path, file_target) | ||||||
|                 shutil.copy(document.thumbnail_path, thumbnail_target) |                 shutil.copy(document.thumbnail_path, thumbnail_target) | ||||||
|  |  | ||||||
|  |                 if archive_target: | ||||||
|  |                     shutil.copy(document.archive_path, archive_target) | ||||||
|  |  | ||||||
|         manifest += json.loads( |         manifest += json.loads( | ||||||
|             serializers.serialize("json", Correspondent.objects.all())) |             serializers.serialize("json", Correspondent.objects.all())) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -7,8 +7,8 @@ from django.core.management import call_command | |||||||
| from django.core.management.base import BaseCommand, CommandError | from django.core.management.base import BaseCommand, CommandError | ||||||
|  |  | ||||||
| from documents.models import Document | from documents.models import Document | ||||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||||
| from paperless.db import GnuPG |     EXPORTER_ARCHIVE_NAME | ||||||
| from ...file_handling import generate_filename, create_source_path_directory | from ...file_handling import generate_filename, create_source_path_directory | ||||||
| from ...mixins import Renderable | from ...mixins import Renderable | ||||||
|  |  | ||||||
| @@ -79,23 +79,41 @@ class Command(Renderable, BaseCommand): | |||||||
|                     'appear to be in the source directory.'.format(doc_file) |                     'appear to be in the source directory.'.format(doc_file) | ||||||
|                 ) |                 ) | ||||||
|  |  | ||||||
|  |             if EXPORTER_ARCHIVE_NAME in record: | ||||||
|  |                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||||
|  |                 if not os.path.exists(os.path.join(self.source, archive_file)): | ||||||
|  |                     raise CommandError( | ||||||
|  |                         f"The manifest file refers to {archive_file} which " | ||||||
|  |                         f"does not appear to be in the source directory." | ||||||
|  |                     ) | ||||||
|  |  | ||||||
|     def _import_files_from_manifest(self): |     def _import_files_from_manifest(self): | ||||||
|  |  | ||||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED |         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||||
|  |         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||||
|  |         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||||
|  |  | ||||||
|         for record in self.manifest: |         for record in self.manifest: | ||||||
|  |  | ||||||
|             if not record["model"] == "documents.document": |             if not record["model"] == "documents.document": | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             doc_file = record[EXPORTER_FILE_NAME] |  | ||||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] |  | ||||||
|             document = Document.objects.get(pk=record["pk"]) |             document = Document.objects.get(pk=record["pk"]) | ||||||
|  |  | ||||||
|  |             doc_file = record[EXPORTER_FILE_NAME] | ||||||
|             document_path = os.path.join(self.source, doc_file) |             document_path = os.path.join(self.source, doc_file) | ||||||
|  |  | ||||||
|  |             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||||
|             thumbnail_path = os.path.join(self.source, thumb_file) |             thumbnail_path = os.path.join(self.source, thumb_file) | ||||||
|  |  | ||||||
|             document.storage_type = storage_type |             if EXPORTER_ARCHIVE_NAME in record: | ||||||
|  |                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||||
|  |                 archive_path = os.path.join(self.source, archive_file) | ||||||
|  |             else: | ||||||
|  |                 archive_path = None | ||||||
|  |  | ||||||
|  |             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||||
|  |  | ||||||
|             document.filename = generate_filename(document) |             document.filename = generate_filename(document) | ||||||
|  |  | ||||||
|             if os.path.isfile(document.source_path): |             if os.path.isfile(document.source_path): | ||||||
| @@ -106,5 +124,7 @@ class Command(Renderable, BaseCommand): | |||||||
|             print(f"Moving {document_path} to {document.source_path}") |             print(f"Moving {document_path} to {document.source_path}") | ||||||
|             shutil.copy(document_path, document.source_path) |             shutil.copy(document_path, document.source_path) | ||||||
|             shutil.copy(thumbnail_path, document.thumbnail_path) |             shutil.copy(thumbnail_path, document.thumbnail_path) | ||||||
|  |             if archive_path: | ||||||
|  |                 shutil.copy(archive_path, document.archive_path) | ||||||
|  |  | ||||||
|             document.save() |             document.save() | ||||||
|   | |||||||
							
								
								
									
										23
									
								
								src/documents/migrations/1005_checksums.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,23 @@ | |||||||
|  | # Generated by Django 3.1.3 on 2020-11-29 00:48 | ||||||
|  |  | ||||||
|  | from django.db import migrations, models | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Migration(migrations.Migration): | ||||||
|  |  | ||||||
|  |     dependencies = [ | ||||||
|  |         ('documents', '1004_sanity_check_schedule'), | ||||||
|  |     ] | ||||||
|  |  | ||||||
|  |     operations = [ | ||||||
|  |         migrations.AddField( | ||||||
|  |             model_name='document', | ||||||
|  |             name='archive_checksum', | ||||||
|  |             field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True), | ||||||
|  |         ), | ||||||
|  |         migrations.AlterField( | ||||||
|  |             model_name='document', | ||||||
|  |             name='checksum', | ||||||
|  |             field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True), | ||||||
|  |         ), | ||||||
|  |     ] | ||||||
| @@ -11,6 +11,7 @@ from django.db import models | |||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from django.utils.text import slugify | from django.utils.text import slugify | ||||||
|  |  | ||||||
|  | from documents.file_handling import archive_name_from_filename | ||||||
| from documents.parsers import get_default_file_extension | from documents.parsers import get_default_file_extension | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -158,9 +159,15 @@ class Document(models.Model): | |||||||
|         max_length=32, |         max_length=32, | ||||||
|         editable=False, |         editable=False, | ||||||
|         unique=True, |         unique=True, | ||||||
|         help_text="The checksum of the original document (before it was " |         help_text="The checksum of the original document." | ||||||
|                   "encrypted).  We use this to prevent duplicate document " |     ) | ||||||
|                   "imports." |  | ||||||
|  |     archive_checksum = models.CharField( | ||||||
|  |         max_length=32, | ||||||
|  |         editable=False, | ||||||
|  |         blank=True, | ||||||
|  |         null=True, | ||||||
|  |         help_text="The checksum of the archived document." | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|     created = models.DateTimeField( |     created = models.DateTimeField( | ||||||
| @@ -225,10 +232,30 @@ class Document(models.Model): | |||||||
|     def source_file(self): |     def source_file(self): | ||||||
|         return open(self.source_path, "rb") |         return open(self.source_path, "rb") | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def archive_path(self): | ||||||
|  |         if self.filename: | ||||||
|  |             fname = archive_name_from_filename(self.filename) | ||||||
|  |         else: | ||||||
|  |             fname = "{:07}.pdf".format(self.pk) | ||||||
|  |  | ||||||
|  |         return os.path.join( | ||||||
|  |             settings.ARCHIVE_DIR, | ||||||
|  |             fname | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def archive_file(self): | ||||||
|  |         return open(self.archive_path, "rb") | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def file_name(self): |     def file_name(self): | ||||||
|         return slugify(str(self)) + self.file_type |         return slugify(str(self)) + self.file_type | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def archive_file_name(self): | ||||||
|  |         return slugify(str(self)) + ".pdf" | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def file_type(self): |     def file_type(self): | ||||||
|         return get_default_file_extension(self.mime_type) |         return get_default_file_extension(self.mime_type) | ||||||
|   | |||||||
| @@ -131,73 +131,7 @@ def run_convert(input_file, | |||||||
|         raise ParseError("Convert failed at {}".format(args)) |         raise ParseError("Convert failed at {}".format(args)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def run_unpaper(pnm, logging_group=None): | def parse_date(filename, text): | ||||||
|     pnm_out = pnm.replace(".pnm", ".unpaper.pnm") |  | ||||||
|  |  | ||||||
|     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, |  | ||||||
|                     pnm_out) |  | ||||||
|  |  | ||||||
|     logger.debug(f"Execute: {' '.join(command_args)}", |  | ||||||
|                  extra={'group': logging_group}) |  | ||||||
|  |  | ||||||
|     if not subprocess.Popen(command_args, |  | ||||||
|                             stdout=subprocess.DEVNULL, |  | ||||||
|                             stderr=subprocess.DEVNULL).wait() == 0: |  | ||||||
|         raise ParseError(f"Unpaper failed at {command_args}") |  | ||||||
|  |  | ||||||
|     return pnm_out |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ParseError(Exception): |  | ||||||
|     pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class DocumentParser(LoggingMixin): |  | ||||||
|     """ |  | ||||||
|     Subclass this to make your own parser.  Have a look at |  | ||||||
|     `paperless_tesseract.parsers` for inspiration. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |  | ||||||
|         super().__init__() |  | ||||||
|         self.logging_group = logging_group |  | ||||||
|         self.document_path = path |  | ||||||
|         self.tempdir = tempfile.mkdtemp( |  | ||||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |  | ||||||
|         Returns the path to a file we can use as a thumbnail for this document. |  | ||||||
|         """ |  | ||||||
|         raise NotImplementedError() |  | ||||||
|  |  | ||||||
|     def optimise_thumbnail(self, in_path): |  | ||||||
|  |  | ||||||
|         if settings.OPTIMIZE_THUMBNAILS: |  | ||||||
|             out_path = os.path.join(self.tempdir, "optipng.png") |  | ||||||
|  |  | ||||||
|             args = (settings.OPTIPNG_BINARY, |  | ||||||
|                     "-silent", "-o5", in_path, "-out", out_path) |  | ||||||
|  |  | ||||||
|             self.log('debug', f"Execute: {' '.join(args)}") |  | ||||||
|  |  | ||||||
|             if not subprocess.Popen(args).wait() == 0: |  | ||||||
|                 raise ParseError("Optipng failed at {}".format(args)) |  | ||||||
|  |  | ||||||
|             return out_path |  | ||||||
|         else: |  | ||||||
|             return in_path |  | ||||||
|  |  | ||||||
|     def get_optimised_thumbnail(self): |  | ||||||
|         return self.optimise_thumbnail(self.get_thumbnail()) |  | ||||||
|  |  | ||||||
|     def get_text(self): |  | ||||||
|         """ |  | ||||||
|         Returns the text from the document and only the text. |  | ||||||
|         """ |  | ||||||
|         raise NotImplementedError() |  | ||||||
|  |  | ||||||
|     def get_date(self): |  | ||||||
|     """ |     """ | ||||||
|     Returns the date of the document. |     Returns the date of the document. | ||||||
|     """ |     """ | ||||||
| @@ -217,15 +151,12 @@ class DocumentParser(LoggingMixin): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     date = None |     date = None | ||||||
|         date_string = None |  | ||||||
|  |  | ||||||
|     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit |     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||||
|         title = os.path.basename(self.document_path) |  | ||||||
|  |  | ||||||
|     # if filename date parsing is enabled, search there first: |     # if filename date parsing is enabled, search there first: | ||||||
|     if settings.FILENAME_DATE_ORDER: |     if settings.FILENAME_DATE_ORDER: | ||||||
|             self.log("info", "Checking document title for date") |         for m in re.finditer(DATE_REGEX, filename): | ||||||
|             for m in re.finditer(DATE_REGEX, title): |  | ||||||
|             date_string = m.group(0) |             date_string = m.group(0) | ||||||
|  |  | ||||||
|             try: |             try: | ||||||
| @@ -235,21 +166,8 @@ class DocumentParser(LoggingMixin): | |||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             if date is not None and next_year > date.year > 1900: |             if date is not None and next_year > date.year > 1900: | ||||||
|                     self.log( |  | ||||||
|                         "info", |  | ||||||
|                         "Detected document date {} based on string {} " |  | ||||||
|                         "from document title" |  | ||||||
|                         "".format(date.isoformat(), date_string) |  | ||||||
|                     ) |  | ||||||
|                 return date |                 return date | ||||||
|  |  | ||||||
|         try: |  | ||||||
|             # getting text after checking filename will save time if only |  | ||||||
|             # looking at the filename instead of the whole text |  | ||||||
|             text = self.get_text() |  | ||||||
|         except ParseError: |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|     # Iterate through all regex matches in text and try to parse the date |     # Iterate through all regex matches in text and try to parse the date | ||||||
|     for m in re.finditer(DATE_REGEX, text): |     for m in re.finditer(DATE_REGEX, text): | ||||||
|         date_string = m.group(0) |         date_string = m.group(0) | ||||||
| @@ -265,19 +183,64 @@ class DocumentParser(LoggingMixin): | |||||||
|         else: |         else: | ||||||
|             date = None |             date = None | ||||||
|  |  | ||||||
|         if date is not None: |  | ||||||
|             self.log( |  | ||||||
|                 "info", |  | ||||||
|                 "Detected document date {} based on string {}".format( |  | ||||||
|                     date.isoformat(), |  | ||||||
|                     date_string |  | ||||||
|                 ) |  | ||||||
|             ) |  | ||||||
|         else: |  | ||||||
|             self.log("info", "Unable to detect date for document") |  | ||||||
|  |  | ||||||
|     return date |     return date | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ParseError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class DocumentParser(LoggingMixin): | ||||||
|  |     """ | ||||||
|  |     Subclass this to make your own parser.  Have a look at | ||||||
|  |     `paperless_tesseract.parsers` for inspiration. | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def __init__(self, logging_group): | ||||||
|  |         super().__init__() | ||||||
|  |         self.logging_group = logging_group | ||||||
|  |         self.tempdir = tempfile.mkdtemp( | ||||||
|  |             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |  | ||||||
|  |         self.archive_path = None | ||||||
|  |         self.text = None | ||||||
|  |         self.date = None | ||||||
|  |  | ||||||
|  |     def parse(self, document_path, mime_type): | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def get_archive_path(self): | ||||||
|  |         return self.archive_path | ||||||
|  |  | ||||||
|  |     def get_thumbnail(self, document_path, mime_type): | ||||||
|  |         """ | ||||||
|  |         Returns the path to a file we can use as a thumbnail for this document. | ||||||
|  |         """ | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def get_optimised_thumbnail(self, document_path, mime_type): | ||||||
|  |         thumbnail = self.get_thumbnail(document_path, mime_type) | ||||||
|  |         if settings.OPTIMIZE_THUMBNAILS: | ||||||
|  |             out_path = os.path.join(self.tempdir, "thumb_optipng.png") | ||||||
|  |  | ||||||
|  |             args = (settings.OPTIPNG_BINARY, | ||||||
|  |                     "-silent", "-o5", thumbnail, "-out", out_path) | ||||||
|  |  | ||||||
|  |             self.log('debug', f"Execute: {' '.join(args)}") | ||||||
|  |  | ||||||
|  |             if not subprocess.Popen(args).wait() == 0: | ||||||
|  |                 raise ParseError("Optipng failed at {}".format(args)) | ||||||
|  |  | ||||||
|  |             return out_path | ||||||
|  |         else: | ||||||
|  |             return thumbnail | ||||||
|  |  | ||||||
|  |     def get_text(self): | ||||||
|  |         return self.text | ||||||
|  |  | ||||||
|  |     def get_date(self): | ||||||
|  |         return self.date | ||||||
|  |  | ||||||
|     def cleanup(self): |     def cleanup(self): | ||||||
|         self.log("debug", "Deleting directory {}".format(self.tempdir)) |         self.log("debug", "Deleting directory {}".format(self.tempdir)) | ||||||
|         shutil.rmtree(self.tempdir) |         shutil.rmtree(self.tempdir) | ||||||
|   | |||||||
| @@ -67,20 +67,35 @@ def check_sanity(): | |||||||
|                 f"Original of document {doc.pk} does not exist.")) |                 f"Original of document {doc.pk} does not exist.")) | ||||||
|         else: |         else: | ||||||
|             present_files.remove(os.path.normpath(doc.source_path)) |             present_files.remove(os.path.normpath(doc.source_path)) | ||||||
|             checksum = None |  | ||||||
|             try: |             try: | ||||||
|                 with doc.source_file as f: |                 with doc.source_file as f: | ||||||
|                     checksum = hashlib.md5(f.read()).hexdigest() |                     checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|             except OSError as e: |             except OSError as e: | ||||||
|                 messages.append(SanityError( |                 messages.append(SanityError( | ||||||
|                     f"Cannot read original file of document {doc.pk}: {e}")) |                     f"Cannot read original file of document {doc.pk}: {e}")) | ||||||
|  |             else: | ||||||
|             if checksum and not checksum == doc.checksum: |                 if not checksum == doc.checksum: | ||||||
|                     messages.append(SanityError( |                     messages.append(SanityError( | ||||||
|                         f"Checksum mismatch of document {doc.pk}. " |                         f"Checksum mismatch of document {doc.pk}. " | ||||||
|                         f"Stored: {doc.checksum}, actual: {checksum}." |                         f"Stored: {doc.checksum}, actual: {checksum}." | ||||||
|                     )) |                     )) | ||||||
|  |  | ||||||
|  |         if os.path.isfile(doc.archive_path): | ||||||
|  |             present_files.remove(os.path.normpath(doc.archive_path)) | ||||||
|  |             try: | ||||||
|  |                 with doc.archive_file as f: | ||||||
|  |                     checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|  |             except OSError as e: | ||||||
|  |                 messages.append(SanityError( | ||||||
|  |                     f"Cannot read archive file of document {doc.pk}: {e}" | ||||||
|  |                 )) | ||||||
|  |             else: | ||||||
|  |                 if not checksum == doc.archive_checksum: | ||||||
|  |                     messages.append(SanityError( | ||||||
|  |                         f"Checksum mismatch of archive {doc.pk}. " | ||||||
|  |                         f"Stored: {doc.checksum}, actual: {checksum}." | ||||||
|  |                     )) | ||||||
|  |  | ||||||
|         if not doc.content: |         if not doc.content: | ||||||
|             messages.append(SanityWarning( |             messages.append(SanityWarning( | ||||||
|                 f"Document {doc.pk} has no content." |                 f"Document {doc.pk} has no content." | ||||||
|   | |||||||
| @@ -2,3 +2,4 @@ | |||||||
| # for exporting/importing commands | # for exporting/importing commands | ||||||
| EXPORTER_FILE_NAME = "__exported_file_name__" | EXPORTER_FILE_NAME = "__exported_file_name__" | ||||||
| EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__" | EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__" | ||||||
|  | EXPORTER_ARCHIVE_NAME = "__exported_archive_name__" | ||||||
|   | |||||||
| @@ -13,7 +13,7 @@ from rest_framework.reverse import reverse | |||||||
|  |  | ||||||
| from .. import index, matching | from .. import index, matching | ||||||
| from ..file_handling import delete_empty_directories, generate_filename, \ | from ..file_handling import delete_empty_directories, generate_filename, \ | ||||||
|     create_source_path_directory |     create_source_path_directory, archive_name_from_filename | ||||||
| from ..models import Document, Tag | from ..models import Document, Tag | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -169,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs): | |||||||
|  |  | ||||||
| @receiver(models.signals.post_delete, sender=Document) | @receiver(models.signals.post_delete, sender=Document) | ||||||
| def cleanup_document_deletion(sender, instance, using, **kwargs): | def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||||
|     for f in (instance.source_path, instance.thumbnail_path): |     for f in (instance.source_path, | ||||||
|  |               instance.archive_path, | ||||||
|  |               instance.thumbnail_path): | ||||||
|  |         if os.path.isfile(f): | ||||||
|             try: |             try: | ||||||
|                 os.unlink(f) |                 os.unlink(f) | ||||||
|         except FileNotFoundError: |                 logging.getLogger(__name__).debug( | ||||||
|             pass  # The file's already gone, so we're cool with it. |                     f"Deleted file {f}.") | ||||||
|  |             except OSError as e: | ||||||
|  |                 logging.getLogger(__name__).warning( | ||||||
|  |                     f"While deleting document {instance.file_name}, the file " | ||||||
|  |                     f"{f} could not be deleted: {e}" | ||||||
|  |                 ) | ||||||
|  |  | ||||||
|     delete_empty_directories(os.path.dirname(instance.source_path)) |     delete_empty_directories( | ||||||
|  |         os.path.dirname(instance.source_path), | ||||||
|  |         root=settings.ORIGINALS_DIR | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     delete_empty_directories( | ||||||
|  |         os.path.dirname(instance.archive_path), | ||||||
|  |         root=settings.ARCHIVE_DIR | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def validate_move(instance, old_path, new_path): | ||||||
|  |     if not os.path.isfile(old_path): | ||||||
|  |         # Can't do anything if the old file does not exist anymore. | ||||||
|  |         logging.getLogger(__name__).fatal( | ||||||
|  |             f"Document {str(instance)}: File {old_path} has gone.") | ||||||
|  |         return False | ||||||
|  |  | ||||||
|  |     if os.path.isfile(new_path): | ||||||
|  |         # Can't do anything if the new file already exists. Skip updating file. | ||||||
|  |         logging.getLogger(__name__).warning( | ||||||
|  |             f"Document {str(instance)}: Cannot rename file " | ||||||
|  |             f"since target path {new_path} already exists.") | ||||||
|  |         return False | ||||||
|  |  | ||||||
|  |     return True | ||||||
|  |  | ||||||
|  |  | ||||||
| @receiver(models.signals.m2m_changed, sender=Document.tags.through) | @receiver(models.signals.m2m_changed, sender=Document.tags.through) | ||||||
| @@ -183,55 +216,90 @@ def cleanup_document_deletion(sender, instance, using, **kwargs): | |||||||
| def update_filename_and_move_files(sender, instance, **kwargs): | def update_filename_and_move_files(sender, instance, **kwargs): | ||||||
|  |  | ||||||
|     if not instance.filename: |     if not instance.filename: | ||||||
|         # Can't update the filename if there is not filename to begin with |         # Can't update the filename if there is no filename to begin with | ||||||
|         # This happens after the consumer creates a new document. |         # This happens when the consumer creates a new document. | ||||||
|         # The PK needs to be set first by saving the document once. When this |         # The document is modified and saved multiple times, and only after | ||||||
|         # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be |         # everything is done (i.e., the generated filename is final), | ||||||
|         # renamed anyway. In all other cases, instance.filename will be set. |         # filename will be set to the location where the consumer has put | ||||||
|  |         # the file. | ||||||
|  |         # | ||||||
|  |         # This will in turn cause this logic to move the file where it belongs. | ||||||
|         return |         return | ||||||
|  |  | ||||||
|     old_filename = instance.filename |     old_filename = instance.filename | ||||||
|     old_path = instance.source_path |  | ||||||
|     new_filename = generate_filename(instance) |     new_filename = generate_filename(instance) | ||||||
|  |  | ||||||
|     if new_filename == instance.filename: |     if new_filename == instance.filename: | ||||||
|         # Don't do anything if its the same. |         # Don't do anything if its the same. | ||||||
|         return |         return | ||||||
|  |  | ||||||
|     new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) |     old_source_path = instance.source_path | ||||||
|  |     new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||||
|  |  | ||||||
|     if not os.path.isfile(old_path): |     if not validate_move(instance, old_source_path, new_source_path): | ||||||
|         # Can't do anything if the old file does not exist anymore. |  | ||||||
|         logging.getLogger(__name__).fatal( |  | ||||||
|             f"Document {str(instance)}: File {old_path} has gone.") |  | ||||||
|         return |         return | ||||||
|  |  | ||||||
|     if os.path.isfile(new_path): |     # archive files are optional, archive checksum tells us if we have one, | ||||||
|         # Can't do anything if the new file already exists. Skip updating file. |     # since this is None for documents without archived files. | ||||||
|         logging.getLogger(__name__).warning( |     if instance.archive_checksum: | ||||||
|             f"Document {str(instance)}: Cannot rename file " |         new_archive_filename = archive_name_from_filename(new_filename) | ||||||
|             f"since target path {new_path} already exists.") |         old_archive_path = instance.archive_path | ||||||
|  |         new_archive_path = os.path.join(settings.ARCHIVE_DIR, | ||||||
|  |                                         new_archive_filename) | ||||||
|  |  | ||||||
|  |         if not validate_move(instance, old_archive_path, new_archive_path): | ||||||
|             return |             return | ||||||
|  |  | ||||||
|     create_source_path_directory(new_path) |         create_source_path_directory(new_archive_path) | ||||||
|  |     else: | ||||||
|  |         old_archive_path = None | ||||||
|  |         new_archive_path = None | ||||||
|  |  | ||||||
|  |     create_source_path_directory(new_source_path) | ||||||
|  |  | ||||||
|     try: |     try: | ||||||
|         os.rename(old_path, new_path) |         os.rename(old_source_path, new_source_path) | ||||||
|  |         if instance.archive_checksum: | ||||||
|  |             os.rename(old_archive_path, new_archive_path) | ||||||
|         instance.filename = new_filename |         instance.filename = new_filename | ||||||
|         # Don't save here to prevent infinite recursion. |         # Don't save here to prevent infinite recursion. | ||||||
|         Document.objects.filter(pk=instance.pk).update(filename=new_filename) |         Document.objects.filter(pk=instance.pk).update(filename=new_filename) | ||||||
|  |  | ||||||
|         logging.getLogger(__name__).debug( |         logging.getLogger(__name__).debug( | ||||||
|             f"Moved file {old_path} to {new_path}.") |             f"Moved file {old_source_path} to {new_source_path}.") | ||||||
|  |  | ||||||
|  |         logging.getLogger(__name__).debug( | ||||||
|  |             f"Moved file {old_archive_path} to {new_archive_path}.") | ||||||
|  |  | ||||||
|     except OSError as e: |     except OSError as e: | ||||||
|         instance.filename = old_filename |         instance.filename = old_filename | ||||||
|  |         # this happens when we can't move a file. If that's the case for the | ||||||
|  |         # archive file, we try our best to revert the changes. | ||||||
|  |         try: | ||||||
|  |             os.rename(new_source_path, old_source_path) | ||||||
|  |             os.rename(new_archive_path, old_archive_path) | ||||||
|  |         except: | ||||||
|  |             # This is fine, since: | ||||||
|  |             # A: if we managed to move source from A to B, we will also manage | ||||||
|  |             #  to move it from B to A. If not, we have a serious issue | ||||||
|  |             #  that's going to get caught by the sanity checker. | ||||||
|  |             #  all files remain in place and will never be overwritten, | ||||||
|  |             #  so this is not the end of the world. | ||||||
|  |             # B: if moving the original file failed, nothing has changed anyway. | ||||||
|  |             pass | ||||||
|     except DatabaseError as e: |     except DatabaseError as e: | ||||||
|         os.rename(new_path, old_path) |         os.rename(new_source_path, old_source_path) | ||||||
|  |         if instance.archive_checksum: | ||||||
|  |             os.rename(new_archive_path, old_archive_path) | ||||||
|         instance.filename = old_filename |         instance.filename = old_filename | ||||||
|  |  | ||||||
|     if not os.path.isfile(old_path): |     if not os.path.isfile(old_source_path): | ||||||
|         delete_empty_directories(os.path.dirname(old_path)) |         delete_empty_directories(os.path.dirname(old_source_path), | ||||||
|  |                                  root=settings.ORIGINALS_DIR) | ||||||
|  |  | ||||||
|  |     if old_archive_path and not os.path.isfile(old_archive_path): | ||||||
|  |         delete_empty_directories(os.path.dirname(old_archive_path), | ||||||
|  |                                  root=settings.ARCHIVE_DIR) | ||||||
|  |  | ||||||
|  |  | ||||||
| def set_log_entry(sender, document=None, logging_group=None, **kwargs): | def set_log_entry(sender, document=None, logging_group=None, **kwargs): | ||||||
|   | |||||||
| @@ -12,7 +12,9 @@ from documents.sanity_checker import SanityFailedError | |||||||
|  |  | ||||||
|  |  | ||||||
| def index_optimize(): | def index_optimize(): | ||||||
|     index.open_index().optimize() |     ix = index.open_index() | ||||||
|  |     writer = AsyncWriter(ix) | ||||||
|  |     writer.commit(optimize=True) | ||||||
|  |  | ||||||
|  |  | ||||||
| def index_reindex(): | def index_reindex(): | ||||||
|   | |||||||
| Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/archive/0000001.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -100,6 +100,44 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): | |||||||
|         self.assertEqual(response.status_code, 200) |         self.assertEqual(response.status_code, 200) | ||||||
|         self.assertEqual(response.content, content_thumbnail) |         self.assertEqual(response.content, content_thumbnail) | ||||||
|  |  | ||||||
|  |     def test_download_with_archive(self): | ||||||
|  |  | ||||||
|  |         _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir) | ||||||
|  |  | ||||||
|  |         content = b"This is a test" | ||||||
|  |         content_archive = b"This is the same test but archived" | ||||||
|  |  | ||||||
|  |         with open(filename, "wb") as f: | ||||||
|  |             f.write(content) | ||||||
|  |  | ||||||
|  |         filename = os.path.basename(filename) | ||||||
|  |  | ||||||
|  |         doc = Document.objects.create(title="none", filename=filename, | ||||||
|  |                                       mime_type="application/pdf") | ||||||
|  |  | ||||||
|  |         with open(doc.archive_path, "wb") as f: | ||||||
|  |             f.write(content_archive) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content_archive) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/preview/'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content_archive) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content) | ||||||
|  |  | ||||||
|     def test_document_actions_not_existing_file(self): |     def test_document_actions_not_existing_file(self): | ||||||
|  |  | ||||||
|         doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") |         doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| import os | import os | ||||||
| import re | import re | ||||||
|  | import shutil | ||||||
| import tempfile | import tempfile | ||||||
| from unittest import mock | from unittest import mock | ||||||
| from unittest.mock import MagicMock | from unittest.mock import MagicMock | ||||||
| @@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase): | |||||||
|  |  | ||||||
| class DummyParser(DocumentParser): | class DummyParser(DocumentParser): | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         # not important during tests |         # not important during tests | ||||||
|         raise NotImplementedError() |         raise NotImplementedError() | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group, scratch_dir): |     def __init__(self, logging_group, scratch_dir, archive_path): | ||||||
|         super(DummyParser, self).__init__(path, logging_group) |         super(DummyParser, self).__init__(logging_group) | ||||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) |         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||||
|  |         self.archive_path = archive_path | ||||||
|  |  | ||||||
|     def get_optimised_thumbnail(self): |     def get_optimised_thumbnail(self, document_path, mime_type): | ||||||
|         return self.fake_thumb |         return self.fake_thumb | ||||||
|  |  | ||||||
|     def get_text(self): |     def parse(self, document_path, mime_type): | ||||||
|         return "The Text" |         self.text = "The Text" | ||||||
|  |  | ||||||
|  |  | ||||||
| class FaultyParser(DocumentParser): | class FaultyParser(DocumentParser): | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         # not important during tests |         # not important during tests | ||||||
|         raise NotImplementedError() |         raise NotImplementedError() | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group, scratch_dir): |     def __init__(self, logging_group, scratch_dir): | ||||||
|         super(FaultyParser, self).__init__(path, logging_group) |         super(FaultyParser, self).__init__(logging_group) | ||||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) |         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||||
|  |  | ||||||
|     def get_optimised_thumbnail(self): |     def get_optimised_thumbnail(self, document_path, mime_type): | ||||||
|         return self.fake_thumb |         return self.fake_thumb | ||||||
|  |  | ||||||
|     def get_text(self): |     def parse(self, document_path, mime_type): | ||||||
|         raise ParseError("Does not compute.") |         raise ParseError("Does not compute.") | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -410,11 +412,11 @@ def fake_magic_from_file(file, mime=False): | |||||||
| @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | ||||||
| class TestConsumer(DirectoriesMixin, TestCase): | class TestConsumer(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|     def make_dummy_parser(self, path, logging_group): |     def make_dummy_parser(self, logging_group): | ||||||
|         return DummyParser(path, logging_group, self.dirs.scratch_dir) |         return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file()) | ||||||
|  |  | ||||||
|     def make_faulty_parser(self, path, logging_group): |     def make_faulty_parser(self, logging_group): | ||||||
|         return FaultyParser(path, logging_group, self.dirs.scratch_dir) |         return FaultyParser(logging_group, self.dirs.scratch_dir) | ||||||
|  |  | ||||||
|     def setUp(self): |     def setUp(self): | ||||||
|         super(TestConsumer, self).setUp() |         super(TestConsumer, self).setUp() | ||||||
| @@ -432,8 +434,16 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|         self.consumer = Consumer() |         self.consumer = Consumer() | ||||||
|  |  | ||||||
|     def get_test_file(self): |     def get_test_file(self): | ||||||
|         fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir) |         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf") | ||||||
|         return f |         dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |         return dst | ||||||
|  |  | ||||||
|  |     def get_test_archive_file(self): | ||||||
|  |         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf") | ||||||
|  |         dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |         return dst | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) |     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||||
|     def testNormalOperation(self): |     def testNormalOperation(self): | ||||||
| @@ -455,6 +465,13 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|             document.thumbnail_path |             document.thumbnail_path | ||||||
|         )) |         )) | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile( | ||||||
|  |             document.archive_path | ||||||
|  |         )) | ||||||
|  |  | ||||||
|  |         self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1") | ||||||
|  |         self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b") | ||||||
|  |  | ||||||
|         self.assertFalse(os.path.isfile(filename)) |         self.assertFalse(os.path.isfile(filename)) | ||||||
|  |  | ||||||
|     def testOverrideFilename(self): |     def testOverrideFilename(self): | ||||||
| @@ -502,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         self.fail("Should throw exception") |         self.fail("Should throw exception") | ||||||
|  |  | ||||||
|     def testDuplicates(self): |     def testDuplicates1(self): | ||||||
|         self.consumer.try_consume_file(self.get_test_file()) |         self.consumer.try_consume_file(self.get_test_file()) | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
| @@ -513,6 +530,21 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         self.fail("Should throw exception") |         self.fail("Should throw exception") | ||||||
|  |  | ||||||
|  |     def testDuplicates2(self): | ||||||
|  |         self.consumer.try_consume_file(self.get_test_file()) | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             self.consumer.try_consume_file(self.get_test_archive_file()) | ||||||
|  |         except ConsumerError as e: | ||||||
|  |             self.assertTrue(str(e).endswith("It is a duplicate.")) | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         self.fail("Should throw exception") | ||||||
|  |  | ||||||
|  |     def testDuplicates3(self): | ||||||
|  |         self.consumer.try_consume_file(self.get_test_archive_file()) | ||||||
|  |         self.consumer.try_consume_file(self.get_test_file()) | ||||||
|  |  | ||||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") |     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||||
|     def testNoParsers(self, m): |     def testNoParsers(self, m): | ||||||
|         m.return_value = [] |         m.return_value = [] | ||||||
|   | |||||||
							
								
								
									
										140
									
								
								src/documents/tests/test_date_parsing.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,140 @@ | |||||||
|  | import datetime | ||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  | from unittest import mock | ||||||
|  | from uuid import uuid4 | ||||||
|  |  | ||||||
|  | from dateutil import tz | ||||||
|  | from django.conf import settings | ||||||
|  | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
|  | from documents.parsers import parse_date | ||||||
|  | from paperless_tesseract.parsers import RasterisedDocumentParser | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestDate(TestCase): | ||||||
|  |  | ||||||
|  |     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples") | ||||||
|  |     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||||
|  |  | ||||||
|  |     def setUp(self): | ||||||
|  |         os.makedirs(self.SCRATCH, exist_ok=True) | ||||||
|  |  | ||||||
|  |     def tearDown(self): | ||||||
|  |         shutil.rmtree(self.SCRATCH) | ||||||
|  |  | ||||||
|  |     def test_date_format_1(self): | ||||||
|  |         text = "lorem ipsum 130218 lorem ipsum" | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_2(self): | ||||||
|  |         text = "lorem ipsum 2018 lorem ipsum" | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_3(self): | ||||||
|  |         text = "lorem ipsum 20180213 lorem ipsum" | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_4(self): | ||||||
|  |         text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||||
|  |         date = parse_date("", text) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2018, 2, 13, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_date_format_5(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||||
|  |             "ipsum" | ||||||
|  |         ) | ||||||
|  |         date = parse_date("", text) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2018, 2, 13, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_date_format_6(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "Wohnort\n" | ||||||
|  |             "3100\n" | ||||||
|  |             "IBAN\n" | ||||||
|  |             "AT87 4534\n" | ||||||
|  |             "1234\n" | ||||||
|  |             "1234 5678\n" | ||||||
|  |             "BIC\n" | ||||||
|  |             "lorem ipsum" | ||||||
|  |         ) | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_7(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "März 2019\n" | ||||||
|  |             "lorem ipsum" | ||||||
|  |         ) | ||||||
|  |         date = parse_date("", text) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2019, 3, 1, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_date_format_8(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "Wohnort\n" | ||||||
|  |             "3100\n" | ||||||
|  |             "IBAN\n" | ||||||
|  |             "AT87 4534\n" | ||||||
|  |             "1234\n" | ||||||
|  |             "1234 5678\n" | ||||||
|  |             "BIC\n" | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "März 2020" | ||||||
|  |         ) | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("", text), | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2020, 3, 1, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(SCRATCH_DIR=SCRATCH) | ||||||
|  |     def test_date_format_9(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "27. Nullmonth 2020\n" | ||||||
|  |             "März 2020\n" | ||||||
|  |             "lorem ipsum" | ||||||
|  |         ) | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("", text), | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2020, 3, 1, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_crazy_date_past(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) | ||||||
|  |  | ||||||
|  |     def test_crazy_date_future(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) | ||||||
|  |  | ||||||
|  |     def test_crazy_date_with_spaces(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("", "20 408000l 2475")) | ||||||
|  |  | ||||||
|  |     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||||
|  |     def test_filename_date_parse_invalid(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) | ||||||
| @@ -1,12 +1,29 @@ | |||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  | import tempfile | ||||||
|  | from pathlib import Path | ||||||
| from unittest import mock | from unittest import mock | ||||||
|  |  | ||||||
| from django.test import TestCase | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
| from ..models import Document, Correspondent | from ..models import Document, Correspondent | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestDocument(TestCase): | class TestDocument(TestCase): | ||||||
|  |  | ||||||
|  |     def setUp(self) -> None: | ||||||
|  |         self.originals_dir = tempfile.mkdtemp() | ||||||
|  |         self.thumb_dir = tempfile.mkdtemp() | ||||||
|  |  | ||||||
|  |         override_settings( | ||||||
|  |             ORIGINALS_DIR=self.originals_dir, | ||||||
|  |             THUMBNAIL_DIR=self.thumb_dir, | ||||||
|  |         ).enable() | ||||||
|  |  | ||||||
|  |     def tearDown(self) -> None: | ||||||
|  |         shutil.rmtree(self.originals_dir) | ||||||
|  |         shutil.rmtree(self.thumb_dir) | ||||||
|  |  | ||||||
|     def test_file_deletion(self): |     def test_file_deletion(self): | ||||||
|         document = Document.objects.create( |         document = Document.objects.create( | ||||||
|             correspondent=Correspondent.objects.create(name="Test0"), |             correspondent=Correspondent.objects.create(name="Test0"), | ||||||
| @@ -19,6 +36,9 @@ class TestDocument(TestCase): | |||||||
|         file_path = document.source_path |         file_path = document.source_path | ||||||
|         thumb_path = document.thumbnail_path |         thumb_path = document.thumbnail_path | ||||||
|  |  | ||||||
|  |         Path(file_path).touch() | ||||||
|  |         Path(thumb_path).touch() | ||||||
|  |  | ||||||
|         with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: |         with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: | ||||||
|             document.delete() |             document.delete() | ||||||
|             mock_unlink.assert_any_call(file_path) |             mock_unlink.assert_any_call(file_path) | ||||||
|   | |||||||
| @@ -2,32 +2,17 @@ import os | |||||||
| import shutil | import shutil | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from unittest import mock | from unittest import mock | ||||||
| from uuid import uuid4 |  | ||||||
|  |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.db import DatabaseError | from django.db import DatabaseError | ||||||
| from django.test import TestCase, override_settings | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
|  | from .utils import DirectoriesMixin | ||||||
| from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | ||||||
| from ..models import Document, Correspondent | from ..models import Document, Correspondent | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestDate(TestCase): | class TestFileHandling(DirectoriesMixin, TestCase): | ||||||
|     deletion_list = [] |  | ||||||
|  |  | ||||||
|     def add_to_deletion_list(self, dirname): |  | ||||||
|         self.deletion_list.append(dirname) |  | ||||||
|  |  | ||||||
|     def setUp(self): |  | ||||||
|         folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) |  | ||||||
|         os.makedirs(folder + "/documents/originals") |  | ||||||
|         override_settings(MEDIA_ROOT=folder).enable() |  | ||||||
|         override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable() |  | ||||||
|         self.add_to_deletion_list(folder) |  | ||||||
|  |  | ||||||
|     def tearDown(self): |  | ||||||
|         for dirname in self.deletion_list: |  | ||||||
|             shutil.rmtree(dirname, ignore_errors=True) |  | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="") |     @override_settings(PAPERLESS_FILENAME_FORMAT="") | ||||||
|     def test_generate_source_filename(self): |     def test_generate_source_filename(self): | ||||||
| @@ -104,7 +89,7 @@ class TestDate(TestCase): | |||||||
|         document.save() |         document.save() | ||||||
|  |  | ||||||
|         # Check proper handling of files |         # Check proper handling of files | ||||||
|         self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) |         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) |         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||||
|  |  | ||||||
|         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) |         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) | ||||||
| @@ -140,7 +125,7 @@ class TestDate(TestCase): | |||||||
|  |  | ||||||
|             # Check proper handling of files |             # Check proper handling of files | ||||||
|             self.assertTrue(os.path.isfile(document.source_path)) |             self.assertTrue(os.path.isfile(document.source_path)) | ||||||
|             self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) |             self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||||
|             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) |             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||||
| @@ -196,8 +181,8 @@ class TestDate(TestCase): | |||||||
|         document.save() |         document.save() | ||||||
|  |  | ||||||
|         # Check proper handling of files |         # Check proper handling of files | ||||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True) |         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True) | ||||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) |         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True) | ||||||
|         self.assertTrue(os.path.isfile(important_file)) |         self.assertTrue(os.path.isfile(important_file)) | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") |     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") | ||||||
| @@ -315,13 +300,12 @@ class TestDate(TestCase): | |||||||
|         # Create our working directory |         # Create our working directory | ||||||
|         tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") |         tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") | ||||||
|         os.makedirs(tmp) |         os.makedirs(tmp) | ||||||
|         self.add_to_deletion_list(tmp) |  | ||||||
|  |  | ||||||
|         os.makedirs(os.path.join(tmp, "notempty")) |         os.makedirs(os.path.join(tmp, "notempty")) | ||||||
|         Path(os.path.join(tmp, "notempty", "file")).touch() |         Path(os.path.join(tmp, "notempty", "file")).touch() | ||||||
|         os.makedirs(os.path.join(tmp, "notempty", "empty")) |         os.makedirs(os.path.join(tmp, "notempty", "empty")) | ||||||
|  |  | ||||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty")) |         delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR) | ||||||
|         self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) |         self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) | ||||||
|         self.assertEqual(os.path.isfile( |         self.assertEqual(os.path.isfile( | ||||||
|             os.path.join(tmp, "notempty", "file")), True) |             os.path.join(tmp, "notempty", "file")), True) | ||||||
| @@ -345,3 +329,159 @@ class TestDate(TestCase): | |||||||
|         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED |         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||||
|  |  | ||||||
|         self.assertEqual(generate_filename(document), "0000001.pdf") |         self.assertEqual(generate_filename(document), "0000001.pdf") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||||
|  |     def test_create_no_format(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_create_with_format(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertFalse(os.path.isfile(original)) | ||||||
|  |         self.assertFalse(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |         self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf")) | ||||||
|  |         self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_move_archive_gone(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         #Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertFalse(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_move_archive_exists(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none")) | ||||||
|  |         Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     @mock.patch("documents.signals.handlers.os.rename") | ||||||
|  |     def test_move_archive_error(self, m): | ||||||
|  |  | ||||||
|  |         def fake_rename(src, dst): | ||||||
|  |             if "archive" in src: | ||||||
|  |                 raise OSError() | ||||||
|  |             else: | ||||||
|  |                 os.remove(src) | ||||||
|  |                 Path(dst).touch() | ||||||
|  |  | ||||||
|  |         m.side_effect = fake_rename | ||||||
|  |  | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_move_file_gone(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         #Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertFalse(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     @mock.patch("documents.signals.handlers.os.rename") | ||||||
|  |     def test_move_file_error(self, m): | ||||||
|  |  | ||||||
|  |         def fake_rename(src, dst): | ||||||
|  |             if "original" in src: | ||||||
|  |                 raise OSError() | ||||||
|  |             else: | ||||||
|  |                 os.remove(src) | ||||||
|  |                 Path(dst).touch() | ||||||
|  |  | ||||||
|  |         m.side_effect = fake_rename | ||||||
|  |  | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     def test_archive_deleted(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |         doc.delete() | ||||||
|  |  | ||||||
|  |         self.assertFalse(os.path.isfile(original)) | ||||||
|  |         self.assertFalse(os.path.isfile(archive)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_database_error(self): | ||||||
|  |  | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |         with mock.patch("documents.signals.handlers.Document.objects.filter") as m: | ||||||
|  |             m.side_effect = DatabaseError() | ||||||
|  |             doc.save() | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|   | |||||||
							
								
								
									
										42
									
								
								src/documents/tests/test_management_archiver.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,42 @@ | |||||||
|  | import filecmp | ||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  |  | ||||||
|  | from django.core.management import call_command | ||||||
|  | from django.test import TestCase | ||||||
|  |  | ||||||
|  | from documents.management.commands.document_archiver import handle_document | ||||||
|  | from documents.models import Document | ||||||
|  | from documents.tests.utils import DirectoriesMixin | ||||||
|  |  | ||||||
|  |  | ||||||
|  | sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestArchiver(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|  |     def make_models(self): | ||||||
|  |         self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf") | ||||||
|  |         #self.d2 = Document.objects.create(checksum="B", title="B", content="second document") | ||||||
|  |         #self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document") | ||||||
|  |  | ||||||
|  |     def test_archiver(self): | ||||||
|  |  | ||||||
|  |         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||||
|  |         self.make_models() | ||||||
|  |  | ||||||
|  |         call_command('document_archiver') | ||||||
|  |  | ||||||
|  |     def test_handle_document(self): | ||||||
|  |  | ||||||
|  |         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||||
|  |         self.make_models() | ||||||
|  |  | ||||||
|  |         handle_document(self.d1) | ||||||
|  |  | ||||||
|  |         doc = Document.objects.get(id=self.d1.id) | ||||||
|  |  | ||||||
|  |         self.assertIsNotNone(doc.checksum) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(filecmp.cmp(sample_file, doc.source_path)) | ||||||
| @@ -23,10 +23,7 @@ class TestExporter(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") |         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") | ||||||
|  |  | ||||||
|         with open(file, "rb") as f: |         Document.objects.create(checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |  | ||||||
|  |  | ||||||
|         Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") |  | ||||||
|         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) |         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||||
|         Tag.objects.create(name="t") |         Tag.objects.create(name="t") | ||||||
|         DocumentType.objects.create(name="dt") |         DocumentType.objects.create(name="dt") | ||||||
| @@ -51,6 +48,14 @@ class TestExporter(DirectoriesMixin, TestCase): | |||||||
|                     checksum = hashlib.md5(f.read()).hexdigest() |                     checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|                 self.assertEqual(checksum, element['fields']['checksum']) |                 self.assertEqual(checksum, element['fields']['checksum']) | ||||||
|  |  | ||||||
|  |                 if document_exporter.EXPORTER_ARCHIVE_NAME in element: | ||||||
|  |                     fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME]) | ||||||
|  |                     self.assertTrue(os.path.exists(fname)) | ||||||
|  |  | ||||||
|  |                     with open(fname, "rb") as f: | ||||||
|  |                         checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|  |                     self.assertEqual(checksum, element['fields']['archive_checksum']) | ||||||
|  |  | ||||||
|         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") |         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") | ||||||
|  |  | ||||||
|         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) |         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) | ||||||
|   | |||||||
| @@ -1,11 +1,13 @@ | |||||||
| import os | import os | ||||||
|  | import shutil | ||||||
|  | import tempfile | ||||||
| from tempfile import TemporaryDirectory | from tempfile import TemporaryDirectory | ||||||
| from unittest import mock | from unittest import mock | ||||||
|  |  | ||||||
| from django.test import TestCase | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
| from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | ||||||
|     get_parser_class_for_mime_type |     get_parser_class_for_mime_type, DocumentParser | ||||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | from paperless_tesseract.parsers import RasterisedDocumentParser | ||||||
| from paperless_text.parsers import TextDocumentParser | from paperless_text.parsers import TextDocumentParser | ||||||
|  |  | ||||||
| @@ -66,6 +68,38 @@ class TestParserDiscovery(TestCase): | |||||||
|             ) |             ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def fake_get_thumbnail(self, path, mimetype): | ||||||
|  |     return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestBaseParser(TestCase): | ||||||
|  |  | ||||||
|  |     def setUp(self) -> None: | ||||||
|  |  | ||||||
|  |         self.scratch = tempfile.mkdtemp() | ||||||
|  |         override_settings( | ||||||
|  |             SCRATCH_DIR=self.scratch | ||||||
|  |         ).enable() | ||||||
|  |  | ||||||
|  |     def tearDown(self) -> None: | ||||||
|  |         shutil.rmtree(self.scratch) | ||||||
|  |  | ||||||
|  |     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||||
|  |     @override_settings(OPTIMIZE_THUMBNAILS=True) | ||||||
|  |     def test_get_optimised_thumbnail(self): | ||||||
|  |         parser = DocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.get_optimised_thumbnail("any", "not important") | ||||||
|  |  | ||||||
|  |     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||||
|  |     @override_settings(OPTIMIZE_THUMBNAILS=False) | ||||||
|  |     def test_get_optimised_thumb_disabled(self): | ||||||
|  |         parser = DocumentParser(None) | ||||||
|  |  | ||||||
|  |         path = parser.get_optimised_thumbnail("any", "not important") | ||||||
|  |         self.assertEqual(path, fake_get_thumbnail(None, None, None)) | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestParserAvailability(TestCase): | class TestParserAvailability(TestCase): | ||||||
|  |  | ||||||
|     def test_file_extensions(self): |     def test_file_extensions(self): | ||||||
|   | |||||||
| @@ -17,10 +17,12 @@ def setup_directories(): | |||||||
|     dirs.index_dir = os.path.join(dirs.data_dir, "index") |     dirs.index_dir = os.path.join(dirs.data_dir, "index") | ||||||
|     dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals") |     dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals") | ||||||
|     dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails") |     dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails") | ||||||
|  |     dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive") | ||||||
|  |  | ||||||
|     os.makedirs(dirs.index_dir, exist_ok=True) |     os.makedirs(dirs.index_dir, exist_ok=True) | ||||||
|     os.makedirs(dirs.originals_dir, exist_ok=True) |     os.makedirs(dirs.originals_dir, exist_ok=True) | ||||||
|     os.makedirs(dirs.thumbnail_dir, exist_ok=True) |     os.makedirs(dirs.thumbnail_dir, exist_ok=True) | ||||||
|  |     os.makedirs(dirs.archive_dir, exist_ok=True) | ||||||
|  |  | ||||||
|     override_settings( |     override_settings( | ||||||
|         DATA_DIR=dirs.data_dir, |         DATA_DIR=dirs.data_dir, | ||||||
| @@ -28,6 +30,7 @@ def setup_directories(): | |||||||
|         MEDIA_ROOT=dirs.media_dir, |         MEDIA_ROOT=dirs.media_dir, | ||||||
|         ORIGINALS_DIR=dirs.originals_dir, |         ORIGINALS_DIR=dirs.originals_dir, | ||||||
|         THUMBNAIL_DIR=dirs.thumbnail_dir, |         THUMBNAIL_DIR=dirs.thumbnail_dir, | ||||||
|  |         ARCHIVE_DIR=dirs.archive_dir, | ||||||
|         CONSUMPTION_DIR=dirs.consumption_dir, |         CONSUMPTION_DIR=dirs.consumption_dir, | ||||||
|         INDEX_DIR=dirs.index_dir, |         INDEX_DIR=dirs.index_dir, | ||||||
|         MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") |         MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") | ||||||
|   | |||||||
| @@ -1,3 +1,5 @@ | |||||||
|  | import os | ||||||
|  |  | ||||||
| from django.db.models import Count, Max | from django.db.models import Count, Max | ||||||
| from django.http import HttpResponse, HttpResponseBadRequest, Http404 | from django.http import HttpResponse, HttpResponseBadRequest, Http404 | ||||||
| from django.views.decorators.cache import cache_control | from django.views.decorators.cache import cache_control | ||||||
| @@ -126,17 +128,30 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|         index.remove_document_from_index(self.get_object()) |         index.remove_document_from_index(self.get_object()) | ||||||
|         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) |         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) | ||||||
|  |  | ||||||
|     def file_response(self, pk, disposition): |     @staticmethod | ||||||
|  |     def original_requested(request): | ||||||
|  |         return ( | ||||||
|  |             'original' in request.query_params and | ||||||
|  |             request.query_params['original'] == 'true' | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def file_response(self, pk, request, disposition): | ||||||
|         doc = Document.objects.get(id=pk) |         doc = Document.objects.get(id=pk) | ||||||
|  |         if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501 | ||||||
|         if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: |             file_handle = doc.archive_file | ||||||
|             file_handle = doc.source_file |             filename = doc.archive_file_name | ||||||
|  |             mime_type = 'application/pdf' | ||||||
|         else: |         else: | ||||||
|             file_handle = GnuPG.decrypted(doc.source_file) |             file_handle = doc.source_file | ||||||
|  |             filename = doc.file_name | ||||||
|  |             mime_type = doc.mime_type | ||||||
|  |  | ||||||
|         response = HttpResponse(file_handle, content_type=doc.mime_type) |         if doc.storage_type == Document.STORAGE_TYPE_GPG: | ||||||
|  |             file_handle = GnuPG.decrypted(file_handle) | ||||||
|  |  | ||||||
|  |         response = HttpResponse(file_handle, content_type=mime_type) | ||||||
|         response["Content-Disposition"] = '{}; filename="{}"'.format( |         response["Content-Disposition"] = '{}; filename="{}"'.format( | ||||||
|             disposition, doc.file_name) |             disposition, filename) | ||||||
|         return response |         return response | ||||||
|  |  | ||||||
|     @action(methods=['post'], detail=False) |     @action(methods=['post'], detail=False) | ||||||
| @@ -157,6 +172,8 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|                 "paperless__checksum": doc.checksum, |                 "paperless__checksum": doc.checksum, | ||||||
|                 "paperless__mime_type": doc.mime_type, |                 "paperless__mime_type": doc.mime_type, | ||||||
|                 "paperless__filename": doc.filename, |                 "paperless__filename": doc.filename, | ||||||
|  |                 "paperless__has_archive_version": | ||||||
|  |                     os.path.isfile(doc.archive_path) | ||||||
|             }) |             }) | ||||||
|         except Document.DoesNotExist: |         except Document.DoesNotExist: | ||||||
|             raise Http404() |             raise Http404() | ||||||
| @@ -164,7 +181,8 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|     @action(methods=['get'], detail=True) |     @action(methods=['get'], detail=True) | ||||||
|     def preview(self, request, pk=None): |     def preview(self, request, pk=None): | ||||||
|         try: |         try: | ||||||
|             response = self.file_response(pk, "inline") |             response = self.file_response( | ||||||
|  |                 pk, request, "inline") | ||||||
|             return response |             return response | ||||||
|         except (FileNotFoundError, Document.DoesNotExist): |         except (FileNotFoundError, Document.DoesNotExist): | ||||||
|             raise Http404() |             raise Http404() | ||||||
| @@ -181,7 +199,8 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|     @action(methods=['get'], detail=True) |     @action(methods=['get'], detail=True) | ||||||
|     def download(self, request, pk=None): |     def download(self, request, pk=None): | ||||||
|         try: |         try: | ||||||
|             return self.file_response(pk, "attachment") |             return self.file_response( | ||||||
|  |                 pk, request, "attachment") | ||||||
|         except (FileNotFoundError, Document.DoesNotExist): |         except (FileNotFoundError, Document.DoesNotExist): | ||||||
|             raise Http404() |             raise Http404() | ||||||
|  |  | ||||||
|   | |||||||
| @@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs): | |||||||
|     binaries = ( |     binaries = ( | ||||||
|         settings.CONVERT_BINARY, |         settings.CONVERT_BINARY, | ||||||
|         settings.OPTIPNG_BINARY, |         settings.OPTIPNG_BINARY, | ||||||
|         settings.UNPAPER_BINARY, |  | ||||||
|         "tesseract" |         "tesseract" | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta | |||||||
|  |  | ||||||
| MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) | MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) | ||||||
| ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") | ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") | ||||||
|  | ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") | ||||||
| THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | ||||||
|  |  | ||||||
| DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | ||||||
| @@ -348,9 +349,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | |||||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||||
| OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||||
|  |  | ||||||
|  | # OCRmyPDF --output-type options are available. | ||||||
|  | # TODO: validate this setting. | ||||||
|  | OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | ||||||
|  |  | ||||||
| # OCR all documents? | # skip. redo, force | ||||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") | # TODO: validate this. | ||||||
|  | OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||||
|  |  | ||||||
|  | OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||||
|  |  | ||||||
|  | OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||||
|  |  | ||||||
| # GNUPG needs a home directory for some reason | # GNUPG needs a home directory for some reason | ||||||
| GNUPG_HOME = os.getenv("HOME", "/tmp") | GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||||
| @@ -359,11 +368,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") | |||||||
| CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") | CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") | ||||||
| CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||||
| CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||||
| CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300)) |  | ||||||
|  |  | ||||||
| GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") | GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") | ||||||
|  |  | ||||||
| OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | ||||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Pre-2.x versions of Paperless stored your documents locally with GPG | # Pre-2.x versions of Paperless stored your documents locally with GPG | ||||||
|   | |||||||
| @@ -14,12 +14,21 @@ def get_tesseract_langs(): | |||||||
|  |  | ||||||
| @register() | @register() | ||||||
| def check_default_language_available(app_configs, **kwargs): | def check_default_language_available(app_configs, **kwargs): | ||||||
|     langs = get_tesseract_langs() |     installed_langs = get_tesseract_langs() | ||||||
|  |  | ||||||
|     if settings.OCR_LANGUAGE not in langs: |     if not settings.OCR_LANGUAGE: | ||||||
|  |         return [Warning( | ||||||
|  |             "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " | ||||||
|  |             "This means that tesseract will fallback to english." | ||||||
|  |         )] | ||||||
|  |  | ||||||
|  |     specified_langs = settings.OCR_LANGUAGE.split("+") | ||||||
|  |  | ||||||
|  |     for lang in specified_langs: | ||||||
|  |         if lang not in installed_langs: | ||||||
|             return [Error( |             return [Error( | ||||||
|             f"The default ocr language {settings.OCR_LANGUAGE} is " |                 f"The selected ocr language {lang} is " | ||||||
|                 f"not installed. Paperless cannot OCR your documents " |                 f"not installed. Paperless cannot OCR your documents " | ||||||
|                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] |                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] | ||||||
|     else: |  | ||||||
|     return [] |     return [] | ||||||
|   | |||||||
| @@ -1,23 +1,15 @@ | |||||||
| import itertools | import json | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import subprocess | import subprocess | ||||||
| from multiprocessing.pool import ThreadPool |  | ||||||
|  |  | ||||||
| import langdetect | import ocrmypdf | ||||||
| import pdftotext | import pdftotext | ||||||
| import pyocr |  | ||||||
| from PIL import Image | from PIL import Image | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from pyocr import PyocrException | from ocrmypdf import InputFileError | ||||||
|  |  | ||||||
| from documents.parsers import DocumentParser, ParseError, run_unpaper, \ | from documents.parsers import DocumentParser, ParseError, run_convert | ||||||
|     run_convert |  | ||||||
| from .languages import ISO639 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class OCRError(Exception): |  | ||||||
|     pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class RasterisedDocumentParser(DocumentParser): | class RasterisedDocumentParser(DocumentParser): | ||||||
| @@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) |     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         super().__init__(path, logging_group) |  | ||||||
|         self._text = None |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |         """ | ||||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. |         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||||
|         """ |         """ | ||||||
| @@ -44,7 +32,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                         alpha="remove", |                         alpha="remove", | ||||||
|                         strip=True, |                         strip=True, | ||||||
|                         trim=True, |                         trim=True, | ||||||
|                         input_file="{}[0]".format(self.document_path), |                         input_file="{}[0]".format(document_path), | ||||||
|                         output_file=out_path, |                         output_file=out_path, | ||||||
|                         logging_group=self.logging_group) |                         logging_group=self.logging_group) | ||||||
|         except ParseError: |         except ParseError: | ||||||
| @@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                    "-q", |                    "-q", | ||||||
|                    "-sDEVICE=pngalpha", |                    "-sDEVICE=pngalpha", | ||||||
|                    "-o", gs_out_path, |                    "-o", gs_out_path, | ||||||
|                    self.document_path] |                    document_path] | ||||||
|             if not subprocess.Popen(cmd).wait() == 0: |             if not subprocess.Popen(cmd).wait() == 0: | ||||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) |                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||||
|             # then run convert on the output from gs |             # then run convert on the output from gs | ||||||
| @@ -74,169 +62,126 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         return out_path |         return out_path | ||||||
|  |  | ||||||
|     def _is_ocred(self): |     def is_image(self, mime_type): | ||||||
|  |         return mime_type in [ | ||||||
|         # Extract text from PDF using pdftotext |             "image/png", | ||||||
|         text = get_text_from_pdf(self.document_path) |             "image/jpeg" | ||||||
|  |         ] | ||||||
|         # We assume, that a PDF with at least 50 characters contains text |  | ||||||
|         # (so no OCR required) |  | ||||||
|         return len(text) > 50 |  | ||||||
|  |  | ||||||
|     def get_text(self): |  | ||||||
|  |  | ||||||
|         if self._text is not None: |  | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         if not settings.OCR_ALWAYS and self._is_ocred(): |  | ||||||
|             self.log("debug", "Skipping OCR, using Text from PDF") |  | ||||||
|             self._text = get_text_from_pdf(self.document_path) |  | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         images = self._get_greyscale() |  | ||||||
|  |  | ||||||
|         if not images: |  | ||||||
|             raise ParseError("Empty document, nothing to do.") |  | ||||||
|  |  | ||||||
|  |     def get_dpi(self, image): | ||||||
|         try: |         try: | ||||||
|  |             with Image.open(image) as im: | ||||||
|             sample_page_index = int(len(images) / 2) |                 x, y = im.info['dpi'] | ||||||
|             self.log( |                 return x | ||||||
|                 "debug", |  | ||||||
|                 f"Attempting language detection on page " |  | ||||||
|                 f"{sample_page_index + 1} of {len(images)}...") |  | ||||||
|  |  | ||||||
|             sample_page_text = self._ocr([images[sample_page_index]], |  | ||||||
|                                          settings.OCR_LANGUAGE)[0] |  | ||||||
|             guessed_language = self._guess_language(sample_page_text) |  | ||||||
|  |  | ||||||
|             if not guessed_language or guessed_language not in ISO639: |  | ||||||
|                 self.log("warning", "Language detection failed.") |  | ||||||
|                 ocr_pages = self._complete_ocr_default_language( |  | ||||||
|                     images, sample_page_index, sample_page_text) |  | ||||||
|  |  | ||||||
|             elif ISO639[guessed_language] == settings.OCR_LANGUAGE: |  | ||||||
|                 self.log( |  | ||||||
|                     "debug", |  | ||||||
|                     f"Detected language: {guessed_language} " |  | ||||||
|                     f"(default language)") |  | ||||||
|                 ocr_pages = self._complete_ocr_default_language( |  | ||||||
|                     images, sample_page_index, sample_page_text) |  | ||||||
|  |  | ||||||
|             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501 |  | ||||||
|                 self.log( |  | ||||||
|                     "warning", |  | ||||||
|                     f"Detected language {guessed_language} is not available " |  | ||||||
|                     f"on this system.") |  | ||||||
|                 ocr_pages = self._complete_ocr_default_language( |  | ||||||
|                     images, sample_page_index, sample_page_text) |  | ||||||
|  |  | ||||||
|             else: |  | ||||||
|                 self.log("debug", f"Detected language: {guessed_language}") |  | ||||||
|                 ocr_pages = self._ocr(images, ISO639[guessed_language]) |  | ||||||
|  |  | ||||||
|             self.log("debug", "OCR completed.") |  | ||||||
|             self._text = strip_excess_whitespace(" ".join(ocr_pages)) |  | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         except OCRError as e: |  | ||||||
|             raise ParseError(e) |  | ||||||
|  |  | ||||||
|     def _get_greyscale(self): |  | ||||||
|         """ |  | ||||||
|         Greyscale images are easier for Tesseract to OCR |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         # Convert PDF to multiple PNMs |  | ||||||
|         input_file = self.document_path |  | ||||||
|  |  | ||||||
|         if settings.OCR_PAGES == 1: |  | ||||||
|             input_file += "[0]" |  | ||||||
|         elif settings.OCR_PAGES > 1: |  | ||||||
|             input_file += f"[0-{settings.OCR_PAGES - 1}]" |  | ||||||
|  |  | ||||||
|         self.log( |  | ||||||
|             "debug", |  | ||||||
|             f"Converting document {input_file} into greyscale images") |  | ||||||
|  |  | ||||||
|         output_files = os.path.join(self.tempdir, "convert-%04d.pnm") |  | ||||||
|  |  | ||||||
|         run_convert(density=settings.CONVERT_DENSITY, |  | ||||||
|                     depth="8", |  | ||||||
|                     type="grayscale", |  | ||||||
|                     input_file=input_file, |  | ||||||
|                     output_file=output_files, |  | ||||||
|                     logging_group=self.logging_group) |  | ||||||
|  |  | ||||||
|         # Get a list of converted images |  | ||||||
|         pnms = [] |  | ||||||
|         for f in os.listdir(self.tempdir): |  | ||||||
|             if f.endswith(".pnm"): |  | ||||||
|                 pnms.append(os.path.join(self.tempdir, f)) |  | ||||||
|  |  | ||||||
|         self.log("debug", f"Running unpaper on {len(pnms)} pages...") |  | ||||||
|  |  | ||||||
|         # Run unpaper in parallel on converted images |  | ||||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: |  | ||||||
|             pnms = pool.map(run_unpaper, pnms) |  | ||||||
|  |  | ||||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) |  | ||||||
|  |  | ||||||
|     def _guess_language(self, text): |  | ||||||
|         try: |  | ||||||
|             guess = langdetect.detect(text) |  | ||||||
|             return guess |  | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             self.log('warning', f"Language detection failed with: {e}") |             self.log( | ||||||
|  |                 'warning', | ||||||
|  |                 f"Error while getting DPI from image {image}: {e}") | ||||||
|             return None |             return None | ||||||
|  |  | ||||||
|     def _ocr(self, imgs, lang): |     def parse(self, document_path, mime_type): | ||||||
|  |         if settings.OCR_MODE == "skip_noarchive": | ||||||
|  |             text = get_text_from_pdf(document_path) | ||||||
|  |             if text and len(text) > 50: | ||||||
|  |                 self.text = text | ||||||
|  |                 return | ||||||
|  |  | ||||||
|  |         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||||
|  |  | ||||||
|  |         ocr_args = { | ||||||
|  |             'input_file': document_path, | ||||||
|  |             'output_file': archive_path, | ||||||
|  |             'use_threads': True, | ||||||
|  |             'jobs': settings.THREADS_PER_WORKER, | ||||||
|  |             'language': settings.OCR_LANGUAGE, | ||||||
|  |             'output_type': settings.OCR_OUTPUT_TYPE, | ||||||
|  |             'progress_bar': False, | ||||||
|  |             'clean': True | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if settings.OCR_PAGES > 0: | ||||||
|  |             ocr_args['pages'] = f"1-{settings.OCR_PAGES}" | ||||||
|  |  | ||||||
|  |         if settings.OCR_MODE in ['skip', 'skip_noarchive']: | ||||||
|  |             ocr_args['skip_text'] = True | ||||||
|  |         elif settings.OCR_MODE == 'redo': | ||||||
|  |             ocr_args['redo_ocr'] = True | ||||||
|  |         elif settings.OCR_MODE == 'force': | ||||||
|  |             ocr_args['force_ocr'] = True | ||||||
|  |  | ||||||
|  |         if self.is_image(mime_type): | ||||||
|  |             dpi = self.get_dpi(document_path) | ||||||
|  |             if dpi: | ||||||
|                 self.log( |                 self.log( | ||||||
|                     "debug", |                     "debug", | ||||||
|             f"Performing OCR on {len(imgs)} page(s) with language {lang}") |                     f"Detected DPI for image {document_path}: {dpi}" | ||||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: |                 ) | ||||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) |                 ocr_args['image_dpi'] = dpi | ||||||
|             return r |             elif settings.OCR_IMAGE_DPI: | ||||||
|  |                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||||
|     def _complete_ocr_default_language(self, |  | ||||||
|                                        images, |  | ||||||
|                                        sample_page_index, |  | ||||||
|                                        sample_page): |  | ||||||
|         images_copy = list(images) |  | ||||||
|         del images_copy[sample_page_index] |  | ||||||
|         if images_copy: |  | ||||||
|             self.log('debug', "Continuing ocr with default language.") |  | ||||||
|             ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) |  | ||||||
|             ocr_pages.insert(sample_page_index, sample_page) |  | ||||||
|             return ocr_pages |  | ||||||
|             else: |             else: | ||||||
|             return [sample_page] |                 raise ParseError( | ||||||
|  |                     f"Cannot produce archive PDF for image {document_path}, " | ||||||
|  |                     f"no DPI information is present in this image and " | ||||||
|  |                     f"OCR_IMAGE_DPI is not set.") | ||||||
|  |  | ||||||
|  |         if settings.OCR_USER_ARGS: | ||||||
|  |             try: | ||||||
|  |                 user_args = json.loads(settings.OCR_USER_ARGS) | ||||||
|  |                 ocr_args = {**ocr_args, **user_args} | ||||||
|  |             except Exception as e: | ||||||
|  |                 self.log( | ||||||
|  |                     "warning", | ||||||
|  |                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " | ||||||
|  |                     f"they will not be used: {e}") | ||||||
|  |  | ||||||
|  |         # This forces tesseract to use one core per page. | ||||||
|  |         os.environ['OMP_THREAD_LIMIT'] = "1" | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             self.log("debug", | ||||||
|  |                      f"Calling OCRmyPDF with {str(ocr_args)}") | ||||||
|  |             ocrmypdf.ocr(**ocr_args) | ||||||
|  |             # success! announce results | ||||||
|  |             self.archive_path = archive_path | ||||||
|  |             self.text = get_text_from_pdf(archive_path) | ||||||
|  |  | ||||||
|  |         except InputFileError as e: | ||||||
|  |             # This happens with some PDFs when used with the redo_ocr option. | ||||||
|  |             # This is not the end of the world, we'll just use what we already | ||||||
|  |             # have in the document. | ||||||
|  |             self.text = get_text_from_pdf(document_path) | ||||||
|  |             # Also, no archived file. | ||||||
|  |             if not self.text: | ||||||
|  |                 # However, if we don't have anything, fail: | ||||||
|  |                 raise ParseError(e) | ||||||
|  |  | ||||||
|  |         except Exception as e: | ||||||
|  |             # Anything else is probably serious. | ||||||
|  |             raise ParseError(e) | ||||||
|  |  | ||||||
|  |         if not self.text: | ||||||
|  |             # This may happen for files that don't have any text. | ||||||
|  |             self.log( | ||||||
|  |                 'warning', | ||||||
|  |                 f"Document {document_path} does not have any text." | ||||||
|  |                 f"This is probably an error or you tried to add an image " | ||||||
|  |                 f"without text.") | ||||||
|  |             self.text = "" | ||||||
|  |  | ||||||
|  |  | ||||||
| def strip_excess_whitespace(text): | def strip_excess_whitespace(text): | ||||||
|  |     if not text: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) |     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||||
|     no_leading_whitespace = re.sub( |     no_leading_whitespace = re.sub( | ||||||
|         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) |         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) | ||||||
|     no_trailing_whitespace = re.sub( |     no_trailing_whitespace = re.sub( | ||||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) |         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||||
|     return no_trailing_whitespace |  | ||||||
|  |  | ||||||
|  |     # TODO: this needs a rework | ||||||
| def image_to_string(args): |     return no_trailing_whitespace.strip() | ||||||
|     img, lang = args |  | ||||||
|     ocr = pyocr.get_available_tools()[0] |  | ||||||
|     with Image.open(img) as f: |  | ||||||
|         if ocr.can_detect_orientation(): |  | ||||||
|             try: |  | ||||||
|                 orientation = ocr.detect_orientation(f, lang=lang) |  | ||||||
|                 f = f.rotate(orientation["angle"], expand=1) |  | ||||||
|             except Exception: |  | ||||||
|                 # Rotation not possible, ignore |  | ||||||
|                 pass |  | ||||||
|         try: |  | ||||||
|             return ocr.image_to_string(f, lang=lang) |  | ||||||
|         except PyocrException as e: |  | ||||||
|             raise OCRError(e) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_text_from_pdf(pdf_file): | def get_text_from_pdf(pdf_file): | ||||||
| @@ -245,6 +190,9 @@ def get_text_from_pdf(pdf_file): | |||||||
|         try: |         try: | ||||||
|             pdf = pdftotext.PDF(f) |             pdf = pdftotext.PDF(f) | ||||||
|         except pdftotext.Error: |         except pdftotext.Error: | ||||||
|             return "" |             # might not be a PDF file | ||||||
|  |             return None | ||||||
|  |  | ||||||
|     return "\n".join(pdf) |     text = "\n".join(pdf) | ||||||
|  |  | ||||||
|  |     return strip_excess_whitespace(text) | ||||||
|   | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/multi-page-digital.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/multi-page-images.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/no-text-alpha.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple-alpha.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 8.2 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple-no-dpi.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 6.8 KiB | 
| Before Width: | Height: | Size: 7.7 KiB After Width: | Height: | Size: 7.2 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/with-form.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -1,193 +0,0 @@ | |||||||
| import datetime |  | ||||||
| import os |  | ||||||
| import shutil |  | ||||||
| from unittest import mock |  | ||||||
| from uuid import uuid4 |  | ||||||
|  |  | ||||||
| from dateutil import tz |  | ||||||
| from django.conf import settings |  | ||||||
| from django.test import TestCase, override_settings |  | ||||||
|  |  | ||||||
| from ..parsers import RasterisedDocumentParser |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestDate(TestCase): |  | ||||||
|  |  | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |  | ||||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) |  | ||||||
|  |  | ||||||
|     def setUp(self): |  | ||||||
|         os.makedirs(self.SCRATCH, exist_ok=True) |  | ||||||
|  |  | ||||||
|     def tearDown(self): |  | ||||||
|         shutil.rmtree(self.SCRATCH) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_1(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 130218 lorem ipsum" |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_2(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 2018 lorem ipsum" |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_3(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 20180213 lorem ipsum" |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_4(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 13.02.2018 lorem ipsum" |  | ||||||
|         date = document.get_date() |  | ||||||
|         self.assertEqual( |  | ||||||
|             date, |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2018, 2, 13, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_5(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " |  | ||||||
|             "ipsum" |  | ||||||
|         ) |  | ||||||
|         date = document.get_date() |  | ||||||
|         self.assertEqual( |  | ||||||
|             date, |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2018, 2, 13, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_6(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "Wohnort\n" |  | ||||||
|             "3100\n" |  | ||||||
|             "IBAN\n" |  | ||||||
|             "AT87 4534\n" |  | ||||||
|             "1234\n" |  | ||||||
|             "1234 5678\n" |  | ||||||
|             "BIC\n" |  | ||||||
|             "lorem ipsum" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_7(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "März 2019\n" |  | ||||||
|             "lorem ipsum" |  | ||||||
|         ) |  | ||||||
|         date = document.get_date() |  | ||||||
|         self.assertEqual( |  | ||||||
|             date, |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2019, 3, 1, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_8(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "Wohnort\n" |  | ||||||
|             "3100\n" |  | ||||||
|             "IBAN\n" |  | ||||||
|             "AT87 4534\n" |  | ||||||
|             "1234\n" |  | ||||||
|             "1234 5678\n" |  | ||||||
|             "BIC\n" |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "März 2020" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual( |  | ||||||
|             document.get_date(), |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2020, 3, 1, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_9(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "27. Nullmonth 2020\n" |  | ||||||
|             "März 2020\n" |  | ||||||
|             "lorem ipsum" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual( |  | ||||||
|             document.get_date(), |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2020, 3, 1, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="01-07-0590 00:00:00" |  | ||||||
|     ) |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_crazy_date_past(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/dev/null", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="01-07-2350 00:00:00" |  | ||||||
|     ) |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_crazy_date_future(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/dev/null", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="20 408000l 2475" |  | ||||||
|     ) |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_crazy_date_with_spaces(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/dev/null", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="No date in here" |  | ||||||
|     ) |  | ||||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_filename_date_parse_invalid(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
| @@ -1,76 +0,0 @@ | |||||||
| import os |  | ||||||
| from unittest import mock, skipIf |  | ||||||
|  |  | ||||||
| import pyocr |  | ||||||
| from django.test import TestCase |  | ||||||
| from pyocr.libtesseract.tesseract_raw import \ |  | ||||||
|     TesseractError as OtherTesseractError |  | ||||||
|  |  | ||||||
| from ..parsers import image_to_string, strip_excess_whitespace |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakeTesseract(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def can_detect_orientation(): |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def detect_orientation(file_handle, lang): |  | ||||||
|         raise OtherTesseractError("arbitrary status", "message") |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def image_to_string(file_handle, lang): |  | ||||||
|         return "This is test text" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakePyOcr(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_tools(): |  | ||||||
|         return [FakeTesseract] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestOCR(TestCase): |  | ||||||
|  |  | ||||||
|     text_cases = [ |  | ||||||
|         ("simple     string", "simple string"), |  | ||||||
|         ( |  | ||||||
|             "simple    newline\n   testing string", |  | ||||||
|             "simple newline\ntesting string" |  | ||||||
|         ), |  | ||||||
|         ( |  | ||||||
|             "utf-8   строка с пробелами в конце  ", |  | ||||||
|             "utf-8 строка с пробелами в конце" |  | ||||||
|         ) |  | ||||||
|     ] |  | ||||||
|  |  | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |  | ||||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) |  | ||||||
|  |  | ||||||
|     def test_strip_excess_whitespace(self): |  | ||||||
|         for source, result in self.text_cases: |  | ||||||
|             actual_result = strip_excess_whitespace(source) |  | ||||||
|             self.assertEqual( |  | ||||||
|                 result, |  | ||||||
|                 actual_result, |  | ||||||
|                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( |  | ||||||
|                     source, |  | ||||||
|                     result, |  | ||||||
|                     actual_result |  | ||||||
|                 ) |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) |  | ||||||
|     def test_image_to_string_with_text_free_page(self): |  | ||||||
|         """ |  | ||||||
|         This test is sort of silly, since it's really just reproducing an odd |  | ||||||
|         exception thrown by pyocr when it encounters a page with no text. |  | ||||||
|         Actually running this test against an installation of Tesseract results |  | ||||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I |  | ||||||
|         don't care to dig.  Regardless, if you run the consumer normally, |  | ||||||
|         text-free pages are now handled correctly so long as we work around |  | ||||||
|         this weird exception. |  | ||||||
|         """ |  | ||||||
|         image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) |  | ||||||
| @@ -1,46 +1,17 @@ | |||||||
| import os | import os | ||||||
| import shutil |  | ||||||
| import tempfile |  | ||||||
| import uuid | import uuid | ||||||
| from typing import ContextManager | from typing import ContextManager | ||||||
| from unittest import mock | from unittest import mock | ||||||
|  |  | ||||||
| from django.test import TestCase, override_settings | from django.test import TestCase, override_settings | ||||||
| from pyocr.error import TesseractError |  | ||||||
|  |  | ||||||
| from documents.parsers import ParseError, run_convert | from documents.parsers import ParseError, run_convert | ||||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | from documents.tests.utils import DirectoriesMixin | ||||||
|  | from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace | ||||||
|  |  | ||||||
| image_to_string_calls = [] | image_to_string_calls = [] | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakeTesseract(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def can_detect_orientation(): |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def detect_orientation(file_handle, lang): |  | ||||||
|         raise TesseractError("arbitrary status", "message") |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_languages(): |  | ||||||
|         return ['eng', 'deu'] |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def image_to_string(file_handle, lang): |  | ||||||
|         image_to_string_calls.append((file_handle.name, lang)) |  | ||||||
|         return file_handle.read() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakePyOcr(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_tools(): |  | ||||||
|         return [FakeTesseract] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def fake_convert(input_file, output_file, **kwargs): | def fake_convert(input_file, output_file, **kwargs): | ||||||
|     with open(input_file) as f: |     with open(input_file) as f: | ||||||
|         lines = f.readlines() |         lines = f.readlines() | ||||||
| @@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs): | |||||||
|             f2.write(line.strip()) |             f2.write(line.strip()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def fake_unpaper(pnm): |  | ||||||
|     output = pnm + ".unpaper.pnm" |  | ||||||
|     shutil.copy(pnm, output) |  | ||||||
|     return output |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakeImageFile(ContextManager): | class FakeImageFile(ContextManager): | ||||||
|     def __init__(self, fname): |     def __init__(self, fname): | ||||||
|         self.fname = fname |         self.fname = fname | ||||||
| @@ -67,142 +32,50 @@ class FakeImageFile(ContextManager): | |||||||
|         return os.path.basename(self.fname) |         return os.path.basename(self.fname) | ||||||
|  |  | ||||||
|  |  | ||||||
| fake_image = FakeImageFile |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | class TestParser(DirectoriesMixin, TestCase): | ||||||
| @mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) |  | ||||||
| @mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) |  | ||||||
| @mock.patch("paperless_tesseract.parsers.Image.open", open) |  | ||||||
| class TestRasterisedDocumentParser(TestCase): |  | ||||||
|  |  | ||||||
|     def setUp(self): |     def assertContainsStrings(self, content, strings): | ||||||
|         self.scratch = tempfile.mkdtemp() |         # Asserts that all strings appear in content, in the given order. | ||||||
|  |         indices = [content.index(s) for s in strings] | ||||||
|  |         self.assertListEqual(indices, sorted(indices)) | ||||||
|  |  | ||||||
|         global image_to_string_calls |     text_cases = [ | ||||||
|  |         ("simple     string", "simple string"), | ||||||
|  |         ( | ||||||
|  |             "simple    newline\n   testing string", | ||||||
|  |             "simple newline\ntesting string" | ||||||
|  |         ), | ||||||
|  |         ( | ||||||
|  |             "utf-8   строка с пробелами в конце  ", | ||||||
|  |             "utf-8 строка с пробелами в конце" | ||||||
|  |         ) | ||||||
|  |     ] | ||||||
|  |  | ||||||
|         image_to_string_calls = [] |     def test_strip_excess_whitespace(self): | ||||||
|  |         for source, result in self.text_cases: | ||||||
|         override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() |             actual_result = strip_excess_whitespace(source) | ||||||
|  |             self.assertEqual( | ||||||
|     def tearDown(self): |                 result, | ||||||
|         shutil.rmtree(self.scratch) |                 actual_result, | ||||||
|  |                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( | ||||||
|     def get_input_file(self, pages): |                     source, | ||||||
|         _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) |                     result, | ||||||
|         with open(fname, "w") as f: |                     actual_result | ||||||
|             f.writelines([f"line {p}\n" for p in range(pages)]) |                 ) | ||||||
|         return fname |             ) | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_simple_language_match(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_2_pages(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_3_pages(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) |  | ||||||
|     def test_parse_text_lang_detect_failed(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") |  | ||||||
|     def test_parse_text_lang_not_installed(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2 line 3") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") |  | ||||||
|     def test_parse_text_lang_mismatch(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") |  | ||||||
|     def test_parse_empty_doc(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) |  | ||||||
|         try: |  | ||||||
|             parser.get_text() |  | ||||||
|         except ParseError as e: |  | ||||||
|             self.assertEqual("Empty document, nothing to do.", str(e)) |  | ||||||
|         else: |  | ||||||
|             self.fail("Should raise exception") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestAuxilliaryFunctions(TestCase): |  | ||||||
|  |  | ||||||
|     def setUp(self): |  | ||||||
|         self.scratch = tempfile.mkdtemp() |  | ||||||
|  |  | ||||||
|         override_settings(SCRATCH_DIR=self.scratch).enable() |  | ||||||
|  |  | ||||||
|     def tearDown(self): |  | ||||||
|         shutil.rmtree(self.scratch) |  | ||||||
|  |  | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||||
|  |  | ||||||
|     def test_get_text_from_pdf(self): |     def test_get_text_from_pdf(self): | ||||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf')) |         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')) | ||||||
|  |  | ||||||
|         self.assertEqual(text.strip(), "This is a test document.") |         self.assertContainsStrings(text.strip(), ["This is a test document."]) | ||||||
|  |  | ||||||
|     def test_get_text_from_pdf_error(self): |  | ||||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) |  | ||||||
|  |  | ||||||
|         self.assertEqual(text.strip(), "") |  | ||||||
|  |  | ||||||
|     def test_image_to_string(self): |  | ||||||
|         text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) |  | ||||||
|  |  | ||||||
|         self.assertEqual(text, "This is a test document.") |  | ||||||
|  |  | ||||||
|     def test_image_to_string_language_unavailable(self): |  | ||||||
|         try: |  | ||||||
|             image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) |  | ||||||
|         except OCRError as e: |  | ||||||
|             self.assertTrue("Failed loading language" in str(e)) |  | ||||||
|         else: |  | ||||||
|             self.fail("Should raise exception") |  | ||||||
|  |  | ||||||
|     @override_settings(OCR_ALWAYS=False) |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") |  | ||||||
|     def test_is_ocred(self, m2, m): |  | ||||||
|         parser = RasterisedDocumentParser("", uuid.uuid4()) |  | ||||||
|         m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ |  | ||||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " \ |  | ||||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " |  | ||||||
|         parser.get_text() |  | ||||||
|         self.assertEqual(m.call_count, 2) |  | ||||||
|         self.assertEqual(m2.call_count, 0) |  | ||||||
|  |  | ||||||
|     def test_thumbnail(self): |     def test_thumbnail(self): | ||||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) |         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||||
|         parser.get_thumbnail() |         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||||
|         # dont really know how to test it, just call it and assert that it does not raise anything. |         # dont really know how to test it, just call it and assert that it does not raise anything. | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.run_convert") |     @mock.patch("paperless_tesseract.parsers.run_convert") | ||||||
| @@ -216,6 +89,161 @@ class TestAuxilliaryFunctions(TestCase): | |||||||
|  |  | ||||||
|         m.side_effect = call_convert |         m.side_effect = call_convert | ||||||
|  |  | ||||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) |         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||||
|         parser.get_thumbnail() |         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||||
|         # dont really know how to test it, just call it and assert that it does not raise anything. |         # dont really know how to test it, just call it and assert that it does not raise anything. | ||||||
|  |  | ||||||
|  |     def test_get_dpi(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) | ||||||
|  |         self.assertEqual(dpi, None) | ||||||
|  |  | ||||||
|  |         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png")) | ||||||
|  |         self.assertEqual(dpi, 72) | ||||||
|  |  | ||||||
|  |     def test_simple_digital(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||||
|  |  | ||||||
|  |     def test_with_form(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="redo") | ||||||
|  |     def test_with_form_error(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertIsNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="redo") | ||||||
|  |     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None) | ||||||
|  |     def test_with_form_error_notext(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         def f(): | ||||||
|  |             parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertRaises(ParseError, f) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="force") | ||||||
|  |     def test_with_form_force(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||||
|  |  | ||||||
|  |     def test_image_simple(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||||
|  |  | ||||||
|  |     def test_image_simple_alpha_fail(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         def f(): | ||||||
|  |             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertRaises(ParseError, f) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     def test_image_no_dpi_fail(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         def f(): | ||||||
|  |             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertRaises(ParseError, f) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_IMAGE_DPI=72) | ||||||
|  |     def test_image_no_dpi_default(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."]) | ||||||
|  |  | ||||||
|  |     def test_multi_page(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="skip") | ||||||
|  |     def test_multi_page_pages_skip(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||||
|  |     def test_multi_page_pages_redo(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="force") | ||||||
|  |     def test_multi_page_pages_force(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OOCR_MODE="skip") | ||||||
|  |     def test_multi_page_analog_pages_skip(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||||
|  |     def test_multi_page_analog_pages_redo(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) | ||||||
|  |         self.assertFalse("page 3" in parser.get_text().lower()) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||||
|  |     def test_multi_page_analog_pages_force(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) | ||||||
|  |         self.assertFalse("page 2" in parser.get_text().lower()) | ||||||
|  |         self.assertFalse("page 3" in parser.get_text().lower()) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="skip_noarchive") | ||||||
|  |     def test_skip_noarchive_withtext(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertIsNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="skip_noarchive") | ||||||
|  |     def test_skip_noarchive_notext(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.join(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|   | |||||||
| @@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser): | |||||||
|     This parser directly parses a text document (.txt, .md, or .csv) |     This parser directly parses a text document (.txt, .md, or .csv) | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         super().__init__(path, logging_group) |  | ||||||
|         self._text = None |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |         """ | ||||||
|         The thumbnail of a text file is just a 500px wide image of the text |         The thumbnail of a text file is just a 500px wide image of the text | ||||||
|         rendered onto a letter-sized page. |         rendered onto a letter-sized page. | ||||||
| @@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser): | |||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         def read_text(): |         def read_text(): | ||||||
|             with open(self.document_path, 'r') as src: |             with open(document_path, 'r') as src: | ||||||
|                 lines = [line.strip() for line in src.readlines()] |                 lines = [line.strip() for line in src.readlines()] | ||||||
|                 text = "\n".join([line for line in lines[:n_lines]]) |                 text = "\n".join([line for line in lines[:n_lines]]) | ||||||
|                 return text.replace('"', "'") |                 return text.replace('"', "'") | ||||||
| @@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         return out_path |         return out_path | ||||||
|  |  | ||||||
|     def get_text(self): |     def parse(self, document_path, mime_type): | ||||||
|  |         with open(document_path, 'r') as f: | ||||||
|         if self._text is not None: |             self.text = f.read() | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         with open(self.document_path, 'r') as f: |  | ||||||
|             self._text = f.read() |  | ||||||
|  |  | ||||||
|         return self._text |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def run_command(*args): | def run_command(*args): | ||||||
|   | |||||||
 jonaswinkler
					jonaswinkler