Merge branch 'feature-ocrmypdf' into dev
13 .gitignore vendored
@@ -76,16 +76,11 @@ scripts/nuke
/static/

# Stored PDFs
/media/documents/originals/*
/media/documents/thumbnails/*

/data/classification_model.pickle
/data/db.sqlite3
/data/index

/media/
/data/
/paperless.conf
/consume
/export
/consume/
/export/
/src-ui/.vscode

# this is where the compiled frontend is moved to.
@@ -1,5 +1,8 @@
language: python

dist: focal
os: linux

jobs:
  include:
    - name: "Paperless on Python 3.6"
@@ -33,7 +36,7 @@ jobs:

before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript
  - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript optipng

install:
  - pip install --upgrade pipenv
2 Pipfile
@@ -26,7 +26,6 @@ langdetect = "*"
pdftotext = "*"
pathvalidate = "*"
pillow = "*"
pyocr = "~=0.7.2"
python-gnupg = "*"
python-dotenv = "*"
python-dateutil = "*"
@@ -39,6 +38,7 @@ whitenoise = "~=5.2.0"
watchdog = "*"
whoosh="~=2.7.4"
inotifyrecursive = ">=0.3.4"
ocrmypdf = "*"

[dev-packages]
coveralls = "*"
298 Pipfile.lock generated
@@ -1,7 +1,7 @@
{
    "_meta": {
        "hash": {
            "sha256": "d266e1f67e3090ec68aa8ecba1e8373351daf89ad5a5ab46524d123bcaf29f62"
            "sha256": "55c9136777e78d6cd362628cd1fc0c5ff36b437699b92089ce504d598004371d"
        },
        "pipfile-spec": 6,
        "requires": {
@@ -44,6 +44,94 @@
            ],
            "version": "==1.17.12"
        },
        "cffi": {
            "hashes": [
                "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e",
                "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d",
                "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a",
                "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec",
                "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362",
                "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668",
                "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c",
                "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b",
                "sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654",
                "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06",
                "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698",
                "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2",
                "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c",
                "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7",
                "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009",
                "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03",
                "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b",
                "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909",
                "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53",
                "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35",
                "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26",
                "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b",
                "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb",
                "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293",
                "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd",
                "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d",
                "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3",
                "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5",
                "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d",
                "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca",
                "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d",
                "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775",
                "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375",
                "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b",
                "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b",
                "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f"
            ],
            "version": "==1.14.4"
        },
        "chardet": {
            "hashes": [
                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
            ],
            "markers": "python_version >= '3.1'",
            "version": "==3.0.4"
        },
        "coloredlogs": {
            "hashes": [
                "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a",
                "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505",
                "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
            "version": "==14.0"
        },
        "cryptography": {
            "hashes": [
                "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538",
                "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f",
                "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d",
                "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77",
                "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b",
                "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33",
                "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e",
                "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb",
                "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e",
                "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b",
                "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7",
                "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297",
                "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d",
                "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7",
                "sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b",
                "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7",
                "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4",
                "sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8",
                "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b",
                "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851",
                "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13",
                "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b",
                "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3",
                "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
            "version": "==3.2.1"
        },
        "dateparser": {
            "hashes": [
                "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b",
@@ -123,6 +211,14 @@
            "index": "pypi",
            "version": "==20.0.4"
        },
        "humanfriendly": {
            "hashes": [
                "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12",
                "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
            "version": "==8.2"
        },
        "imap-tools": {
            "hashes": [
                "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246",
@@ -131,6 +227,13 @@
            "index": "pypi",
            "version": "==0.32.0"
        },
        "img2pdf": {
            "hashes": [
                "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5",
                "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9"
            ],
            "version": "==0.4.0"
        },
        "inotify-simple": {
            "hashes": [
                "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128",
@@ -164,6 +267,51 @@
            "index": "pypi",
            "version": "==1.0.8"
        },
        "lxml": {
            "hashes": [
                "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
                "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
                "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
                "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
                "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
                "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
                "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
                "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
                "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
                "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
                "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
                "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
                "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
                "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
                "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
                "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
                "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
                "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
                "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
                "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
                "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
                "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
                "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
                "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
                "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
                "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
                "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
                "sha256:91d6dace31b07ab47eeadd3f4384ded2f77b94b30446410cb2c3e660e047f7a7",
                "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
                "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
                "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
                "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
                "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
                "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
                "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
                "sha256:e1dbb88a937126ab14d219a000728224702e0ec0fc7ceb7131c53606b7a76772",
                "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
                "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
                "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
            "version": "==4.6.2"
        },
        "numpy": {
            "hashes": [
                "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db",
@@ -205,6 +353,14 @@
            "markers": "python_version >= '3.6'",
            "version": "==1.19.4"
        },
        "ocrmypdf": {
            "hashes": [
                "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3",
                "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928"
            ],
            "index": "pypi",
            "version": "==11.3.3"
        },
        "pathtools": {
            "hashes": [
                "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0",
@@ -220,6 +376,14 @@
            "index": "pypi",
            "version": "==2.3.0"
        },
        "pdfminer.six": {
            "hashes": [
                "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a",
                "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509"
            ],
            "markers": "python_version >= '3.4'",
            "version": "==20201018"
        },
        "pdftotext": {
            "hashes": [
                "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6"
@@ -227,6 +391,33 @@
            "index": "pypi",
            "version": "==2.1.5"
        },
        "pikepdf": {
            "hashes": [
                "sha256:0829bd5dacd73bb4a37e7575bae523f49603479755563c92ddb55c206700cab1",
                "sha256:0d2b631077cd6af6e4d1b396208020705842610a6f13fab489d5f9c47916baa2",
                "sha256:21c98af08fae4ac9fbcad02b613b6768a4ca300fda4cba867f4a4b6f73c2d04b",
                "sha256:2240372fed30124ddc35b0c15a613f2b687a426ea2f150091e0a0c58cca7a495",
                "sha256:2a97f5f1403e058d217d7f6861cf51fca200c5687bce0d052f5f2fa89b5bfa22",
                "sha256:3faaefca0ae80d19891acec8b0dd5e6235f59f2206d82375eb80d090285e9557",
                "sha256:48ef45b64882901c0d69af3b85d16a19bd0f3e95b43e614fefb53521d8caf36c",
                "sha256:5212fe41f2323fc7356ba67caa39737fe13080562cff37bcbb74a8094076c8d0",
                "sha256:56859c32170663c57bd0658189ce44e180533eebe813853446cd6413810be9eb",
                "sha256:5f8fd1cb3478c5534222018aca24fbbd2bc74460c899bda988ec76722c13caa9",
                "sha256:74300a32c41b3d578772f6933f23a88b19f74484185e71e5225ce2f7ea5aea78",
                "sha256:8cbc946bdd217148f4a9c029fcea62f4ae0f67d5346de4c865f4718cd0ddc37f",
                "sha256:9ceefd30076f732530cf84a1be2ecb2fa9931af932706ded760a6d37c73b96ad",
                "sha256:ad69c170fda41b07a4c6b668a3128e7a759f50d9aebcfcde0ccff1358abe0423",
                "sha256:b715fe182189fb6870fab5b0383bb2fb278c88c46eade346b0f4c1ed8818c09d",
                "sha256:bb01ecf95083ffcb9ad542dc5342ccc1059e46f1395fd966629d36d9cc766b4a",
                "sha256:bd6328547219cf48cefb4e0a1bc54442910594de1c5a5feae847d9ff3c629031",
                "sha256:edb128379bb1dea76b5bdbdacf5657a6e4754bacc2049640762725590d8ed905",
                "sha256:f8e687900557fcd4c51b4e72b9e337fdae9e2c81049d1d80b624bb2e88b5769d",
                "sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
                "sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
            ],
            "markers": "python_version < '3.9'",
            "version": "==2.2.0"
        },
        "pillow": {
            "hashes": [
                "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a",
@@ -262,6 +453,14 @@
            "index": "pypi",
            "version": "==8.0.1"
        },
        "pluggy": {
            "hashes": [
                "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
                "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==0.13.1"
        },
        "psycopg2-binary": {
            "hashes": [
                "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c",
@@ -305,13 +504,13 @@
            "index": "pypi",
            "version": "==2.8.6"
        },
        "pyocr": {
        "pycparser": {
            "hashes": [
                "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179",
                "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301"
                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
            ],
            "index": "pypi",
            "version": "==0.7.2"
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==2.20"
        },
        "python-dateutil": {
            "hashes": [
@@ -419,6 +618,53 @@
            ],
            "version": "==2020.11.13"
        },
        "reportlab": {
            "hashes": [
                "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540",
                "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c",
                "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030",
                "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f",
                "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5",
                "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e",
                "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89",
                "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2",
                "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f",
                "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969",
                "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53",
                "sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d",
                "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d",
                "sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457",
                "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec",
                "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f",
                "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778",
                "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92",
                "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6",
                "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077",
                "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673",
                "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b",
                "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48",
                "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95",
                "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94",
                "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589",
                "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9",
                "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8",
                "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d",
                "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32",
                "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b",
                "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a",
                "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2",
                "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967",
                "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5",
                "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437",
                "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8",
                "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab",
                "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e",
                "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a",
                "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8",
                "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08"
            ],
            "version": "==3.5.55"
        },
        "scikit-learn": {
            "hashes": [
                "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e",
@@ -482,6 +728,13 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==1.15.0"
        },
        "sortedcontainers": {
            "hashes": [
                "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f",
                "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1"
            ],
            "version": "==2.3.0"
        },
        "sqlparse": {
            "hashes": [
                "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
@@ -498,6 +751,14 @@
            "markers": "python_version >= '3.5'",
            "version": "==2.1.0"
        },
        "tqdm": {
            "hashes": [
                "sha256:5c0d04e06ccc0da1bd3fa5ae4550effcce42fcad947b4a6cafa77bdc9b09ff22",
                "sha256:9e7b8ab0ecbdbf0595adadd5f0ebbb9e69010e0bd48bbb0c15e550bf2a5292df"
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==4.54.0"
        },
        "tzlocal": {
            "hashes": [
                "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
@@ -589,6 +850,7 @@
                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
            ],
            "markers": "python_version >= '3.1'",
            "version": "==3.0.4"
        },
        "coverage": {
@@ -711,22 +973,6 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==1.2.0"
        },
        "importlib-metadata": {
            "hashes": [
                "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132",
                "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c"
            ],
            "markers": "python_version < '3.8'",
            "version": "==2.1.0"
        },
        "importlib-resources": {
            "hashes": [
                "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592",
                "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5"
            ],
            "markers": "python_version < '3.7'",
            "version": "==3.3.0"
        },
        "iniconfig": {
            "hashes": [
                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@@ -1038,14 +1284,6 @@
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==20.2.1"
        },
        "zipp": {
            "hashes": [
                "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
                "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
            ],
            "markers": "python_version < '3.8'",
            "version": "==3.4.0"
        }
    }
}
@@ -152,6 +152,117 @@ PAPERLESS_AUTO_LOGIN_USERNAME=<username>

    Defaults to none, which disables this feature.

OCR settings
############

Paperless uses `OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/>`_ for
performing OCR on documents and images. Paperless uses sensible defaults for
most settings, but all of them can be configured to your needs.

PAPERLESS_OCR_LANGUAGE=<lang>
    Customize the language that paperless will attempt to use when
    parsing documents.

    It should be a 3-letter language code consistent with ISO
    639: https://www.loc.gov/standards/iso639-2/php/code_list.php

    Set this to the language most of your documents are written in.

    This can be a combination of multiple languages such as ``deu+eng``,
    in which case tesseract will use whichever language matches best.
    Keep in mind that tesseract uses much more CPU time with multiple
    languages enabled.

    Defaults to "eng".

PAPERLESS_OCR_MODE=<mode>
    Tell paperless when and how to perform OCR on your documents. Four modes
    are available:

    * ``skip``: Paperless skips all pages and will perform OCR only on pages
      where no text is present. This is the safest and fastest option.
    * ``skip_noarchive``: In addition to skip, paperless won't create an
      archived version of your documents when it finds any text in them.
    * ``redo``: Paperless will OCR all pages of your documents and attempt to
      replace any existing text layers with new text. This is useful for
      documents from scanners that already performed OCR with insufficient
      results. It will also perform OCR on purely digital documents.

      This option may fail on some documents that have features that cannot
      be removed, such as forms. In this case, the text from the document is
      used instead.
    * ``force``: Paperless rasterizes your documents, converting any text
      into images, and puts the OCRed text on top. This works for all
      documents, however, the resulting document may be significantly larger
      and text won't appear as sharp when zoomed in.

    The default is ``skip``, which only performs OCR when necessary.

PAPERLESS_OCR_OUTPUT_TYPE=<type>
    Specify the type of PDF documents that paperless should produce.

    * ``pdf``: Modify the PDF document as little as possible.
    * ``pdfa``: Convert PDF documents into PDF/A-2b documents, which is a
      subset of the entire PDF specification and meant for storing
      documents long term.
    * ``pdfa-1``, ``pdfa-2``, ``pdfa-3`` to specify the exact version of
      PDF/A you wish to use.

    If not specified, ``pdfa`` is used. Remember that paperless also keeps
    the original input file as well as the archived version.

PAPERLESS_OCR_PAGES=<num>
    Tells paperless to use only the specified number of pages for OCR.
    Documents with fewer than the specified number of pages get OCRed
    completely.

    Specifying 1 here will only use the first page.

    When combined with ``PAPERLESS_OCR_MODE=redo`` or
    ``PAPERLESS_OCR_MODE=force``, paperless will not modify any text it
    finds on excluded pages and will copy it verbatim.

    Defaults to 0, which disables this feature and always uses all pages.

PAPERLESS_OCR_IMAGE_DPI=<num>
    Paperless will OCR any images you put into the system and convert them
    into PDF documents. This is useful if your scanner produces images.
    In order to do so, paperless needs to know the DPI of the image.
    Most images from scanners will have this information embedded and
    paperless will detect and use that information. In case this fails, it
    uses this value as a fallback.

    Set this to the DPI your scanner produces images at.

    Default is none, which causes paperless to fail if no DPI information is
    present in an image.

PAPERLESS_OCR_USER_ARG=<json>
    OCRmyPDF offers many more options. Use this parameter to specify any
    additional arguments you wish to pass to OCRmyPDF. Since Paperless uses
    the API of OCRmyPDF, you have to specify these in a format that can be
    passed to the API. See
    https://ocrmypdf.readthedocs.io/en/latest/api.html#reference
    for valid parameters. All command line options are supported, but they
    use underscores instead of dashes.

    .. caution::

        Paperless has been tested to work with the OCR options provided
        above. There are many options that are incompatible with each other,
        so specifying invalid options may prevent paperless from consuming
        any documents.

    Specify arguments as a JSON dictionary. Keep note of lower-case booleans
    and double-quoted parameter names and strings. Example:

    .. code:: json

        {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}
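As a hedged illustration of how such a dictionary maps onto the OCRmyPDF API (the file names below are invented): the JSON keys simply become keyword arguments of ``ocrmypdf.ocr()``. As the caution above notes, some options only make sense in combination with others.

.. code:: python

    import json

    import ocrmypdf

    # The example dictionary from above; booleans are lower case and keys
    # are double quoted because the value must be valid JSON.
    user_args = json.loads(
        '{"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}'
    )

    # Keyword names are the command line options with underscores instead
    # of dashes, which is the convention the setting description refers to.
    ocrmypdf.ocr("input.pdf", "output.pdf", language="eng", **user_args)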
Software tweaks
###############

@@ -193,37 +304,6 @@ PAPERLESS_TIME_ZONE=<timezone>
    Defaults to UTC.

PAPERLESS_OCR_PAGES=<num>
    Tells paperless to use only the specified number of pages for OCR.
    Documents with fewer than the specified number of pages get OCRed
    completely.

    Specifying 1 here will only use the first page.

    Defaults to 0, which disables this feature and always uses all pages.

PAPERLESS_OCR_LANGUAGE=<lang>
    Customize the default language that tesseract will attempt to use when
    parsing documents. The default language is used whenever

    * No language could be detected on a document
    * No tesseract data files are available for the detected language

    It should be a 3-letter language code consistent with ISO
    639: https://www.loc.gov/standards/iso639-2/php/code_list.php

    Set this to the language most of your documents are written in.

    Defaults to "eng".

PAPERLESS_OCR_ALWAYS=<bool>
    By default Paperless does not OCR a document if the text can be retrieved
    from the document directly. Set to true to always OCR documents.

    Defaults to false.

PAPERLESS_CONSUMER_POLLING=<num>
    If paperless does not find documents added to your consume folder, it
    might not be able to automatically detect filesystem changes. In that
    case,
@@ -261,18 +341,6 @@ PAPERLESS_CONVERT_TMPDIR=<path>

    Default is none, which disables the temporary directory.

PAPERLESS_CONVERT_DENSITY=<num>
    This setting has a high impact on the physical size of tmp page files,
    the speed of document conversion, and can affect the accuracy of OCR
    results. Individual results can vary and this setting should be tested
    thoroughly against the documents you are importing to see if it has any
    impact, either negative or positive.
    Testing on limited document sets has shown a setting of 200 can cut the
    size of tmp files by 1/3 and speed up conversion by up to 4x,
    with little impact on OCR accuracy.

    Default is 300.

PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
    Use optipng to optimize thumbnails. This usually reduces the size of
    thumbnails by about 20%, but uses considerable compute time during
@@ -319,8 +387,5 @@ PAPERLESS_CONVERT_BINARY=<path>
PAPERLESS_GS_BINARY=<path>
    Defaults to "/usr/bin/gs".

PAPERLESS_UNPAPER_BINARY=<path>
    Defaults to "/usr/bin/unpaper".

PAPERLESS_OPTIPNG_BINARY=<path>
    Defaults to "/usr/bin/optipng".
@@ -31,19 +31,24 @@
#PAPERLESS_STATIC_URL=/static/
#PAPERLESS_AUTO_LOGIN_USERNAME=

# OCR settings

#PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_MODE=skip
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
#PAPERLESS_OCR_PAGES=1
#PAPERLESS_OCR_IMAGE_DPI=300
#PAPERLESS_OCR_USER_ARG={}
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless

# Software tweaks

#PAPERLESS_TASK_WORKERS=1
#PAPERLESS_THREADS_PER_WORKER=1
#PAPERLESS_TIME_ZONE=UTC
#PAPERLESS_OCR_PAGES=1
#PAPERLESS_OCR_LANGUAGE=eng
#PAPERLESS_OCR_ALWAYS=false
#PAPERLESS_CONSUMER_POLLING=10
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
#PAPERLESS_CONVERT_DENSITY=300
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
#PAPERLESS_FILENAME_DATE_ORDER=YMD
@@ -53,5 +58,4 @@

#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
#PAPERLESS_GS_BINARY=/usr/bin/gs
#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng
@@ -5,12 +5,26 @@
    </svg>
    <span class="d-none d-lg-inline"> Delete</span>
</button>
<a [href]="downloadUrl" class="btn btn-sm btn-outline-primary mr-2">
    <svg class="buttonicon" fill="currentColor">
        <use xlink:href="assets/bootstrap-icons.svg#download" />
    </svg>
    <span class="d-none d-lg-inline"> Download</span>
</a>

<div class="btn-group mr-2">

    <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary">
        <svg class="buttonicon" fill="currentColor">
            <use xlink:href="assets/bootstrap-icons.svg#download" />
        </svg>
        <span class="d-none d-lg-inline"> Download</span>
    </a>

    <div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version">
        <button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button>
        <div class="dropdown-menu" ngbDropdownMenu>
            <a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a>
        </div>
    </div>

</div>

<button type="button" class="btn btn-sm btn-outline-primary" (click)="close()">
    <svg class="buttonicon" fill="currentColor">
        <use xlink:href="assets/bootstrap-icons.svg#x" />
@@ -4,6 +4,7 @@ import { ActivatedRoute, Router } from '@angular/router';
import { NgbModal } from '@ng-bootstrap/ng-bootstrap';
import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent';
import { PaperlessDocument } from 'src/app/data/paperless-document';
import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata';
import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
import { DocumentListViewService } from 'src/app/services/document-list-view.service';
import { OpenDocumentsService } from 'src/app/services/open-documents.service';
@@ -23,9 +24,11 @@ export class DocumentDetailComponent implements OnInit {

  documentId: number
  document: PaperlessDocument
  metadata: PaperlessDocumentMetadata
  title: string
  previewUrl: string
  downloadUrl: string
  downloadOriginalUrl: string

  correspondents: PaperlessCorrespondent[]
  documentTypes: PaperlessDocumentType[]
@@ -62,6 +65,7 @@ export class DocumentDetailComponent implements OnInit {
      this.documentId = +paramMap.get('id')
      this.previewUrl = this.documentsService.getPreviewUrl(this.documentId)
      this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId)
      this.downloadOriginalUrl = this.documentsService.getDownloadUrl(this.documentId, true)
      if (this.openDocumentService.getOpenDocument(this.documentId)) {
        this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId))
      } else {
@@ -76,6 +80,9 @@ export class DocumentDetailComponent implements OnInit {

  updateComponent(doc: PaperlessDocument) {
    this.document = doc
    this.documentsService.getMetadata(doc.id).subscribe(result => {
      this.metadata = result
    })
    this.title = doc.title
    this.documentForm.patchValue(doc)
  }
11 src-ui/src/app/data/paperless-document-metadata.ts Normal file
@@ -0,0 +1,11 @@
export interface PaperlessDocumentMetadata {

  paperless__checksum?: string

  paperless__mime_type?: string

  paperless__filename?: string

  paperless__has_archive_version?: boolean

}
@@ -1,5 +1,6 @@
import { Injectable } from '@angular/core';
import { PaperlessDocument } from 'src/app/data/paperless-document';
import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata';
import { AbstractPaperlessService } from './abstract-paperless-service';
import { HttpClient } from '@angular/common/http';
import { Observable } from 'rxjs';
@@ -50,20 +51,32 @@ export class DocumentService extends AbstractPaperlessService<PaperlessDocument>
    return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules))
  }

  getPreviewUrl(id: number): string {
    return this.getResourceUrl(id, 'preview')
  getPreviewUrl(id: number, original: boolean = false): string {
    let url = this.getResourceUrl(id, 'preview')
    if (original) {
      url += "?original=true"
    }
    return url
  }

  getThumbUrl(id: number): string {
    return this.getResourceUrl(id, 'thumb')
  }

  getDownloadUrl(id: number): string {
    return this.getResourceUrl(id, 'download')
  getDownloadUrl(id: number, original: boolean = false): string {
    let url = this.getResourceUrl(id, 'download')
    if (original) {
      url += "?original=true"
    }
    return url
  }

  uploadDocument(formData) {
    return this.http.post(this.getResourceUrl(null, 'post_document'), formData)
  }

  getMetadata(id: number): Observable<PaperlessDocumentMetadata> {
    return this.http.get<PaperlessDocumentMetadata>(this.getResourceUrl(id, 'metadata'))
  }

}
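A hedged client-side sketch of what the new ``original`` flag does at the HTTP level: the same download endpoint serves the archived PDF by default and the original upload when ``?original=true`` is appended. Host, port and document id below are invented for illustration.

```python
import requests

doc_id = 123  # invented document id
url = f"http://localhost:8000/api/documents/{doc_id}/download/"

archived = requests.get(url)                               # archived PDF
original = requests.get(url, params={"original": "true"})  # original upload
```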
@@ -6,6 +6,7 @@ import os
import magic
from django.conf import settings
from django.db import transaction
from django.db.models import Q
from django.utils import timezone

from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
@@ -13,7 +14,7 @@ from .file_handling import create_source_path_directory
from .loggers import LoggingMixin
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
from .parsers import ParseError, get_parser_class_for_mime_type, \
    get_supported_file_extensions
    get_supported_file_extensions, parse_date
from .signals import (
    document_consumption_finished,
    document_consumption_started
@@ -58,7 +59,7 @@ class Consumer(LoggingMixin):
    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        if Document.objects.filter(checksum=checksum).exists():
        if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
            if settings.CONSUMER_DELETE_DUPLICATES:
                os.unlink(self.path)
            raise ConsumerError(
@@ -69,6 +70,7 @@ class Consumer(LoggingMixin):
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

    def try_consume_file(self,
                         path,
@@ -124,7 +126,7 @@ class Consumer(LoggingMixin):

        # This doesn't parse the document yet, but gives us a parser.

        document_parser = parser_class(self.path, self.logging_group)
        document_parser = parser_class(self.logging_group)

        # However, this already created working directories which we have to
        # clean up.
@@ -132,13 +134,24 @@
        # Parse the document. This may take some time.

        try:
            self.log("debug", f"Generating thumbnail for {self.filename}...")
            thumbnail = document_parser.get_optimised_thumbnail()
            self.log("debug", "Parsing {}...".format(self.filename))
            document_parser.parse(self.path, mime_type)

            self.log("debug", f"Generating thumbnail for {self.filename}...")
            thumbnail = document_parser.get_optimised_thumbnail(
                self.path, mime_type)

            text = document_parser.get_text()
            date = document_parser.get_date()
            if not date:
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()

        except ParseError as e:
            document_parser.cleanup()
            self.log(
                "error",
                f"Error while consuming document {self.filename}: {e}")
            raise ConsumerError(e)

        # Prepare the document classifier.
@@ -180,9 +193,24 @@
            # After everything is in the database, copy the files into
            # place. If this fails, we'll also rollback the transaction.

            # TODO: not required, since this is done by the file handling
            # logic
            create_source_path_directory(document.source_path)
            self._write(document, self.path, document.source_path)
            self._write(document, thumbnail, document.thumbnail_path)

            self._write(document.storage_type,
                        self.path, document.source_path)

            self._write(document.storage_type,
                        thumbnail, document.thumbnail_path)

            if archive_path and os.path.isfile(archive_path):
                self._write(document.storage_type,
                            archive_path, document.archive_path)

                with open(archive_path, 'rb') as f:
                    document.archive_checksum = hashlib.md5(
                        f.read()).hexdigest()
                    document.save()

            # After performing all database operations and moving files
            # into place, tell paperless where the file is.
@@ -195,6 +223,11 @@
            self.log("debug", "Deleting file {}".format(self.path))
            os.unlink(self.path)
        except Exception as e:
            self.log(
                "error",
                f"The following error occurred while consuming "
                f"{self.filename}: {e}"
            )
            raise ConsumerError(e)
        finally:
            document_parser.cleanup()
@@ -259,7 +292,7 @@
        for tag_id in self.override_tag_ids:
            document.tags.add(Tag.objects.get(pk=tag_id))

    def _write(self, document, source, target):
    def _write(self, storage_type, source, target):
        with open(source, "rb") as read_file:
            with open(target, "wb") as write_file:
                write_file.write(read_file.read())
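To summarize the consumer changes, here is a hedged sketch (not the actual consumer code) of the new parser contract: construction takes only a logging group, and ``parse()`` receives the path and MIME type explicitly, so one parser instance drives text, date, thumbnail and archive generation.

```python
import uuid

from documents.parsers import get_parser_class_for_mime_type, parse_date


def consume_sketch(path, filename, mime_type):
    # Parser construction no longer takes the document path; parse() does.
    parser_class = get_parser_class_for_mime_type(mime_type)
    parser = parser_class(logging_group=uuid.uuid4())
    try:
        parser.parse(path, mime_type)
        thumbnail = parser.get_optimised_thumbnail(path, mime_type)
        text = parser.get_text()
        # Fall back to filename/text based date detection, as above.
        date = parser.get_date() or parse_date(filename, text)
        archive_path = parser.get_archive_path()
        return text, date, thumbnail, archive_path
    finally:
        parser.cleanup()
```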
@@ -10,10 +10,13 @@ def create_source_path_directory(source_path):
    os.makedirs(os.path.dirname(source_path), exist_ok=True)


def delete_empty_directories(directory):
def delete_empty_directories(directory, root):
    if not os.path.isdir(directory):
        return

    # Go up in the directory hierarchy and try to delete all directories
    directory = os.path.normpath(directory)
    root = os.path.normpath(settings.ORIGINALS_DIR)
    root = os.path.normpath(root)

    if not directory.startswith(root + os.path.sep):
        # don't do anything outside our originals folder.
@@ -101,3 +104,8 @@ def generate_filename(doc):
        filename += ".gpg"

    return filename


def archive_name_from_filename(filename):

    return os.path.splitext(filename)[0] + ".pdf"
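Behaviour of ``archive_name_from_filename`` above, shown as a short sketch: whatever the original extension, the archived counterpart gets ``.pdf``.

```python
from documents.file_handling import archive_name_from_filename

assert archive_name_from_filename("invoice.jpg") == "invoice.pdf"
assert archive_name_from_filename("0000042.pdf") == "0000042.pdf"
```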
89 src/documents/management/commands/document_archiver.py Normal file
@@ -0,0 +1,89 @@
import hashlib
import multiprocessing

import logging
import os
import shutil
import uuid

from django.conf import settings
from django.core.management.base import BaseCommand
from whoosh.writing import AsyncWriter

from documents.models import Document
from ... import index
from ...mixins import Renderable
from ...parsers import get_parser_class_for_mime_type


def handle_document(document):
    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    parser = parser_class(logging_group=uuid.uuid4())
    parser.parse(document.source_path, mime_type)
    if parser.get_archive_path():
        shutil.copy(parser.get_archive_path(), document.archive_path)
        with document.archive_file as f:
            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
    else:
        logging.getLogger(__name__).warning(
            f"Parser {parser} did not produce an archived document "
            f"for {document.file_name}"
        )

    if parser.get_text():
        document.content = parser.get_text()
        document.save()

    parser.cleanup()


class Command(Renderable, BaseCommand):

    help = """
        Using the current classification model, assigns correspondents, tags
        and document types to all documents, effectively allowing you to
        back-tag all previously indexed documents with metadata created (or
        modified) after their initial import.
    """.replace("    ", "")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        BaseCommand.__init__(self, *args, **kwargs)

    def add_arguments(self, parser):
        parser.add_argument(
            "-f", "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
                 "have an archived version."
        )

    def handle(self, *args, **options):

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        documents = Document.objects.all()

        documents_to_process = filter(
            lambda d: overwrite or not os.path.exists(d.archive_path),
            documents
        )

        with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
            list(
                pool.imap(
                    handle_document,
                    list(documents_to_process)
                )
            )

        ix = index.open_index()
        with AsyncWriter(ix) as writer:
            for d in documents_to_process:
                index.update_document(writer, d)
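One hedged observation about the command above: ``documents_to_process`` is a ``filter`` object, i.e. a one-shot iterator, and ``pool.imap`` already consumes it before the index loop runs. A list-based variant that can safely be iterated twice might look like this:

```python
import os


def documents_needing_archive(documents, overwrite):
    # A list (unlike a filter() object) can be handed to the worker pool
    # and then iterated again for the index update.
    return [
        d for d in documents
        if overwrite or not os.path.exists(d.archive_path)
    ]
```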
@@ -7,7 +7,8 @@ from django.core import serializers
from django.core.management.base import BaseCommand, CommandError

from documents.models import Document, Correspondent, Tag, DocumentType
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
    EXPORTER_ARCHIVE_NAME
from paperless.db import GnuPG
from ...mixins import Renderable

@@ -54,7 +55,6 @@ class Command(Renderable, BaseCommand):
            document = document_map[document_dict["pk"]]

            unique_filename = f"{document.pk:07}_{document.file_name}"

            file_target = os.path.join(self.target, unique_filename)

            thumbnail_name = unique_filename + "-thumbnail.png"
@@ -63,6 +63,14 @@
            document_dict[EXPORTER_FILE_NAME] = unique_filename
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

            if os.path.exists(document.archive_path):
                archive_name = \
                    f"{document.pk:07}_archive_{document.archive_file_name}"
                archive_target = os.path.join(self.target, archive_name)
                document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
            else:
                archive_target = None

            print(f"Exporting: {file_target}")

            t = int(time.mktime(document.created.timetuple()))
@@ -76,11 +84,18 @@
                    f.write(GnuPG.decrypted(document.thumbnail_file))
                os.utime(thumbnail_target, times=(t, t))

                if archive_target:
                    with open(archive_target, "wb") as f:
                        f.write(GnuPG.decrypted(document.archive_path))
                    os.utime(archive_target, times=(t, t))
            else:

                shutil.copy(document.source_path, file_target)
                shutil.copy(document.thumbnail_path, thumbnail_target)

                if archive_target:
                    shutil.copy(document.archive_path, archive_target)

            manifest += json.loads(
                serializers.serialize("json", Correspondent.objects.all()))
@@ -7,8 +7,8 @@ from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError

from documents.models import Document
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
from paperless.db import GnuPG
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
    EXPORTER_ARCHIVE_NAME
from ...file_handling import generate_filename, create_source_path_directory
from ...mixins import Renderable

@@ -79,23 +79,41 @@ class Command(Renderable, BaseCommand):
                    'appear to be in the source directory.'.format(doc_file)
                )

            if EXPORTER_ARCHIVE_NAME in record:
                archive_file = record[EXPORTER_ARCHIVE_NAME]
                if not os.path.exists(os.path.join(self.source, archive_file)):
                    raise CommandError(
                        f"The manifest file refers to {archive_file} which "
                        f"does not appear to be in the source directory."
                    )

    def _import_files_from_manifest(self):

        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

        for record in self.manifest:

            if not record["model"] == "documents.document":
                continue

            doc_file = record[EXPORTER_FILE_NAME]
            thumb_file = record[EXPORTER_THUMBNAIL_NAME]
            document = Document.objects.get(pk=record["pk"])

            doc_file = record[EXPORTER_FILE_NAME]
            document_path = os.path.join(self.source, doc_file)

            thumb_file = record[EXPORTER_THUMBNAIL_NAME]
            thumbnail_path = os.path.join(self.source, thumb_file)

            document.storage_type = storage_type
            if EXPORTER_ARCHIVE_NAME in record:
                archive_file = record[EXPORTER_ARCHIVE_NAME]
                archive_path = os.path.join(self.source, archive_file)
            else:
                archive_path = None

            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

            document.filename = generate_filename(document)

            if os.path.isfile(document.source_path):
@@ -106,5 +124,7 @@
            print(f"Moving {document_path} to {document.source_path}")
            shutil.copy(document_path, document.source_path)
            shutil.copy(thumbnail_path, document.thumbnail_path)
            if archive_path:
                shutil.copy(archive_path, document.archive_path)

            document.save()
23 src/documents/migrations/1005_checksums.py Normal file
@@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2020-11-29 00:48

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1004_sanity_check_schedule'),
    ]

    operations = [
        migrations.AddField(
            model_name='document',
            name='archive_checksum',
            field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
        ),
        migrations.AlterField(
            model_name='document',
            name='checksum',
            field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
        ),
    ]
@@ -11,6 +11,7 @@ from django.db import models
from django.utils import timezone
from django.utils.text import slugify

from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension


@@ -158,9 +159,15 @@ class Document(models.Model):
        max_length=32,
        editable=False,
        unique=True,
        help_text="The checksum of the original document (before it was "
                  "encrypted). We use this to prevent duplicate document "
                  "imports."
        help_text="The checksum of the original document."
    )

    archive_checksum = models.CharField(
        max_length=32,
        editable=False,
        blank=True,
        null=True,
        help_text="The checksum of the archived document."
    )

    created = models.DateTimeField(
@@ -225,10 +232,30 @@
    def source_file(self):
        return open(self.source_path, "rb")

    @property
    def archive_path(self):
        if self.filename:
            fname = archive_name_from_filename(self.filename)
        else:
            fname = "{:07}.pdf".format(self.pk)

        return os.path.join(
            settings.ARCHIVE_DIR,
            fname
        )

    @property
    def archive_file(self):
        return open(self.archive_path, "rb")

    @property
    def file_name(self):
        return slugify(str(self)) + self.file_type

    @property
    def archive_file_name(self):
        return slugify(str(self)) + ".pdf"

    @property
    def file_type(self):
        return get_default_file_extension(self.mime_type)
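A hedged sketch of how the new ``archive_path`` property above resolves both branches; the archive directory below is invented for illustration:

```python
import os

ARCHIVE_DIR = "/data/media/documents/archive"  # invented for illustration


def archive_path(filename, pk):
    # Mirrors the property above: prefer the stored filename (swapped to a
    # .pdf extension), otherwise fall back to the zero-padded primary key.
    if filename:
        fname = os.path.splitext(filename)[0] + ".pdf"
    else:
        fname = "{:07}.pdf".format(pk)
    return os.path.join(ARCHIVE_DIR, fname)


assert archive_path("invoice.jpg", 42).endswith("invoice.pdf")
assert archive_path(None, 42).endswith("0000042.pdf")
```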
@@ -131,21 +131,59 @@ def run_convert(input_file,
    raise ParseError("Convert failed at {}".format(args))


def run_unpaper(pnm, logging_group=None):
    pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
def parse_date(filename, text):
    """
    Returns the date of the document.
    """

    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
                    pnm_out)
    def __parser(ds, date_order):
        """
        Call dateparser.parse with a particular date ordering
        """
        return dateparser.parse(
            ds,
            settings={
                "DATE_ORDER": date_order,
                "PREFER_DAY_OF_MONTH": "first",
                "RETURN_AS_TIMEZONE_AWARE":
                True
            }
        )

    logger.debug(f"Execute: {' '.join(command_args)}",
                 extra={'group': logging_group})
    date = None

    if not subprocess.Popen(command_args,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL).wait() == 0:
        raise ParseError(f"Unpaper failed at {command_args}")
    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit

    return pnm_out
    # if filename date parsing is enabled, search there first:
    if settings.FILENAME_DATE_ORDER:
        for m in re.finditer(DATE_REGEX, filename):
            date_string = m.group(0)

            try:
                date = __parser(date_string, settings.FILENAME_DATE_ORDER)
            except (TypeError, ValueError):
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None and next_year > date.year > 1900:
                return date

    # Iterate through all regex matches in text and try to parse the date
    for m in re.finditer(DATE_REGEX, text):
        date_string = m.group(0)

        try:
            date = __parser(date_string, settings.DATE_ORDER)
        except (TypeError, ValueError):
            # Skip all matches that do not parse to a proper date
            continue

        if date is not None and next_year > date.year > 1900:
            break
    else:
        date = None

    return date
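A hedged usage sketch of the module-level ``parse_date()`` above: it reads Django settings (``FILENAME_DATE_ORDER``, ``DATE_ORDER``), so a configured settings module is assumed; the filename and text below are invented.

```python
from documents.parsers import parse_date

# Checks the filename first (when FILENAME_DATE_ORDER is set), then the text.
date = parse_date("scan_2020-11-29.pdf", "Invoice date: 29.11.2020")
if date:
    print(date.isoformat())
```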
class ParseError(Exception):
|
||||
@@ -158,26 +196,35 @@ class DocumentParser(LoggingMixin):
    `paperless_tesseract.parsers` for inspiration.
    """

    def __init__(self, path, logging_group):
    def __init__(self, logging_group):
        super().__init__()
        self.logging_group = logging_group
        self.document_path = path
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)

    def get_thumbnail(self):
        self.archive_path = None
        self.text = None
        self.date = None

    def parse(self, document_path, mime_type):
        raise NotImplementedError()

    def get_archive_path(self):
        return self.archive_path

    def get_thumbnail(self, document_path, mime_type):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

    def optimise_thumbnail(self, in_path):

    def get_optimised_thumbnail(self, document_path, mime_type):
        thumbnail = self.get_thumbnail(document_path, mime_type)
        if settings.OPTIMIZE_THUMBNAILS:
            out_path = os.path.join(self.tempdir, "optipng.png")
            out_path = os.path.join(self.tempdir, "thumb_optipng.png")

            args = (settings.OPTIPNG_BINARY,
                    "-silent", "-o5", in_path, "-out", out_path)
                    "-silent", "-o5", thumbnail, "-out", out_path)

            self.log('debug', f"Execute: {' '.join(args)}")

@@ -186,97 +233,13 @@ class DocumentParser(LoggingMixin):

            return out_path
        else:
            return in_path

    def get_optimised_thumbnail(self):
        return self.optimise_thumbnail(self.get_thumbnail())
        return thumbnail

    def get_text(self):
        """
        Returns the text from the document and only the text.
        """
        raise NotImplementedError()
        return self.text

    def get_date(self):
        """
        Returns the date of the document.
        """

        def __parser(ds, date_order):
            """
            Call dateparser.parse with a particular date ordering
            """
            return dateparser.parse(
                ds,
                settings={
                    "DATE_ORDER": date_order,
                    "PREFER_DAY_OF_MONTH": "first",
                    "RETURN_AS_TIMEZONE_AWARE":
                    True
                }
            )

        date = None
        date_string = None

        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
        title = os.path.basename(self.document_path)

        # if filename date parsing is enabled, search there first:
        if settings.FILENAME_DATE_ORDER:
            self.log("info", "Checking document title for date")
            for m in re.finditer(DATE_REGEX, title):
                date_string = m.group(0)

                try:
                    date = __parser(date_string, settings.FILENAME_DATE_ORDER)
                except (TypeError, ValueError):
                    # Skip all matches that do not parse to a proper date
                    continue

                if date is not None and next_year > date.year > 1900:
                    self.log(
                        "info",
                        "Detected document date {} based on string {} "
                        "from document title"
                        "".format(date.isoformat(), date_string)
                    )
                    return date

        try:
            # getting text after checking filename will save time if only
            # looking at the filename instead of the whole text
            text = self.get_text()
        except ParseError:
            return None

        # Iterate through all regex matches in text and try to parse the date
        for m in re.finditer(DATE_REGEX, text):
            date_string = m.group(0)

            try:
                date = __parser(date_string, settings.DATE_ORDER)
            except (TypeError, ValueError):
                # Skip all matches that do not parse to a proper date
                continue

            if date is not None and next_year > date.year > 1900:
                break
        else:
            date = None

        if date is not None:
            self.log(
                "info",
                "Detected document date {} based on string {}".format(
                    date.isoformat(),
                    date_string
                )
            )
        else:
            self.log("info", "Unable to detect date for document")

        return date
        return self.date

    def cleanup(self):
        self.log("debug", "Deleting directory {}".format(self.tempdir))

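A hypothetical sketch of the reworked parser lifecycle: parse() now populates text, archive_path and date on the instance, instead of get_text() computing them lazily. The parser class name and paths are placeholders:

    parser = SomeDocumentParser(logging_group=None)
    try:
        parser.parse("/tmp/input.pdf", "application/pdf")
        text = parser.get_text()             # set by parse()
        archive = parser.get_archive_path()  # may be None if no archive was produced
    finally:
        parser.cleanup()                     # removes the scratch tempdir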
@@ -67,19 +67,34 @@ def check_sanity():
                f"Original of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.source_path))
            checksum = None
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))
            else:
                if not checksum == doc.checksum:
                    messages.append(SanityError(
                        f"Checksum mismatch of document {doc.pk}. "
                        f"Stored: {doc.checksum}, actual: {checksum}."
                    ))

            if checksum and not checksum == doc.checksum:
        if os.path.isfile(doc.archive_path):
            present_files.remove(os.path.normpath(doc.archive_path))
            try:
                with doc.archive_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Checksum mismatch of document {doc.pk}. "
                    f"Stored: {doc.checksum}, actual: {checksum}."
                    f"Cannot read archive file of document {doc.pk}: {e}"
                ))
            else:
                if not checksum == doc.archive_checksum:
                    messages.append(SanityError(
                        f"Checksum mismatch of archive {doc.pk}. "
                        f"Stored: {doc.archive_checksum}, actual: {checksum}."
                    ))

        if not doc.content:
            messages.append(SanityWarning(

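The sanity checker compares MD5 digests over the raw bytes for both files; a standalone sketch of that check:

    import hashlib

    def md5sum(path):
        with open(path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()

    # md5sum(doc.source_path) must equal doc.checksum,
    # md5sum(doc.archive_path) must equal doc.archive_checksum.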
@@ -2,3 +2,4 @@
# for exporting/importing commands
EXPORTER_FILE_NAME = "__exported_file_name__"
EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
EXPORTER_ARCHIVE_NAME = "__exported_archive_name__"

@@ -13,7 +13,7 @@ from rest_framework.reverse import reverse

from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \
    create_source_path_directory
    create_source_path_directory, archive_name_from_filename
from ..models import Document, Tag


@@ -169,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs):

@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
    for f in (instance.source_path, instance.thumbnail_path):
        try:
            os.unlink(f)
        except FileNotFoundError:
            pass  # The file's already gone, so we're cool with it.
    for f in (instance.source_path,
              instance.archive_path,
              instance.thumbnail_path):
        if os.path.isfile(f):
            try:
                os.unlink(f)
                logging.getLogger(__name__).debug(
                    f"Deleted file {f}.")
            except OSError as e:
                logging.getLogger(__name__).warning(
                    f"While deleting document {instance.file_name}, the file "
                    f"{f} could not be deleted: {e}"
                )

    delete_empty_directories(os.path.dirname(instance.source_path))
    delete_empty_directories(
        os.path.dirname(instance.source_path),
        root=settings.ORIGINALS_DIR
    )

    delete_empty_directories(
        os.path.dirname(instance.archive_path),
        root=settings.ARCHIVE_DIR
    )


def validate_move(instance, old_path, new_path):
    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
        logging.getLogger(__name__).fatal(
            f"Document {str(instance)}: File {old_path} has gone.")
        return False

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating file.
        logging.getLogger(__name__).warning(
            f"Document {str(instance)}: Cannot rename file "
            f"since target path {new_path} already exists.")
        return False

    return True


@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@@ -183,55 +216,90 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
def update_filename_and_move_files(sender, instance, **kwargs):

    if not instance.filename:
        # Can't update the filename if there is not filename to begin with
        # This happens after the consumer creates a new document.
        # The PK needs to be set first by saving the document once. When this
        # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
        # renamed anyway. In all other cases, instance.filename will be set.
        # Can't update the filename if there is no filename to begin with
        # This happens when the consumer creates a new document.
        # The document is modified and saved multiple times, and only after
        # everything is done (i.e., the generated filename is final),
        # filename will be set to the location where the consumer has put
        # the file.
        #
        # This will in turn cause this logic to move the file where it belongs.
        return

    old_filename = instance.filename
    old_path = instance.source_path
    new_filename = generate_filename(instance)

    if new_filename == instance.filename:
        # Don't do anything if it's the same.
        return

    new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
    old_source_path = instance.source_path
    new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)

    if not os.path.isfile(old_path):
        # Can't do anything if the old file does not exist anymore.
        logging.getLogger(__name__).fatal(
            f"Document {str(instance)}: File {old_path} has gone.")
    if not validate_move(instance, old_source_path, new_source_path):
        return

    if os.path.isfile(new_path):
        # Can't do anything if the new file already exists. Skip updating file.
        logging.getLogger(__name__).warning(
            f"Document {str(instance)}: Cannot rename file "
            f"since target path {new_path} already exists.")
        return
    # archive files are optional, archive checksum tells us if we have one,
    # since this is None for documents without archived files.
    if instance.archive_checksum:
        new_archive_filename = archive_name_from_filename(new_filename)
        old_archive_path = instance.archive_path
        new_archive_path = os.path.join(settings.ARCHIVE_DIR,
                                        new_archive_filename)

    create_source_path_directory(new_path)
        if not validate_move(instance, old_archive_path, new_archive_path):
            return

        create_source_path_directory(new_archive_path)
    else:
        old_archive_path = None
        new_archive_path = None

    create_source_path_directory(new_source_path)

    try:
        os.rename(old_path, new_path)
        os.rename(old_source_path, new_source_path)
        if instance.archive_checksum:
            os.rename(old_archive_path, new_archive_path)
        instance.filename = new_filename
        # Don't save here to prevent infinite recursion.
        Document.objects.filter(pk=instance.pk).update(filename=new_filename)

        logging.getLogger(__name__).debug(
            f"Moved file {old_path} to {new_path}.")
            f"Moved file {old_source_path} to {new_source_path}.")

        logging.getLogger(__name__).debug(
            f"Moved file {old_archive_path} to {new_archive_path}.")

    except OSError as e:
        instance.filename = old_filename
        # this happens when we can't move a file. If that's the case for the
        # archive file, we try our best to revert the changes.
        try:
            os.rename(new_source_path, old_source_path)
            os.rename(new_archive_path, old_archive_path)
        except:
            # This is fine, since:
            # A: if we managed to move source from A to B, we will also manage
            #    to move it from B to A. If not, we have a serious issue
            #    that's going to get caught by the sanity checker.
            #    all files remain in place and will never be overwritten,
            #    so this is not the end of the world.
            # B: if moving the original file failed, nothing has changed anyway.
            pass
    except DatabaseError as e:
        os.rename(new_path, old_path)
        os.rename(new_source_path, old_source_path)
        if instance.archive_checksum:
            os.rename(new_archive_path, old_archive_path)
        instance.filename = old_filename

    if not os.path.isfile(old_path):
        delete_empty_directories(os.path.dirname(old_path))
    if not os.path.isfile(old_source_path):
        delete_empty_directories(os.path.dirname(old_source_path),
                                 root=settings.ORIGINALS_DIR)

    if old_archive_path and not os.path.isfile(old_archive_path):
        delete_empty_directories(os.path.dirname(old_archive_path),
                                 root=settings.ARCHIVE_DIR)


def set_log_entry(sender, document=None, logging_group=None, **kwargs):

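The move logic above follows a validate-then-rename pattern with a best-effort rollback; a condensed sketch (error handling elided):

    if validate_move(instance, old_source_path, new_source_path):
        try:
            os.rename(old_source_path, new_source_path)
            if instance.archive_checksum:
                os.rename(old_archive_path, new_archive_path)
        except OSError:
            # revert whatever succeeded so original and archive stay paired
            ...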
@@ -12,7 +12,9 @@ from documents.sanity_checker import SanityFailedError


def index_optimize():
    index.open_index().optimize()
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)


def index_reindex():

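For reference, the Whoosh pattern now used by index_optimize(); AsyncWriter commits on a background thread instead of blocking the caller (the index path is illustrative):

    from whoosh.index import open_dir
    from whoosh.writing import AsyncWriter

    ix = open_dir("data/index")
    AsyncWriter(ix).commit(optimize=True)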
BIN
src/documents/tests/samples/documents/archive/0000001.pdf
Normal file
@@ -100,6 +100,44 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_thumbnail)

    def test_download_with_archive(self):

        _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)

        content = b"This is a test"
        content_archive = b"This is the same test but archived"

        with open(filename, "wb") as f:
            f.write(content)

        filename = os.path.basename(filename)

        doc = Document.objects.create(title="none", filename=filename,
                                      mime_type="application/pdf")

        with open(doc.archive_path, "wb") as f:
            f.write(content_archive)

        response = self.client.get('/api/documents/{}/download/'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_archive)

        response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

        response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_archive)

        response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk))

        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content)

    def test_document_actions_not_existing_file(self):

        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")

@@ -1,5 +1,6 @@
import os
import re
import shutil
import tempfile
from unittest import mock
from unittest.mock import MagicMock
@@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase):

class DummyParser(DocumentParser):

    def get_thumbnail(self):
    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        super(DummyParser, self).__init__(path, logging_group)
    def __init__(self, logging_group, scratch_dir, archive_path):
        super(DummyParser, self).__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        self.archive_path = archive_path

    def get_optimised_thumbnail(self):
    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

    def get_text(self):
        return "The Text"
    def parse(self, document_path, mime_type):
        self.text = "The Text"


class FaultyParser(DocumentParser):

    def get_thumbnail(self):
    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

    def __init__(self, path, logging_group, scratch_dir):
        super(FaultyParser, self).__init__(path, logging_group)
    def __init__(self, logging_group, scratch_dir):
        super(FaultyParser, self).__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)

    def get_optimised_thumbnail(self):
    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

    def get_text(self):
    def parse(self, document_path, mime_type):
        raise ParseError("Does not compute.")


@@ -410,11 +412,11 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(DirectoriesMixin, TestCase):

    def make_dummy_parser(self, path, logging_group):
        return DummyParser(path, logging_group, self.dirs.scratch_dir)
    def make_dummy_parser(self, logging_group):
        return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())

    def make_faulty_parser(self, path, logging_group):
        return FaultyParser(path, logging_group, self.dirs.scratch_dir)
    def make_faulty_parser(self, logging_group):
        return FaultyParser(logging_group, self.dirs.scratch_dir)

    def setUp(self):
        super(TestConsumer, self).setUp()
@@ -432,8 +434,16 @@ class TestConsumer(DirectoriesMixin, TestCase):
        self.consumer = Consumer()

    def get_test_file(self):
        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir)
        return f
        src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf")
        dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
        shutil.copy(src, dst)
        return dst

    def get_test_archive_file(self):
        src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf")
        dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf")
        shutil.copy(src, dst)
        return dst

    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
    def testNormalOperation(self):
@@ -455,6 +465,13 @@ class TestConsumer(DirectoriesMixin, TestCase):
            document.thumbnail_path
        ))

        self.assertTrue(os.path.isfile(
            document.archive_path
        ))

        self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
        self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")

        self.assertFalse(os.path.isfile(filename))

    def testOverrideFilename(self):
@@ -502,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase):

        self.fail("Should throw exception")

    def testDuplicates(self):
    def testDuplicates1(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
@@ -513,6 +530,21 @@ class TestConsumer(DirectoriesMixin, TestCase):

        self.fail("Should throw exception")

    def testDuplicates2(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
            self.consumer.try_consume_file(self.get_test_archive_file())
        except ConsumerError as e:
            self.assertTrue(str(e).endswith("It is a duplicate."))
            return

        self.fail("Should throw exception")

    def testDuplicates3(self):
        self.consumer.try_consume_file(self.get_test_archive_file())
        self.consumer.try_consume_file(self.get_test_file())

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testNoParsers(self, m):
        m.return_value = []

140
src/documents/tests/test_date_parsing.py
Normal file
@@ -0,0 +1,140 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4

from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings

from documents.parsers import parse_date
from paperless_tesseract.parsers import RasterisedDocumentParser


class TestDate(TestCase):

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples")
    SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])

    def setUp(self):
        os.makedirs(self.SCRATCH, exist_ok=True)

    def tearDown(self):
        shutil.rmtree(self.SCRATCH)

    def test_date_format_1(self):
        text = "lorem ipsum 130218 lorem ipsum"
        self.assertEqual(parse_date("", text), None)

    def test_date_format_2(self):
        text = "lorem ipsum 2018 lorem ipsum"
        self.assertEqual(parse_date("", text), None)

    def test_date_format_3(self):
        text = "lorem ipsum 20180213 lorem ipsum"
        self.assertEqual(parse_date("", text), None)

    def test_date_format_4(self):
        text = "lorem ipsum 13.02.2018 lorem ipsum"
        date = parse_date("", text)
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_date_format_5(self):
        text = (
            "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
            "ipsum"
        )
        date = parse_date("", text)
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_date_format_6(self):
        text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum"
        )
        self.assertEqual(parse_date("", text), None)

    def test_date_format_7(self):
        text = (
            "lorem ipsum\n"
            "März 2019\n"
            "lorem ipsum"
        )
        date = parse_date("", text)
        self.assertEqual(
            date,
            datetime.datetime(
                2019, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_date_format_8(self):
        text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum\n"
            "März 2020"
        )
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_9(self):
        text = (
            "lorem ipsum\n"
            "27. Nullmonth 2020\n"
            "März 2020\n"
            "lorem ipsum"
        )
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    def test_crazy_date_past(self, *args):
        self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))

    def test_crazy_date_future(self, *args):
        self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))

    def test_crazy_date_with_spaces(self, *args):
        self.assertIsNone(parse_date("", "20 408000l 2475"))

    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_filename_date_parse_invalid(self, *args):
        self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
@@ -1,12 +1,29 @@
import os
import shutil
import tempfile
from pathlib import Path
from unittest import mock

from django.test import TestCase
from django.test import TestCase, override_settings

from ..models import Document, Correspondent


class TestDocument(TestCase):

    def setUp(self) -> None:
        self.originals_dir = tempfile.mkdtemp()
        self.thumb_dir = tempfile.mkdtemp()

        override_settings(
            ORIGINALS_DIR=self.originals_dir,
            THUMBNAIL_DIR=self.thumb_dir,
        ).enable()

    def tearDown(self) -> None:
        shutil.rmtree(self.originals_dir)
        shutil.rmtree(self.thumb_dir)

    def test_file_deletion(self):
        document = Document.objects.create(
            correspondent=Correspondent.objects.create(name="Test0"),
@@ -19,6 +36,9 @@ class TestDocument(TestCase):
        file_path = document.source_path
        thumb_path = document.thumbnail_path

        Path(file_path).touch()
        Path(thumb_path).touch()

        with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
            document.delete()
            mock_unlink.assert_any_call(file_path)

@@ -2,32 +2,17 @@ import os
import shutil
from pathlib import Path
from unittest import mock
from uuid import uuid4

from django.conf import settings
from django.db import DatabaseError
from django.test import TestCase, override_settings

from .utils import DirectoriesMixin
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
from ..models import Document, Correspondent


class TestDate(TestCase):
    deletion_list = []

    def add_to_deletion_list(self, dirname):
        self.deletion_list.append(dirname)

    def setUp(self):
        folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
        os.makedirs(folder + "/documents/originals")
        override_settings(MEDIA_ROOT=folder).enable()
        override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
        self.add_to_deletion_list(folder)

    def tearDown(self):
        for dirname in self.deletion_list:
            shutil.rmtree(dirname, ignore_errors=True)
class TestFileHandling(DirectoriesMixin, TestCase):

    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_generate_source_filename(self):
@@ -104,7 +89,7 @@ class TestDate(TestCase):
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

        os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@@ -140,7 +125,7 @@ class TestDate(TestCase):

        # Check proper handling of files
        self.assertTrue(os.path.isfile(document.source_path))
        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
@@ -196,8 +181,8 @@ class TestDate(TestCase):
        document.save()

        # Check proper handling of files
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
        self.assertTrue(os.path.isfile(important_file))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
@@ -315,13 +300,12 @@ class TestDate(TestCase):
        # Create our working directory
        tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
        os.makedirs(tmp)
        self.add_to_deletion_list(tmp)

        os.makedirs(os.path.join(tmp, "notempty"))
        Path(os.path.join(tmp, "notempty", "file")).touch()
        os.makedirs(os.path.join(tmp, "notempty", "empty"))

        delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
        delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR)
        self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
        self.assertEqual(os.path.isfile(
            os.path.join(tmp, "notempty", "file")), True)
@@ -345,3 +329,159 @@ class TestDate(TestCase):
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        self.assertEqual(generate_filename(document), "0000001.pdf")


class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):

    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
    def test_create_no_format(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_create_with_format(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertFalse(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))
        self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf"))
        self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf"))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_archive_gone(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        #Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertFalse(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_archive_exists(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
        Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.os.rename")
    def test_move_archive_error(self, m):

        def fake_rename(src, dst):
            if "archive" in src:
                raise OSError()
            else:
                os.remove(src)
                Path(dst).touch()

        m.side_effect = fake_rename

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_move_file_gone(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        #Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertFalse(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertFalse(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    @mock.patch("documents.signals.handlers.os.rename")
    def test_move_file_error(self, m):

        def fake_rename(src, dst):
            if "original" in src:
                raise OSError()
            else:
                os.remove(src)
                Path(dst).touch()

        m.side_effect = fake_rename

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

    def test_archive_deleted(self):
        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

        doc.delete()

        self.assertFalse(os.path.isfile(original))
        self.assertFalse(os.path.isfile(archive))
        self.assertFalse(os.path.isfile(doc.source_path))
        self.assertFalse(os.path.isfile(doc.archive_path))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def test_database_error(self):

        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
        Path(original).touch()
        Path(archive).touch()
        doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
        with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
            m.side_effect = DatabaseError()
            doc.save()

        self.assertTrue(os.path.isfile(original))
        self.assertTrue(os.path.isfile(archive))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(os.path.isfile(doc.archive_path))

42
src/documents/tests/test_management_archiver.py
Normal file
@@ -0,0 +1,42 @@
import filecmp
import os
import shutil

from django.core.management import call_command
from django.test import TestCase

from documents.management.commands.document_archiver import handle_document
from documents.models import Document
from documents.tests.utils import DirectoriesMixin


sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")


class TestArchiver(DirectoriesMixin, TestCase):

    def make_models(self):
        self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf")
        #self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
        #self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")

    def test_archiver(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()

        call_command('document_archiver')

    def test_handle_document(self):

        shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
        self.make_models()

        handle_document(self.d1)

        doc = Document.objects.get(id=self.d1.id)

        self.assertIsNotNone(doc.checksum)
        self.assertTrue(os.path.isfile(doc.archive_path))
        self.assertTrue(os.path.isfile(doc.source_path))
        self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
@@ -23,10 +23,7 @@ class TestExporter(DirectoriesMixin, TestCase):

        file = os.path.join(self.dirs.originals_dir, "0000001.pdf")

        with open(file, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()

        Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Tag.objects.create(name="t")
        DocumentType.objects.create(name="dt")
@@ -51,6 +48,14 @@ class TestExporter(DirectoriesMixin, TestCase):
                checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['checksum'])

            if document_exporter.EXPORTER_ARCHIVE_NAME in element:
                fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
                self.assertTrue(os.path.exists(fname))

                with open(fname, "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['archive_checksum'])

        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")

        self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)

@@ -1,11 +1,13 @@
import os
import shutil
import tempfile
from tempfile import TemporaryDirectory
from unittest import mock

from django.test import TestCase
from django.test import TestCase, override_settings

from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \
    get_parser_class_for_mime_type
    get_parser_class_for_mime_type, DocumentParser
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser

@@ -66,6 +68,38 @@ class TestParserDiscovery(TestCase):
        )


def fake_get_thumbnail(self, path, mimetype):
    return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")


class TestBaseParser(TestCase):

    def setUp(self) -> None:

        self.scratch = tempfile.mkdtemp()
        override_settings(
            SCRATCH_DIR=self.scratch
        ).enable()

    def tearDown(self) -> None:
        shutil.rmtree(self.scratch)

    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
    @override_settings(OPTIMIZE_THUMBNAILS=True)
    def test_get_optimised_thumbnail(self):
        parser = DocumentParser(None)

        parser.get_optimised_thumbnail("any", "not important")

    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
    @override_settings(OPTIMIZE_THUMBNAILS=False)
    def test_get_optimised_thumb_disabled(self):
        parser = DocumentParser(None)

        path = parser.get_optimised_thumbnail("any", "not important")
        self.assertEqual(path, fake_get_thumbnail(None, None, None))


class TestParserAvailability(TestCase):

    def test_file_extensions(self):

@@ -17,10 +17,12 @@ def setup_directories():
    dirs.index_dir = os.path.join(dirs.data_dir, "index")
    dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
    dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
    dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")

    os.makedirs(dirs.index_dir, exist_ok=True)
    os.makedirs(dirs.originals_dir, exist_ok=True)
    os.makedirs(dirs.thumbnail_dir, exist_ok=True)
    os.makedirs(dirs.archive_dir, exist_ok=True)

    override_settings(
        DATA_DIR=dirs.data_dir,
@@ -28,6 +30,7 @@ def setup_directories():
        MEDIA_ROOT=dirs.media_dir,
        ORIGINALS_DIR=dirs.originals_dir,
        THUMBNAIL_DIR=dirs.thumbnail_dir,
        ARCHIVE_DIR=dirs.archive_dir,
        CONSUMPTION_DIR=dirs.consumption_dir,
        INDEX_DIR=dirs.index_dir,
        MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")

@@ -1,3 +1,5 @@
import os

from django.db.models import Count, Max
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.views.decorators.cache import cache_control
@@ -126,17 +128,30 @@ class DocumentViewSet(RetrieveModelMixin,
        index.remove_document_from_index(self.get_object())
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

    def file_response(self, pk, disposition):
    @staticmethod
    def original_requested(request):
        return (
            'original' in request.query_params and
            request.query_params['original'] == 'true'
        )

    def file_response(self, pk, request, disposition):
        doc = Document.objects.get(id=pk)

        if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
            file_handle = doc.source_file
        if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501
            file_handle = doc.archive_file
            filename = doc.archive_file_name
            mime_type = 'application/pdf'
        else:
            file_handle = GnuPG.decrypted(doc.source_file)
            file_handle = doc.source_file
            filename = doc.file_name
            mime_type = doc.mime_type

        response = HttpResponse(file_handle, content_type=doc.mime_type)
        if doc.storage_type == Document.STORAGE_TYPE_GPG:
            file_handle = GnuPG.decrypted(file_handle)

        response = HttpResponse(file_handle, content_type=mime_type)
        response["Content-Disposition"] = '{}; filename="{}"'.format(
            disposition, doc.file_name)
            disposition, filename)
        return response

    @action(methods=['post'], detail=False)
@@ -157,6 +172,8 @@ class DocumentViewSet(RetrieveModelMixin,
                "paperless__checksum": doc.checksum,
                "paperless__mime_type": doc.mime_type,
                "paperless__filename": doc.filename,
                "paperless__has_archive_version":
                    os.path.isfile(doc.archive_path)
            })
        except Document.DoesNotExist:
            raise Http404()
@@ -164,7 +181,8 @@ class DocumentViewSet(RetrieveModelMixin,
    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        try:
            response = self.file_response(pk, "inline")
            response = self.file_response(
                pk, request, "inline")
            return response
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()
@@ -181,7 +199,8 @@ class DocumentViewSet(RetrieveModelMixin,
    @action(methods=['get'], detail=True)
    def download(self, request, pk=None):
        try:
            return self.file_response(pk, "attachment")
            return self.file_response(
                pk, request, "attachment")
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()


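Illustrative client-side calls against the new behaviour (Django test client; the URLs mirror the tests earlier in this changeset). Without the query parameter the archived PDF is served when one exists; with it, the original is forced:

    response = client.get(f"/api/documents/{doc.pk}/download/")
    response = client.get(f"/api/documents/{doc.pk}/download/?original=true")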
@@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs):
    binaries = (
        settings.CONVERT_BINARY,
        settings.OPTIPNG_BINARY,
        settings.UNPAPER_BINARY,
        "tesseract"
    )


@@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta

MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media"))
ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals")
ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive")
THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")

DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data"))
@@ -348,9 +349,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
# documents. It should be a 3-letter language code consistent with ISO 639.
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

# All OCRmyPDF --output-type options are available here.
# TODO: validate this setting.
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")

# OCR all documents?
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
# skip, redo, force
# TODO: validate this.
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")

OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")

OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")
@@ -359,11 +368,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))

GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")

OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")


# Pre-2.x versions of Paperless stored your documents locally with GPG

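PAPERLESS_OCR_USER_ARGS is expected to hold a JSON object; the parser merges it over its computed OCRmyPDF arguments. A sketch with made-up values:

    import json

    user_args = json.loads('{"rotate_pages": true, "deskew": true}')
    ocr_args = {**{"language": "eng", "output_type": "pdfa"}, **user_args}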
@@ -14,12 +14,21 @@ def get_tesseract_langs():

@register()
def check_default_language_available(app_configs, **kwargs):
    langs = get_tesseract_langs()
    installed_langs = get_tesseract_langs()

    if settings.OCR_LANGUAGE not in langs:
        return [Error(
            f"The default ocr language {settings.OCR_LANGUAGE} is "
            f"not installed. Paperless cannot OCR your documents "
            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
    else:
        return []
    if not settings.OCR_LANGUAGE:
        return [Warning(
            "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
            "This means that tesseract will fall back to English."
        )]

    specified_langs = settings.OCR_LANGUAGE.split("+")

    for lang in specified_langs:
        if lang not in installed_langs:
            return [Error(
                f"The selected ocr language {lang} is "
                f"not installed. Paperless cannot OCR your documents "
                f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]

    return []

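The reworked check accepts tesseract's multi-language syntax; a setting such as "eng+deu" is split on "+" and each component must be installed:

    specified_langs = "eng+deu".split("+")   # -> ["eng", "deu"]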
@@ -1,23 +1,15 @@
import itertools
import json
import os
import re
import subprocess
from multiprocessing.pool import ThreadPool

import langdetect
import ocrmypdf
import pdftotext
import pyocr
from PIL import Image
from django.conf import settings
from pyocr import PyocrException
from ocrmypdf import InputFileError

from documents.parsers import DocumentParser, ParseError, run_unpaper, \
    run_convert
from .languages import ISO639


class OCRError(Exception):
    pass
from documents.parsers import DocumentParser, ParseError, run_convert


class RasterisedDocumentParser(DocumentParser):
@@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser):
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    def __init__(self, path, logging_group):
        super().__init__(path, logging_group)
        self._text = None

    def get_thumbnail(self):
    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """
@@ -44,7 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
                alpha="remove",
                strip=True,
                trim=True,
                input_file="{}[0]".format(self.document_path),
                input_file="{}[0]".format(document_path),
                output_file=out_path,
                logging_group=self.logging_group)
        except ParseError:
@@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser):
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
                   self.document_path]
                   document_path]
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
@@ -74,169 +62,126 @@ class RasterisedDocumentParser(DocumentParser):

        return out_path

    def _is_ocred(self):

        # Extract text from PDF using pdftotext
        text = get_text_from_pdf(self.document_path)

        # We assume, that a PDF with at least 50 characters contains text
        # (so no OCR required)
        return len(text) > 50

    def get_text(self):

        if self._text is not None:
            return self._text

        if not settings.OCR_ALWAYS and self._is_ocred():
            self.log("debug", "Skipping OCR, using Text from PDF")
            self._text = get_text_from_pdf(self.document_path)
            return self._text

        images = self._get_greyscale()

        if not images:
            raise ParseError("Empty document, nothing to do.")
    def is_image(self, mime_type):
        return mime_type in [
            "image/png",
            "image/jpeg"
        ]

    def get_dpi(self, image):
        try:

            sample_page_index = int(len(images) / 2)
            self.log(
                "debug",
                f"Attempting language detection on page "
                f"{sample_page_index + 1} of {len(images)}...")

            sample_page_text = self._ocr([images[sample_page_index]],
                                         settings.OCR_LANGUAGE)[0]
            guessed_language = self._guess_language(sample_page_text)

            if not guessed_language or guessed_language not in ISO639:
                self.log("warning", "Language detection failed.")
                ocr_pages = self._complete_ocr_default_language(
                    images, sample_page_index, sample_page_text)

            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
                self.log(
                    "debug",
                    f"Detected language: {guessed_language} "
                    f"(default language)")
                ocr_pages = self._complete_ocr_default_language(
                    images, sample_page_index, sample_page_text)

            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501
                self.log(
                    "warning",
                    f"Detected language {guessed_language} is not available "
                    f"on this system.")
                ocr_pages = self._complete_ocr_default_language(
                    images, sample_page_index, sample_page_text)

            else:
                self.log("debug", f"Detected language: {guessed_language}")
                ocr_pages = self._ocr(images, ISO639[guessed_language])

            self.log("debug", "OCR completed.")
            self._text = strip_excess_whitespace(" ".join(ocr_pages))
            return self._text

        except OCRError as e:
            raise ParseError(e)

    def _get_greyscale(self):
        """
        Greyscale images are easier for Tesseract to OCR
        """

        # Convert PDF to multiple PNMs
        input_file = self.document_path

        if settings.OCR_PAGES == 1:
            input_file += "[0]"
        elif settings.OCR_PAGES > 1:
            input_file += f"[0-{settings.OCR_PAGES - 1}]"

        self.log(
            "debug",
            f"Converting document {input_file} into greyscale images")

        output_files = os.path.join(self.tempdir, "convert-%04d.pnm")

        run_convert(density=settings.CONVERT_DENSITY,
                    depth="8",
                    type="grayscale",
                    input_file=input_file,
                    output_file=output_files,
                    logging_group=self.logging_group)

        # Get a list of converted images
        pnms = []
        for f in os.listdir(self.tempdir):
            if f.endswith(".pnm"):
                pnms.append(os.path.join(self.tempdir, f))

        self.log("debug", f"Running unpaper on {len(pnms)} pages...")

        # Run unpaper in parallel on converted images
        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
            pnms = pool.map(run_unpaper, pnms)

        return sorted(filter(lambda __: os.path.isfile(__), pnms))

    def _guess_language(self, text):
        try:
            guess = langdetect.detect(text)
            return guess
            with Image.open(image) as im:
                x, y = im.info['dpi']
                return x
        except Exception as e:
            self.log('warning', f"Language detection failed with: {e}")
            self.log(
                'warning',
                f"Error while getting DPI from image {image}: {e}")
            return None

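A standalone sketch of the DPI probe that get_dpi() performs: Pillow exposes the resolution, when the file records one, via Image.info (the path is illustrative):

    from PIL import Image

    with Image.open("scan.png") as im:
        dpi = im.info.get("dpi")   # e.g. (300, 300); absent for files without DPI metadata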
||||
def _ocr(self, imgs, lang):
|
||||
self.log(
|
||||
"debug",
|
||||
f"Performing OCR on {len(imgs)} page(s) with language {lang}")
|
||||
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||
return r
|
||||
def parse(self, document_path, mime_type):
|
||||
if settings.OCR_MODE == "skip_noarchive":
|
||||
text = get_text_from_pdf(document_path)
|
||||
if text and len(text) > 50:
|
||||
self.text = text
|
||||
return
|
||||
|
||||
def _complete_ocr_default_language(self,
|
||||
images,
|
||||
sample_page_index,
|
||||
sample_page):
|
||||
images_copy = list(images)
|
||||
del images_copy[sample_page_index]
|
||||
if images_copy:
|
||||
self.log('debug', "Continuing ocr with default language.")
|
||||
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
|
||||
ocr_pages.insert(sample_page_index, sample_page)
|
||||
return ocr_pages
|
||||
else:
|
||||
return [sample_page]
|
||||
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
||||
|
||||
ocr_args = {
|
||||
'input_file': document_path,
|
||||
'output_file': archive_path,
|
||||
'use_threads': True,
|
||||
'jobs': settings.THREADS_PER_WORKER,
|
||||
'language': settings.OCR_LANGUAGE,
|
||||
'output_type': settings.OCR_OUTPUT_TYPE,
|
||||
'progress_bar': False,
|
||||
'clean': True
|
||||
}
|
||||
|
||||
if settings.OCR_PAGES > 0:
|
||||
ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
|
||||
|
||||
if settings.OCR_MODE in ['skip', 'skip_noarchive']:
|
||||
ocr_args['skip_text'] = True
|
||||
elif settings.OCR_MODE == 'redo':
|
||||
ocr_args['redo_ocr'] = True
|
||||
elif settings.OCR_MODE == 'force':
|
||||
ocr_args['force_ocr'] = True

        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
            if dpi:
                self.log(
                    "debug",
                    f"Detected DPI for image {document_path}: {dpi}"
                )
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")

        if settings.OCR_USER_ARGS:
            try:
                user_args = json.loads(settings.OCR_USER_ARGS)
                ocr_args = {**ocr_args, **user_args}
            except Exception as e:
                self.log(
                    "warning",
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used: {e}")

        # This forces tesseract to use one core per page.
        os.environ['OMP_THREAD_LIMIT'] = "1"

        try:
            self.log("debug",
                     f"Calling OCRmyPDF with {str(ocr_args)}")
            ocrmypdf.ocr(**ocr_args)
            # success! announce results
            self.archive_path = archive_path
            self.text = get_text_from_pdf(archive_path)

        except InputFileError as e:
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
            self.text = get_text_from_pdf(document_path)
            # Also, no archived file.
            if not self.text:
                # However, if we don't have anything, fail:
                raise ParseError(e)

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(e)

        if not self.text:
            # This may happen for files that don't have any text.
            self.log(
                'warning',
                f"Document {document_path} does not have any text. "
                f"This is probably an error or you tried to add an image "
                f"without text.")
            self.text = ""


def strip_excess_whitespace(text):
    if not text:
        return None

    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", '', no_leading_whitespace)

    # TODO: this needs a rework
    return no_trailing_whitespace.strip()
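
# Editor's note: a quick illustration of the intended behaviour, reusing a
# case from the test suite further down:
#
#   strip_excess_whitespace("simple newline\n testing string")
#   # -> "simple newline\ntesting string"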


def image_to_string(args):
    img, lang = args
    ocr = pyocr.get_available_tools()[0]
    with Image.open(img) as f:
        if ocr.can_detect_orientation():
            try:
                orientation = ocr.detect_orientation(f, lang=lang)
                f = f.rotate(orientation["angle"], expand=1)
            except Exception:
                # Rotation not possible, ignore
                pass
        try:
            return ocr.image_to_string(f, lang=lang)
        except PyocrException as e:
            raise OCRError(e)


@@ -245,6 +190,9 @@ def get_text_from_pdf(pdf_file):
         try:
             pdf = pdftotext.PDF(f)
         except pdftotext.Error:
-            return ""
+            # might not be a PDF file
+            return None
 
-    return "\n".join(pdf)
+    text = "\n".join(pdf)
+
+    return strip_excess_whitespace(text)
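
# Editor's note: after this hunk, a parse failure yields None instead of ""
# and extracted text is whitespace-normalised, so callers rely on a simple
# truthiness check -- a sketch mirroring the parser code above:
#
#   text = get_text_from_pdf(document_path)
#   if text and len(text) > 50:
#       ...  # document already has a usable text layer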

BIN  src/paperless_tesseract/tests/samples/multi-page-digital.pdf   (new file)
BIN  src/paperless_tesseract/tests/samples/multi-page-images.pdf    (new file)
BIN  src/paperless_tesseract/tests/samples/no-text-alpha.png        (new file, 32 KiB)
BIN  src/paperless_tesseract/tests/samples/simple-alpha.png         (new file, 8.2 KiB)
BIN  src/paperless_tesseract/tests/samples/simple-no-dpi.png        (new file, 6.8 KiB)
BIN  (modified sample image, not named in this view: 7.7 KiB before, 7.2 KiB after)
BIN  src/paperless_tesseract/tests/samples/with-form.pdf            (new file)

@@ -1,193 +0,0 @@
import datetime
import os
import shutil
from unittest import mock
from uuid import uuid4

from dateutil import tz
from django.conf import settings
from django.test import TestCase, override_settings

from ..parsers import RasterisedDocumentParser


class TestDate(TestCase):

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])

    def setUp(self):
        os.makedirs(self.SCRATCH, exist_ok=True)

    def tearDown(self):
        shutil.rmtree(self.SCRATCH)

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_1(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = "lorem ipsum 130218 lorem ipsum"
        self.assertEqual(document.get_date(), None)

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_2(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = "lorem ipsum 2018 lorem ipsum"
        self.assertEqual(document.get_date(), None)

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_3(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = "lorem ipsum 20180213 lorem ipsum"
        self.assertEqual(document.get_date(), None)

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_4(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = "lorem ipsum 13.02.2018 lorem ipsum"
        date = document.get_date()
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_5(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
            "ipsum"
        )
        date = document.get_date()
        self.assertEqual(
            date,
            datetime.datetime(
                2018, 2, 13, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_6(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum"
        )
        self.assertEqual(document.get_date(), None)

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_7(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "März 2019\n"
            "lorem ipsum"
        )
        date = document.get_date()
        self.assertEqual(
            date,
            datetime.datetime(
                2019, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_8(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "Wohnort\n"
            "3100\n"
            "IBAN\n"
            "AT87 4534\n"
            "1234\n"
            "1234 5678\n"
            "BIC\n"
            "lorem ipsum\n"
            "März 2020"
        )
        self.assertEqual(
            document.get_date(),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_date_format_9(self):
        input_file = os.path.join(self.SAMPLE_FILES, "")
        document = RasterisedDocumentParser(input_file, None)
        document._text = (
            "lorem ipsum\n"
            "27. Nullmonth 2020\n"
            "März 2020\n"
            "lorem ipsum"
        )
        self.assertEqual(
            document.get_date(),
            datetime.datetime(
                2020, 3, 1, 0, 0,
                tzinfo=tz.gettz(settings.TIME_ZONE)
            )
        )

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-0590 00:00:00"
    )
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_crazy_date_past(self, *args):
        document = RasterisedDocumentParser("/dev/null", None)
        document.get_text()
        self.assertIsNone(document.get_date())

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="01-07-2350 00:00:00"
    )
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_crazy_date_future(self, *args):
        document = RasterisedDocumentParser("/dev/null", None)
        document.get_text()
        self.assertIsNone(document.get_date())

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="20 408000l 2475"
    )
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_crazy_date_with_spaces(self, *args):
        document = RasterisedDocumentParser("/dev/null", None)
        document.get_text()
        self.assertIsNone(document.get_date())

    @mock.patch(
        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
        return_value="No date in here"
    )
    @override_settings(FILENAME_DATE_ORDER="YMD")
    @override_settings(SCRATCH_DIR=SCRATCH)
    def test_filename_date_parse_invalid(self, *args):
        document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None)
        document.get_text()
        self.assertIsNone(document.get_date())

@@ -1,76 +0,0 @@
import os
from unittest import mock, skipIf

import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError

from ..parsers import image_to_string, strip_excess_whitespace


class FakeTesseract(object):

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        return "This is test text"


class FakePyOcr(object):

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]


class TestOCR(TestCase):

    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract
        results in a segmentation fault rooted somewhere deep inside pyocr
        where I don't care to dig. Regardless, if you run the consumer
        normally, text-free pages are now handled correctly so long as we
        work around this weird exception.
        """
        image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"])

@@ -1,46 +1,17 @@
import os
import shutil
import tempfile
import uuid
from typing import ContextManager
from unittest import mock

from django.test import TestCase, override_settings
from pyocr.error import TesseractError

from documents.parsers import ParseError, run_convert
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError
from documents.tests.utils import DirectoriesMixin
from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace

image_to_string_calls = []


class FakeTesseract(object):

    @staticmethod
    def can_detect_orientation():
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        raise TesseractError("arbitrary status", "message")

    @staticmethod
    def get_available_languages():
        return ['eng', 'deu']

    @staticmethod
    def image_to_string(file_handle, lang):
        image_to_string_calls.append((file_handle.name, lang))
        return file_handle.read()


class FakePyOcr(object):

    @staticmethod
    def get_available_tools():
        return [FakeTesseract]


def fake_convert(input_file, output_file, **kwargs):
    with open(input_file) as f:
        lines = f.readlines()
@@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs):
            f2.write(line.strip())


def fake_unpaper(pnm):
    output = pnm + ".unpaper.pnm"
    shutil.copy(pnm, output)
    return output


class FakeImageFile(ContextManager):
    def __init__(self, fname):
        self.fname = fname
@@ -67,142 +32,50 @@ class FakeImageFile(ContextManager):
        return os.path.basename(self.fname)


fake_image = FakeImageFile


@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
@mock.patch("paperless_tesseract.parsers.Image.open", open)
class TestRasterisedDocumentParser(TestCase):
class TestParser(DirectoriesMixin, TestCase):

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
        indices = [content.index(s) for s in strings]
        self.assertListEqual(indices, sorted(indices))

        global image_to_string_calls

    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

        image_to_string_calls = []

        override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable()

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def get_input_file(self, pages):
        _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
        with open(fname, "w") as f:
            f.writelines([f"line {p}\n" for p in range(pages)])
        return fname

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_simple_language_match(self):
        parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_2_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
    def test_parse_text_3_pages(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
    def test_parse_text_lang_detect_failed(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
    def test_parse_text_lang_not_installed(self):
        parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2 line 3")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_text_lang_mismatch(self):
        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
        text = parser.get_text()
        self.assertEqual(text, "line 0 line 1 line 2")

        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"])

    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
    def test_parse_empty_doc(self):
        parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
        try:
            parser.get_text()
        except ParseError as e:
            self.assertEqual("Empty document, nothing to do.", str(e))
        else:
            self.fail("Should raise exception")


class TestAuxilliaryFunctions(TestCase):

    def setUp(self):
        self.scratch = tempfile.mkdtemp()

        override_settings(SCRATCH_DIR=self.scratch).enable()

    def tearDown(self):
        shutil.rmtree(self.scratch)

    def test_strip_excess_whitespace(self):
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def test_get_text_from_pdf(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'))

        self.assertEqual(text.strip(), "This is a test document.")

    def test_get_text_from_pdf_error(self):
        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))

        self.assertEqual(text.strip(), "")

    def test_image_to_string(self):
        text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))

        self.assertEqual(text, "This is a test document.")

    def test_image_to_string_language_unavailable(self):
        try:
            image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
        except OCRError as e:
            self.assertTrue("Failed loading language" in str(e))
        else:
            self.fail("Should raise exception")

    @override_settings(OCR_ALWAYS=False)
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
    def test_is_ocred(self, m2, m):
        parser = RasterisedDocumentParser("", uuid.uuid4())
        m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \
                         "lots of text lots of text lots of text lots of text lots of text lots of text " \
                         "lots of text lots of text lots of text lots of text lots of text lots of text "
        parser.get_text()
        self.assertEqual(m.call_count, 2)
        self.assertEqual(m2.call_count, 0)
        self.assertContainsStrings(text.strip(), ["This is a test document."])

    def test_thumbnail(self):
        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        parser = RasterisedDocumentParser(uuid.uuid4())
        parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
        # don't really know how to test it, just call it and assert that it does not raise anything.

    @mock.patch("paperless_tesseract.parsers.run_convert")
@@ -216,6 +89,161 @@ class TestAuxilliaryFunctions(TestCase):

        m.side_effect = call_convert

        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
        parser.get_thumbnail()
        parser = RasterisedDocumentParser(uuid.uuid4())
        parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
        # don't really know how to test it, just call it and assert that it does not raise anything.

    def test_get_dpi(self):
        parser = RasterisedDocumentParser(None)

        dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
        self.assertEqual(dpi, None)

        dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png"))
        self.assertEqual(dpi, 72)

    def test_simple_digital(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text(), ["This is a test document."])

    def test_with_form(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])

    @override_settings(OCR_MODE="redo")
    def test_with_form_error(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])

    @override_settings(OCR_MODE="redo")
    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None)
    def test_with_form_error_notext(self):
        parser = RasterisedDocumentParser(None)

        def f():
            parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertRaises(ParseError, f)

    @override_settings(OCR_MODE="force")
    def test_with_form_force(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")

        self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."])

    def test_image_simple(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text(), ["This is a test document."])

    def test_image_simple_alpha_fail(self):
        parser = RasterisedDocumentParser(None)

        def f():
            parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png")

        self.assertRaises(ParseError, f)

    def test_image_no_dpi_fail(self):
        parser = RasterisedDocumentParser(None)

        def f():
            parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")

        self.assertRaises(ParseError, f)

    @override_settings(OCR_IMAGE_DPI=72)
    def test_image_no_dpi_default(self):
        parser = RasterisedDocumentParser(None)

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")

        self.assertTrue(os.path.isfile(parser.archive_path))

        self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."])

    def test_multi_page(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
    def test_multi_page_pages_skip(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_pages_redo(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="force")
    def test_multi_page_pages_force(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_MODE="skip")
    def test_multi_page_analog_pages_skip(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_analog_pages_redo(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
        self.assertFalse("page 3" in parser.get_text().lower())

    @override_settings(OCR_PAGES=1, OCR_MODE="force")
    def test_multi_page_analog_pages_force(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
        self.assertFalse("page 2" in parser.get_text().lower())
        self.assertFalse("page 3" in parser.get_text().lower())

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_withtext(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf")
        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

    @override_settings(OCR_MODE="skip_noarchive")
    def test_skip_noarchive_notext(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])

@@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
     This parser directly parses a text document (.txt, .md, or .csv)
     """
 
-    def __init__(self, path, logging_group):
-        super().__init__(path, logging_group)
-        self._text = None
-
-    def get_thumbnail(self):
+    def get_thumbnail(self, document_path, mime_type):
         """
         The thumbnail of a text file is just a 500px wide image of the text
         rendered onto a letter-sized page.
@@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
         )
 
         def read_text():
-            with open(self.document_path, 'r') as src:
+            with open(document_path, 'r') as src:
                 lines = [line.strip() for line in src.readlines()]
                 text = "\n".join([line for line in lines[:n_lines]])
                 return text.replace('"', "'")
@@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):
 
         return out_path
 
-    def get_text(self):
-
-        if self._text is not None:
-            return self._text
-
-        with open(self.document_path, 'r') as f:
-            self._text = f.read()
-
-        return self._text
+    def parse(self, document_path, mime_type):
+        with open(document_path, 'r') as f:
+            self.text = f.read()
 
 
 def run_command(*args):