mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge branch 'dev' into feature-bulk-edit
This commit is contained in:
commit
8699b6931c
13
.gitignore
vendored
13
.gitignore
vendored
@ -76,16 +76,11 @@ scripts/nuke
|
||||
/static/
|
||||
|
||||
# Stored PDFs
|
||||
/media/documents/originals/*
|
||||
/media/documents/thumbnails/*
|
||||
|
||||
/data/classification_model.pickle
|
||||
/data/db.sqlite3
|
||||
/data/index
|
||||
|
||||
/media/
|
||||
/data/
|
||||
/paperless.conf
|
||||
/consume
|
||||
/export
|
||||
/consume/
|
||||
/export/
|
||||
/src-ui/.vscode
|
||||
|
||||
# this is where the compiled frontend is moved to.
|
||||
|
@ -1,5 +1,8 @@
|
||||
language: python
|
||||
|
||||
dist: focal
|
||||
os: linux
|
||||
|
||||
jobs:
|
||||
include:
|
||||
- name: "Paperless on Python 3.6"
|
||||
@ -33,7 +36,7 @@ jobs:
|
||||
|
||||
before_install:
|
||||
- sudo apt-get update -qq
|
||||
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript
|
||||
- sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript optipng
|
||||
|
||||
install:
|
||||
- pip install --upgrade pipenv
|
||||
|
5
Pipfile
5
Pipfile
@ -26,7 +26,6 @@ langdetect = "*"
|
||||
pdftotext = "*"
|
||||
pathvalidate = "*"
|
||||
pillow = "*"
|
||||
pyocr = "~=0.7.2"
|
||||
python-gnupg = "*"
|
||||
python-dotenv = "*"
|
||||
python-dateutil = "*"
|
||||
@ -38,7 +37,9 @@ scikit-learn="~=0.23.2"
|
||||
whitenoise = "~=5.2.0"
|
||||
watchdog = "*"
|
||||
whoosh="~=2.7.4"
|
||||
inotify-simple = "*"
|
||||
inotifyrecursive = "~=0.3.4"
|
||||
ocrmypdf = "*"
|
||||
tqdm = "*"
|
||||
|
||||
[dev-packages]
|
||||
coveralls = "*"
|
||||
|
352
Pipfile.lock
generated
352
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "d6432a18280c092c108e998f00bcd377c0c55ef18f26cb0b8eb64f9618b9f383"
|
||||
"sha256": "b10db53eb22d917723aa6107ff0970dc4e2aa886ee03d3ae08a994a856d57986"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@ -39,10 +39,100 @@
|
||||
},
|
||||
"blessed": {
|
||||
"hashes": [
|
||||
"sha256:7d4914079a6e8e14fbe080dcaf14dee596a088057cdc598561080e3266123b48",
|
||||
"sha256:81125aa5b84cb9dfc09ff451886f64b4b923b75c5eaf51fde9d1c48a135eb797"
|
||||
"sha256:0a74a8d3f0366db600d061273df77d44f0db07daade7bb7a4d49c8bc22ed9f74",
|
||||
"sha256:580429e7e0c6f6a42ea81b0ae5a4993b6205c6ccbb635d034b4277af8175753e"
|
||||
],
|
||||
"version": "==1.17.11"
|
||||
"version": "==1.17.12"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e",
|
||||
"sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d",
|
||||
"sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a",
|
||||
"sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec",
|
||||
"sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362",
|
||||
"sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668",
|
||||
"sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c",
|
||||
"sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b",
|
||||
"sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654",
|
||||
"sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06",
|
||||
"sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698",
|
||||
"sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2",
|
||||
"sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c",
|
||||
"sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7",
|
||||
"sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009",
|
||||
"sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03",
|
||||
"sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b",
|
||||
"sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909",
|
||||
"sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53",
|
||||
"sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35",
|
||||
"sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26",
|
||||
"sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b",
|
||||
"sha256:a5ed8c05548b54b998b9498753fb9cadbfd92ee88e884641377d8a8b291bcc01",
|
||||
"sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb",
|
||||
"sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293",
|
||||
"sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd",
|
||||
"sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d",
|
||||
"sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3",
|
||||
"sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5",
|
||||
"sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d",
|
||||
"sha256:d5ff0621c88ce83a28a10d2ce719b2ee85635e85c515f12bac99a95306da4b2e",
|
||||
"sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca",
|
||||
"sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d",
|
||||
"sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775",
|
||||
"sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375",
|
||||
"sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b",
|
||||
"sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b",
|
||||
"sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f"
|
||||
],
|
||||
"version": "==1.14.4"
|
||||
},
|
||||
"chardet": {
|
||||
"hashes": [
|
||||
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
|
||||
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
|
||||
],
|
||||
"markers": "python_version >= '3.1'",
|
||||
"version": "==3.0.4"
|
||||
},
|
||||
"coloredlogs": {
|
||||
"hashes": [
|
||||
"sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a",
|
||||
"sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505",
|
||||
"sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==14.0"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538",
|
||||
"sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f",
|
||||
"sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d",
|
||||
"sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77",
|
||||
"sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b",
|
||||
"sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33",
|
||||
"sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e",
|
||||
"sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb",
|
||||
"sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e",
|
||||
"sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b",
|
||||
"sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7",
|
||||
"sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297",
|
||||
"sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d",
|
||||
"sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7",
|
||||
"sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b",
|
||||
"sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7",
|
||||
"sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4",
|
||||
"sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8",
|
||||
"sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b",
|
||||
"sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851",
|
||||
"sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13",
|
||||
"sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b",
|
||||
"sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3",
|
||||
"sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==3.2.1"
|
||||
},
|
||||
"dateparser": {
|
||||
"hashes": [
|
||||
@ -54,11 +144,11 @@
|
||||
},
|
||||
"django": {
|
||||
"hashes": [
|
||||
"sha256:14a4b7cd77297fba516fc0d92444cc2e2e388aa9de32d7a68d4a83d58f5a4927",
|
||||
"sha256:14b87775ffedab2ef6299b73343d1b4b41e5d4e2aa58c6581f114dbec01e3f8f"
|
||||
"sha256:5c866205f15e7a7123f1eec6ab939d22d5bde1416635cab259684af66d8e48a2",
|
||||
"sha256:edb10b5c45e7e9c0fb1dc00b76ec7449aca258a39ffd613dbd078c51d19c9f03"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.1.3"
|
||||
"version": "==3.1.4"
|
||||
},
|
||||
"django-cors-headers": {
|
||||
"hashes": [
|
||||
@ -70,11 +160,11 @@
|
||||
},
|
||||
"django-extensions": {
|
||||
"hashes": [
|
||||
"sha256:6809c89ca952f0e08d4e0766bc0101dfaf508d7649aced1180c091d737046ea7",
|
||||
"sha256:dc663652ac9460fd06580a973576820430c6d428720e874ae46b041fa63e0efa"
|
||||
"sha256:7cd002495ff0a0e5eb6cdd6be759600905b4e4079232ea27618fc46bdd853651",
|
||||
"sha256:c7f88625a53f631745d4f2bef9ec4dcb999ed59476393bdbbe99db8596778846"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.0.9"
|
||||
"version": "==3.1.0"
|
||||
},
|
||||
"django-filter": {
|
||||
"hashes": [
|
||||
@ -123,22 +213,53 @@
|
||||
"index": "pypi",
|
||||
"version": "==20.0.4"
|
||||
},
|
||||
"humanfriendly": {
|
||||
"hashes": [
|
||||
"sha256:175ffa628aa76da2c17369a5da5856084562cc66dfe7f82ae93ca3ef175277a6",
|
||||
"sha256:3c9ab8d28e88e6cc998e41963357736dafd555ee5bb666b50e42f6ce28dd3e3d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==9.0"
|
||||
},
|
||||
"imap-tools": {
|
||||
"hashes": [
|
||||
"sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246",
|
||||
"sha256:a3ee1827dc4ff185b259b33d0238b091a87d489f63ee59959fcc81716456c602"
|
||||
"sha256:72bf46dc135b039a5d5b59f4e079242ac15eac02a30038e8cb2dec7b153cab65",
|
||||
"sha256:75dc1c72dd76d9e577df26a1e0ec3a809b5eebce77678851458dcd2eae127ac9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.32.0"
|
||||
"version": "==0.33.0"
|
||||
},
|
||||
"img2pdf": {
|
||||
"hashes": [
|
||||
"sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5",
|
||||
"sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9"
|
||||
],
|
||||
"version": "==0.4.0"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:6112e21359ef8f344e7178aa5b72dc6e62b38b0d008e6d3cb212c5b84df72013",
|
||||
"sha256:b0c2d3b226157ae4517d9625decf63591461c66b3a808c2666d538946519d170"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==3.1.1"
|
||||
},
|
||||
"inotify-simple": {
|
||||
"hashes": [
|
||||
"sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128",
|
||||
"sha256:854f9ac752cc1fcff6ca34e9d3d875c9a94c9b7d6eb377f63be2d481a566c6ee"
|
||||
],
|
||||
"index": "pypi",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.3.5"
|
||||
},
|
||||
"inotifyrecursive": {
|
||||
"hashes": [
|
||||
"sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f",
|
||||
"sha256:a2c450b317693e4538416f90eb1d7858506dafe6b8b885037bd2dd9ae2dafa1e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.3.5"
|
||||
},
|
||||
"joblib": {
|
||||
"hashes": [
|
||||
"sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
|
||||
@ -156,6 +277,51 @@
|
||||
"index": "pypi",
|
||||
"version": "==1.0.8"
|
||||
},
|
||||
"lxml": {
|
||||
"hashes": [
|
||||
"sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
|
||||
"sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
|
||||
"sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
|
||||
"sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
|
||||
"sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
|
||||
"sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
|
||||
"sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
|
||||
"sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
|
||||
"sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
|
||||
"sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
|
||||
"sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
|
||||
"sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
|
||||
"sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
|
||||
"sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
|
||||
"sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
|
||||
"sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
|
||||
"sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
|
||||
"sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
|
||||
"sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
|
||||
"sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
|
||||
"sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
|
||||
"sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
|
||||
"sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
|
||||
"sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
|
||||
"sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
|
||||
"sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
|
||||
"sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
|
||||
"sha256:91d6dace31b07ab47eeadd3f4384ded2f77b94b30446410cb2c3e660e047f7a7",
|
||||
"sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
|
||||
"sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
|
||||
"sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
|
||||
"sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
|
||||
"sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
|
||||
"sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
|
||||
"sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
|
||||
"sha256:e1dbb88a937126ab14d219a000728224702e0ec0fc7ceb7131c53606b7a76772",
|
||||
"sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
|
||||
"sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
|
||||
"sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==4.6.2"
|
||||
},
|
||||
"numpy": {
|
||||
"hashes": [
|
||||
"sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db",
|
||||
@ -197,6 +363,14 @@
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.19.4"
|
||||
},
|
||||
"ocrmypdf": {
|
||||
"hashes": [
|
||||
"sha256:91e7394172cedb3be801a229dbd3d308fb5ae80cbc3a77879fa7954beea407b1",
|
||||
"sha256:e550b8e884150accab7ea41f4a576b5844594cb5cbd6ed514fbf1206720343ad"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==11.3.4"
|
||||
},
|
||||
"pathtools": {
|
||||
"hashes": [
|
||||
"sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0",
|
||||
@ -212,6 +386,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==2.3.0"
|
||||
},
|
||||
"pdfminer.six": {
|
||||
"hashes": [
|
||||
"sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a",
|
||||
"sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509"
|
||||
],
|
||||
"markers": "python_version >= '3.4'",
|
||||
"version": "==20201018"
|
||||
},
|
||||
"pdftotext": {
|
||||
"hashes": [
|
||||
"sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6"
|
||||
@ -219,6 +401,33 @@
|
||||
"index": "pypi",
|
||||
"version": "==2.1.5"
|
||||
},
|
||||
"pikepdf": {
|
||||
"hashes": [
|
||||
"sha256:0829bd5dacd73bb4a37e7575bae523f49603479755563c92ddb55c206700cab1",
|
||||
"sha256:0d2b631077cd6af6e4d1b396208020705842610a6f13fab489d5f9c47916baa2",
|
||||
"sha256:21c98af08fae4ac9fbcad02b613b6768a4ca300fda4cba867f4a4b6f73c2d04b",
|
||||
"sha256:2240372fed30124ddc35b0c15a613f2b687a426ea2f150091e0a0c58cca7a495",
|
||||
"sha256:2a97f5f1403e058d217d7f6861cf51fca200c5687bce0d052f5f2fa89b5bfa22",
|
||||
"sha256:3faaefca0ae80d19891acec8b0dd5e6235f59f2206d82375eb80d090285e9557",
|
||||
"sha256:48ef45b64882901c0d69af3b85d16a19bd0f3e95b43e614fefb53521d8caf36c",
|
||||
"sha256:5212fe41f2323fc7356ba67caa39737fe13080562cff37bcbb74a8094076c8d0",
|
||||
"sha256:56859c32170663c57bd0658189ce44e180533eebe813853446cd6413810be9eb",
|
||||
"sha256:5f8fd1cb3478c5534222018aca24fbbd2bc74460c899bda988ec76722c13caa9",
|
||||
"sha256:74300a32c41b3d578772f6933f23a88b19f74484185e71e5225ce2f7ea5aea78",
|
||||
"sha256:8cbc946bdd217148f4a9c029fcea62f4ae0f67d5346de4c865f4718cd0ddc37f",
|
||||
"sha256:9ceefd30076f732530cf84a1be2ecb2fa9931af932706ded760a6d37c73b96ad",
|
||||
"sha256:ad69c170fda41b07a4c6b668a3128e7a759f50d9aebcfcde0ccff1358abe0423",
|
||||
"sha256:b715fe182189fb6870fab5b0383bb2fb278c88c46eade346b0f4c1ed8818c09d",
|
||||
"sha256:bb01ecf95083ffcb9ad542dc5342ccc1059e46f1395fd966629d36d9cc766b4a",
|
||||
"sha256:bd6328547219cf48cefb4e0a1bc54442910594de1c5a5feae847d9ff3c629031",
|
||||
"sha256:edb128379bb1dea76b5bdbdacf5657a6e4754bacc2049640762725590d8ed905",
|
||||
"sha256:f8e687900557fcd4c51b4e72b9e337fdae9e2c81049d1d80b624bb2e88b5769d",
|
||||
"sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
|
||||
"sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
|
||||
],
|
||||
"markers": "python_version < '3.9'",
|
||||
"version": "==2.2.0"
|
||||
},
|
||||
"pillow": {
|
||||
"hashes": [
|
||||
"sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a",
|
||||
@ -254,6 +463,14 @@
|
||||
"index": "pypi",
|
||||
"version": "==8.0.1"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
|
||||
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.13.1"
|
||||
},
|
||||
"psycopg2-binary": {
|
||||
"hashes": [
|
||||
"sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c",
|
||||
@ -297,13 +514,13 @@
|
||||
"index": "pypi",
|
||||
"version": "==2.8.6"
|
||||
},
|
||||
"pyocr": {
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179",
|
||||
"sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301"
|
||||
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
|
||||
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.7.2"
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.20"
|
||||
},
|
||||
"python-dateutil": {
|
||||
"hashes": [
|
||||
@ -411,6 +628,53 @@
|
||||
],
|
||||
"version": "==2020.11.13"
|
||||
},
|
||||
"reportlab": {
|
||||
"hashes": [
|
||||
"sha256:0008b5baa39d7e3a8132c4b47ecae88d6858ad386518e754e5e7b8025ee4722b",
|
||||
"sha256:0ad5a540c336941272fe161ef3a9830da3d4b3a65a195531cebd3cad5db58b2a",
|
||||
"sha256:0c965a5691686d746f558ee1c52aa9c63a01a0e13cba61ffc661573948e32f61",
|
||||
"sha256:0fd568fa5615ae99f76289c52ff230207852ee942d4934f6c893c93d2a79544e",
|
||||
"sha256:1117d905a3404c696869c7aabec9454b43ed6acbbc73f9256c6fcea23e7ae93e",
|
||||
"sha256:1ea7c388e91ad9d823655ad6a13751ff67e8a0e7cf4065cf051b4c931cdd9450",
|
||||
"sha256:26c0ee8f62652cc7fcdc47a1cb3b34775a4d625738025c1a7edb8718bda5a315",
|
||||
"sha256:368c5b3fc3d5a541cb9dcacefa563fdb445365f517e3cbf64b4326631d1cf13c",
|
||||
"sha256:451d42fdcdd7d84587d6d9c8f5d9a7d0e997305efb606705063ca1fe8bcca551",
|
||||
"sha256:47394acba4da8e56ef8e55d8eb483b868521696ba49ab0f0fcf8a1a4a5ac6e49",
|
||||
"sha256:51b16e297f7b937fc530dd151e4b38f1d305b01c9aa10657bc32a5d2901b8ad7",
|
||||
"sha256:51c0cdcf606ded0a7b4b50050400f25125ea797fbfc3c817135993b38f8b764e",
|
||||
"sha256:55c672c579618843e0fd00140fb71f1ffebc4f1c542ac385c4f4999f2f5398d9",
|
||||
"sha256:5c34a96ecfbf595caf16178a06abcd26a5f8720e01fe1285d4c97333382cfaeb",
|
||||
"sha256:61aa89a00754b18c4f2956b8bff831f1fd3affef6476dc63462d92211941605e",
|
||||
"sha256:62234d29c97279917903e4587faf240a5dea4617be250db55386ff268eb5a7c5",
|
||||
"sha256:670f2a8dcc23bf798c39b95c64bf76ee387549b962f76783670821978a226663",
|
||||
"sha256:69387f171f6c7b55109caa6d061b17a18f2f9e724a0212c07cd692aeb369dd19",
|
||||
"sha256:6c5c8871b659f7c2975382d7b61f3c182701fa9eb62cf649c3c73ba8fc5e2595",
|
||||
"sha256:80139ceb3a568f5be908094f1701fd05391b71425e8b69aaed0d30db647ca2aa",
|
||||
"sha256:80661a76d0019b5e2c315ccd3bc7093d754067d6142b36a3a0ec4f416073d23b",
|
||||
"sha256:85a2236f324ae336da7f4b183fa99bed261bcc00ac1255ee91a504e68b086d00",
|
||||
"sha256:89a3acd98bd4478d6bbc5cb32e0665ea546c98bff8b58d5e1014659daa6ef75a",
|
||||
"sha256:8a39119fcab146bde41fd1c6d148f9ee1e2cca10c6f9c2b7eb4dd710a3a2c6ac",
|
||||
"sha256:9c31c2526401da6cc92018f68483f2aac0a731cb98435445ea4b72d46b438c84",
|
||||
"sha256:9e8ae1c3b8a1697147c5c97f00d66ab1c54d88c4615b0cdd9b1a667d7baf3eb7",
|
||||
"sha256:a479c38ab2b997ce05d3bef906783ac20cf4cb224a154e80c9018c5e4d943a35",
|
||||
"sha256:a79aab8d069543d5085d58260f18705a08acd92a4501a41261913fddc2137d46",
|
||||
"sha256:b0a8314383de853599ca531dfe55eaa49bb8d6b0bb663b2f8479b7a0f3385ea2",
|
||||
"sha256:b3d9926e64bd8008007b2d9819d7b30179b069ce95431d5060f71afc36885389",
|
||||
"sha256:c2a9a77ce4f25ffb52d705be82a9f41b47f6b0da23870ebc3587709e7242da30",
|
||||
"sha256:c578dd0799f70fb577474cd383f035c6e1057e4fe837278113f9cfa6eee4b076",
|
||||
"sha256:c5abd9d0023ad20030524ab0d5fa39d77aed025519b1fa426304ab2dd0328b89",
|
||||
"sha256:ced96125525ba21311e9512adf391170b9e149f89e27e45b06ff07b70f97a0b2",
|
||||
"sha256:d692fb88d6ef5e75242b00009b54953a0425eaa8bd3a36db9db8b396785e1f57",
|
||||
"sha256:d70c2104286459658e61388af9eee838b612986bd8a36e1d21ba36152983ac15",
|
||||
"sha256:de47c65c10ac6f0d2addb28f1b1657b1c707aca014d09d01b3b728cf19e8f791",
|
||||
"sha256:e6e7592527791841db0820a72c6afae52655a05b0b6d4df184fd2bafe82ee1ee",
|
||||
"sha256:e8a7e95ee6ea5566291b59ede5b9fadce809dca43ebfbfe11e3ff3d6492c6f0e",
|
||||
"sha256:f041759138b3a95508c4281b3db3bf9bb28636d84c554272a58a5ca7c9f9bbf4",
|
||||
"sha256:f39c7fc1fa2e4a1d9747a3effd70731a9d0e9eb5738247fa089c059eff19d43e",
|
||||
"sha256:f65ac89ee0ba569f5279360eae08783f7f2e95c9810a9846c957fbd5950f4896"
|
||||
],
|
||||
"version": "==3.5.56"
|
||||
},
|
||||
"scikit-learn": {
|
||||
"hashes": [
|
||||
"sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e",
|
||||
@ -474,6 +738,13 @@
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.15.0"
|
||||
},
|
||||
"sortedcontainers": {
|
||||
"hashes": [
|
||||
"sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f",
|
||||
"sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1"
|
||||
],
|
||||
"version": "==2.3.0"
|
||||
},
|
||||
"sqlparse": {
|
||||
"hashes": [
|
||||
"sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
|
||||
@ -490,6 +761,14 @@
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==2.1.0"
|
||||
},
|
||||
"tqdm": {
|
||||
"hashes": [
|
||||
"sha256:38b658a3e4ecf9b4f6f8ff75ca16221ae3378b2e175d846b6b33ea3a20852cf5",
|
||||
"sha256:d4f413aecb61c9779888c64ddf0c62910ad56dcbe857d8922bb505d4dbff0df1"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.54.1"
|
||||
},
|
||||
"tzlocal": {
|
||||
"hashes": [
|
||||
"sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
|
||||
@ -499,11 +778,11 @@
|
||||
},
|
||||
"watchdog": {
|
||||
"hashes": [
|
||||
"sha256:034c85530b647486e8c8477410fe79476511282658f2ce496f97106d9e5acfb8",
|
||||
"sha256:4214e1379d128b0588021880ccaf40317ee156d4603ac388b9adcf29165e0c04"
|
||||
"sha256:3caefdcc8f06a57fdc5ef2d22aa7c0bfda4f55e71a0bee74cbf3176d97536ef3",
|
||||
"sha256:e38bffc89b15bafe2a131f0e1c74924cf07dcec020c2e0a26cccd208831fcd43"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.10.3"
|
||||
"version": "==0.10.4"
|
||||
},
|
||||
"wcwidth": {
|
||||
"hashes": [
|
||||
@ -528,6 +807,14 @@
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.7.4"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
|
||||
"sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.4.0"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
@ -581,6 +868,7 @@
|
||||
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
|
||||
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
|
||||
],
|
||||
"markers": "python_version >= '3.1'",
|
||||
"version": "==3.0.4"
|
||||
},
|
||||
"coverage": {
|
||||
@ -673,11 +961,11 @@
|
||||
},
|
||||
"faker": {
|
||||
"hashes": [
|
||||
"sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997",
|
||||
"sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e"
|
||||
"sha256:7bca5b074299ac6532be2f72979e6793f1a2403ca8105cb4cf0b385a964469c4",
|
||||
"sha256:fb21a76064847561033d8cab1cfd11af436ddf2c6fe72eb51b3cda51dff86bdc"
|
||||
],
|
||||
"markers": "python_version >= '3.5'",
|
||||
"version": "==4.17.1"
|
||||
"version": "==5.0.0"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
@ -705,11 +993,11 @@
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132",
|
||||
"sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c"
|
||||
"sha256:6112e21359ef8f344e7178aa5b72dc6e62b38b0d008e6d3cb212c5b84df72013",
|
||||
"sha256:b0c2d3b226157ae4517d9625decf63591461c66b3a808c2666d538946519d170"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==2.1.0"
|
||||
"version": "==3.1.1"
|
||||
},
|
||||
"importlib-resources": {
|
||||
"hashes": [
|
||||
@ -780,11 +1068,11 @@
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:4357f74f47b9c12db93624a82154e9b120fa8293699949152b22065d556079f8",
|
||||
"sha256:998416ba6962ae7fbd6596850b80e17859a5753ba17c32284f67bfff33784181"
|
||||
"sha256:05af3bb85d320377db281cf254ab050e1a7ebcbf5410685a9a407e18a1f81236",
|
||||
"sha256:eb41423378682dadb7166144a4926e443093863024de508ca5c9737d6bc08376"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==20.4"
|
||||
"version": "==20.7"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
@ -1036,7 +1324,7 @@
|
||||
"sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
|
||||
"sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.4.0"
|
||||
}
|
||||
}
|
||||
|
37
README.md
37
README.md
@ -25,36 +25,43 @@ Here's what you get:
|
||||
|
||||

|
||||
|
||||
# Why Paperless-ng?
|
||||
# Features
|
||||
|
||||
I wanted to make big changes to the project that will impact the way it is used by its users greatly. Among the users who currently use paperless in production there are probably many that don't want these changes right away. I also wanted to have more control over what goes into the code and what does not. Therefore, paperless-ng was created. NG stands for both Angular (the framework used for the Frontend) and next-gen. Publishing this project under a different name also avoids confusion between paperless and paperless-ng.
|
||||
|
||||
The gist of the changes is the following:
|
||||
|
||||
* New front end. This will eventually be mobile friendly as well.
|
||||
* New full text search.
|
||||
* New email processing.
|
||||
* Performs OCR on your documents, adds selectable text to image only documents and adds tags, correspondents and document types to your documents.
|
||||
* Single page application front end. Should be pretty snappy. Will be mobile friendly in the future.
|
||||
* Includes a dashboard that shows basic statistics and has document upload.
|
||||
* Filtering by tags, correspondents, types, and more.
|
||||
* Customizable views can be saved and displayed on the dashboard.
|
||||
* Full text search with auto completion, scored results and query highlighting allows you to quickly find what you need.
|
||||
* Email processing: Paperless adds documents from your email accounts.
|
||||
* Configure multiple accounts and filters for each account.
|
||||
* When adding documents from mails, paperless can move these mails to a new folder, mark them as read, flag them or delete them.
|
||||
* Machine learning powered document matching.
|
||||
* A task processor that processes documents in parallel and also tells you when something goes wrong.
|
||||
* Code cleanup in many, MANY areas. Some of the code was just overly complicated.
|
||||
* Paperless learns from your documents and will be able to automatically assign tags, correspondents and types to documents once you've stored a few documents in paperless.
|
||||
* A task processor that processes documents in parallel and also tells you when something goes wrong. On modern multi core systems, consumption is blazing fast.
|
||||
* Code cleanup in many, MANY areas. Some of the code from OG paperless was just overly complicated.
|
||||
* More tests, more stability.
|
||||
|
||||
If you want to see some screenshots of paperless-ng in action, [some are available in the documentation](https://paperless-ng.readthedocs.io/en/latest/screenshots.html).
|
||||
|
||||
For a complete list of changes, check out the [changelog](https://paperless-ng.readthedocs.io/en/latest/changelog.html)
|
||||
For a complete list of changes from paperless, check out the [changelog](https://paperless-ng.readthedocs.io/en/latest/changelog.html)
|
||||
|
||||
# Roadmap for 1.0
|
||||
|
||||
- Make the front end nice (except mobile).
|
||||
- Test coverage at 90%.
|
||||
- Store archived documents with an embedded OCR text layer, while keeping originals available. Making good progress in the `feature-ocrmypdf` branch.
|
||||
- Fix whatever bugs I and you find
|
||||
- Fix whatever bugs I and you find.
|
||||
|
||||
## Roadmap for versions beyond 1.0
|
||||
|
||||
These are things that I want to add to paperless eventually. They are sorted by priority.
|
||||
|
||||
- **Bulk editing**. Add/remove metadata from multiple documents at once.
|
||||
- **More search.** The search backend is incredibly versatile and customizable. Searching is the most important feature of this project and thus, I want to implement things like:
|
||||
- Group and limit search results by correspondent, show “more from this” links in the results.
|
||||
- Ability to search for “Similar documents” in the search results
|
||||
- Provide corrections for mispelled queries
|
||||
- **Nested tags**. Organize tags in a hierarchical structure. This will combine the benefits of folders and tags in one coherent system.
|
||||
- **An interactive consumer** that shows its progress for documents it processes on the web page.
|
||||
- With live updates ans websockets. This already works on a dev branch, but requires a lot of new dependencies, which I'm not particular happy about.
|
||||
- Notifications when a document was added with buttons to open the new document right away.
|
||||
@ -86,7 +93,7 @@ Please open an issue and start a discussion about it!
|
||||
|
||||
## Feel like helping out?
|
||||
|
||||
There's still lots of things to be done, just have a look at that issue log. If you feel like conctributing to the project, please do! Bug fixes and improvements to the front end (I just can't seem to get some of these CSS things right) are always welcome.
|
||||
There's still lots of things to be done, just have a look at that issue log. If you feel like contributing to the project, please do! Bug fixes and improvements to the front end (I just can't seem to get some of these CSS things right) are always welcome. The documentation has some basic information on how to get started.
|
||||
|
||||
If you want to implement something big: Please start a discussion about that in the issues! Maybe I've already had something similar in mind and we can make it happen together. However, keep in mind that the general roadmap is to make the existing features stable and get them tested. See the roadmap above.
|
||||
|
||||
@ -94,7 +101,7 @@ If you want to implement something big: Please start a discussion about that in
|
||||
|
||||
Paperless has been around a while now, and people are starting to build stuff on top of it. If you're one of those people, we can add your project to this list:
|
||||
|
||||
* [Paperless App](https://github.com/bauerj/paperless_app): An Android/iOS app for Paperless.
|
||||
* [Paperless App](https://github.com/bauerj/paperless_app): An Android/iOS app for Paperless. We're working on making this compatible.
|
||||
* [Paperless Desktop](https://github.com/thomasbrueggemann/paperless-desktop): A desktop UI for your Paperless installation. Runs on Mac, Linux, and Windows.
|
||||
* [ansible-role-paperless](https://github.com/ovv/ansible-role-paperless): An easy way to get Paperless running via Ansible.
|
||||
* [paperless-cli](https://github.com/stgarf/paperless-cli): A golang command line binary to interact with a Paperless instance.
|
||||
|
@ -32,8 +32,3 @@
|
||||
# The default language to use for OCR. Set this to the language most of your
|
||||
# documents are written in.
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
|
||||
# By default Paperless does not OCR a document if the text can be retrieved from
|
||||
# the document directly. Set to true to always OCR documents. (i.e., if you
|
||||
# know that some of your documents have faulty/bad OCR data)
|
||||
#PAPERLESS_OCR_ALWAYS=true
|
||||
|
@ -23,8 +23,9 @@ wait_for_postgres() {
|
||||
echo "Waiting for PostgreSQL to start..."
|
||||
|
||||
host="${PAPERLESS_DBHOST}"
|
||||
port="${PAPERLESS_DBPORT}"
|
||||
|
||||
while !</dev/tcp/$host/5432 ;
|
||||
while !</dev/tcp/$host/$port ;
|
||||
do
|
||||
|
||||
if [ $attempt_num -eq $max_attempts ]
|
||||
|
@ -15,7 +15,7 @@ services:
|
||||
POSTGRES_PASSWORD: paperless
|
||||
|
||||
webserver:
|
||||
image: jonaswinkler/paperless-ng:0.9.3
|
||||
image: jonaswinkler/paperless-ng:0.9.5
|
||||
restart: always
|
||||
depends_on:
|
||||
- db
|
||||
|
@ -5,7 +5,7 @@ services:
|
||||
restart: always
|
||||
|
||||
webserver:
|
||||
image: jonaswinkler/paperless-ng:0.9.3
|
||||
image: jonaswinkler/paperless-ng:0.9.5
|
||||
restart: always
|
||||
depends_on:
|
||||
- broker
|
||||
|
@ -11,12 +11,17 @@ RUN apt-get update \
|
||||
curl \
|
||||
ghostscript \
|
||||
gnupg \
|
||||
icc-profiles-free \
|
||||
imagemagick \
|
||||
libatlas-base-dev \
|
||||
liblept5 \
|
||||
libmagic-dev \
|
||||
libpoppler-cpp-dev \
|
||||
libpq-dev \
|
||||
libxml2 \
|
||||
optipng \
|
||||
pngquant \
|
||||
qpdf \
|
||||
sudo \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
@ -26,6 +31,7 @@ RUN apt-get update \
|
||||
tesseract-ocr-spa \
|
||||
tzdata \
|
||||
unpaper \
|
||||
zlib1g \
|
||||
&& pip3 install --upgrade supervisor setuptools \
|
||||
&& pip install --no-cache-dir -r requirements.txt \
|
||||
&& apt-get -y purge build-essential \
|
||||
|
@ -274,6 +274,7 @@ management command:
|
||||
|
||||
This command takes no arguments.
|
||||
|
||||
.. _`administration-index`:
|
||||
|
||||
Managing the document search index
|
||||
==================================
|
||||
@ -332,6 +333,42 @@ command:
|
||||
|
||||
The command takes no arguments and processes all your mail accounts and rules.
|
||||
|
||||
.. _utilities-archiver:
|
||||
|
||||
Creating archived documents
|
||||
===========================
|
||||
|
||||
Paperless stores archived PDF/A documents alongside your original documents.
|
||||
These archived documents will also contain selectable text for image-only
|
||||
originals.
|
||||
These documents are derived from the originals, which are always stored
|
||||
unmodified. If coming from an earlier version of paperless, your documents
|
||||
won't have archived versions.
|
||||
|
||||
This command creates PDF/A documents for your documents.
|
||||
|
||||
.. code::
|
||||
|
||||
document_archiver --overwrite --document <id>
|
||||
|
||||
This command will only attempt to create archived documents when no archived
|
||||
document exists yet, unless ``--overwrite`` is specified. If ``--document <id>``
|
||||
is specified, the archiver will only process that document.
|
||||
|
||||
.. note::
|
||||
|
||||
This command essentially performs OCR on all your documents again,
|
||||
according to your settings. If you run this with ``PAPERLESS_OCR_MODE=redo``,
|
||||
it will potentially run for a very long time. You can cancel the command
|
||||
at any time, since this command will skip already archived versions the next time
|
||||
it is run.
|
||||
|
||||
.. note::
|
||||
|
||||
Some documents will cause errors and cannot be converted into PDF/A documents,
|
||||
such as encrypted PDF documents. The archiver will skip over these Documents
|
||||
each time it sees them.
|
||||
|
||||
.. _utilities-encyption:
|
||||
|
||||
Managing encryption
|
||||
|
64
docs/api.rst
64
docs/api.rst
@ -38,6 +38,50 @@ individual documents:
|
||||
are in place. However, if you use these old URLs to access documents, you
|
||||
should update your app or script to use the new URLs.
|
||||
|
||||
.. note::
|
||||
|
||||
The document endpoint provides tags, document types and correspondents as
|
||||
ids in their corresponding fields. These are writeable. Paperless also
|
||||
offers read-only objects for assigned tags, types and correspondents,
|
||||
however, these might be removed in the future. As for now, the front end
|
||||
requires them.
|
||||
|
||||
Authorization
|
||||
#############
|
||||
|
||||
The REST api provides three different forms of authentication.
|
||||
|
||||
1. Basic authentication
|
||||
|
||||
Authorize by providing a HTTP header in the form
|
||||
|
||||
.. code::
|
||||
|
||||
Authorization: Basic <credentials>
|
||||
|
||||
where ``credentials`` is a base64-encoded string of ``<username>:<password>``
|
||||
|
||||
2. Session authentication
|
||||
|
||||
When you're logged into paperless in your browser, you're automatically
|
||||
logged into the API as well and don't need to provide any authorization
|
||||
headers.
|
||||
|
||||
3. Token authentication
|
||||
|
||||
Paperless also offers an endpoint to acquire authentication tokens.
|
||||
|
||||
POST a username and password as a form or json string to ``/api/token/``
|
||||
and paperless will respond with a token, if the login data is correct.
|
||||
This token can be used to authenticate other requests with the
|
||||
following HTTP header:
|
||||
|
||||
.. code::
|
||||
|
||||
Authorization: Token <token>
|
||||
|
||||
Tokens can be managed and revoked in the paperless admin.
|
||||
|
||||
Searching for documents
|
||||
#######################
|
||||
|
||||
@ -65,6 +109,7 @@ Result list object returned by the endpoint:
|
||||
"count": 1,
|
||||
"page": 1,
|
||||
"page_count": 1,
|
||||
"corrected_query": "",
|
||||
"results": [
|
||||
|
||||
]
|
||||
@ -75,6 +120,8 @@ Result list object returned by the endpoint:
|
||||
the page you requested, if you requested a page that is behind
|
||||
the last page. In that case, the last page is returned.
|
||||
* ``page_count``: The total number of pages.
|
||||
* ``corrected_query``: Corrected version of the query string. Can be null.
|
||||
If not null, can be used verbatim to start a new query.
|
||||
* ``results``: A list of result objects on the current page.
|
||||
|
||||
Result object:
|
||||
@ -166,8 +213,17 @@ The API provides a special endpoint for file uploads:
|
||||
|
||||
POST a multipart form to this endpoint, where the form field ``document`` contains
|
||||
the document that you want to upload to paperless. The filename is sanitized and
|
||||
then used to store the document in the consumption folder, where the consumer will
|
||||
detect the document and process it as any other document.
|
||||
then used to store the document in a temporary directory, and the consumer will
|
||||
be instructed to consume the document from there.
|
||||
|
||||
The endpoint will immediately return "OK." if the document was stored in the
|
||||
consumption directory.
|
||||
The endpoint supports the following optional form fields:
|
||||
|
||||
* ``title``: Specify a title that the consumer should use for the document.
|
||||
* ``correspondent``: Specify the ID of a correspondent that the consumer should use for the document.
|
||||
* ``document_type``: Similar to correspondent.
|
||||
* ``tags``: Similar to correspondent. Specify this multiple times to have multiple tags added
|
||||
to the document.
|
||||
|
||||
The endpoint will immediately return "OK" if the document consumption process
|
||||
was started successfully. No additional status information about the consumption
|
||||
process itself is available, since that happens in a different process.
|
||||
|
@ -5,15 +5,83 @@
|
||||
Changelog
|
||||
*********
|
||||
|
||||
paperless-ng 0.9.5
|
||||
##################
|
||||
|
||||
This release concludes the big changes I wanted to get rolled into paperless. The next releases before 1.0 will
|
||||
focus on fixing issues, primarily.
|
||||
|
||||
* OCR
|
||||
|
||||
* Paperless now uses `OCRmyPDF <https://github.com/jbarlow83/OCRmyPDF>`_ to perform OCR on documents.
|
||||
It still uses tesseract under the hood, but the PDF parser of Paperless has changed considerably and
|
||||
will behave different for some douments.
|
||||
* OCRmyPDF creates archived PDF/A documents with embedded text that can be selected in the front end.
|
||||
* Paperless stores archived versions of documents alongside with the originals. The originals can be
|
||||
accessed on the document edit page. If available, a dropdown menu will appear next to the download button.
|
||||
* Many of the configuration options regarding OCR have changed. See :ref:`configuration-ocr` for details.
|
||||
* Paperless no longer guesses the language of your documents. It always uses the language that you
|
||||
specified with ``PAPERLESS_OCR_LANGUAGE``. Be sure to set this to the language the majority of your
|
||||
documents are in. Multiple languages can be specified, but that requires more CPU time.
|
||||
* The management command :ref:`document_archiver <utilities-archiver>` can be used to create archived versions for already
|
||||
existing documents.
|
||||
|
||||
* Tags from consumption folder.
|
||||
|
||||
* Thanks to `jayme-github`_, paperless now consumes files from sub folders in the consumption folder and is able to assign tags
|
||||
based on the sub folders a document was found in. This can be configured with ``PAPERLESS_CONSUMER_RECURSIVE`` and
|
||||
``PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS``.
|
||||
|
||||
* API
|
||||
|
||||
* The API now offers token authentication.
|
||||
* The endpoint for uploading documents now supports specifying custom titles, correspondents, tags and types.
|
||||
This can be used by clients to override the default behavior of paperless. See :ref:`api-file_uploads`.
|
||||
* The document endpoint of API now serves documents in this form:
|
||||
|
||||
* correspondents, document types and tags are referenced by their ID in the fields ``correspondent``, ``document_type`` and ``tags``. The ``*_id`` versions are gone. These fields are read/write.
|
||||
* paperless does not serve nested tags, correspondents or types anymore.
|
||||
|
||||
* Front end
|
||||
|
||||
* Paperless does some basic caching of correspondents, tags and types and will only request them from the server when necessary or when entirely reloading the page.
|
||||
* Document list fetching is about 10%-30% faster now, especially when lots of tags/correspondents are present.
|
||||
* Some minor improvements to the front end, such as document count in the document list, better highlighting of the current page, and improvements to the filter behavior.
|
||||
|
||||
* Fixes:
|
||||
|
||||
* A bug with the generation of filenames for files with unsupported types caused the exporter and
|
||||
document saving to crash.
|
||||
* Mail handling no longer exits entirely when encountering errors. It will skip the account/rule/message on which the error occured.
|
||||
* Assigning correspondents from mail sender names failed for very long names. Paperless no longer assigns correspondents in these cases.
|
||||
|
||||
paperless-ng 0.9.4
|
||||
##################
|
||||
|
||||
* Front end: Clickable tags, correspondents and types allow quick filtering for related documents.
|
||||
* Front end: Saved views are now editable.
|
||||
* Front end: Preview documents directly in the browser.
|
||||
* Searching:
|
||||
|
||||
* Paperless now supports searching by tags, types and dates and correspondents. In order to have this applied to your
|
||||
existing documents, you need to perform a ``document_index reindex`` management command
|
||||
(see :ref:`administration-index`)
|
||||
that adds the data to the search index. You only need to do this once, since the schema of the search index changed.
|
||||
Paperless keeps the index updated after that whenever something changes.
|
||||
* Paperless now has spelling corrections ("Did you mean") for miss-typed queries.
|
||||
* The documentation contains :ref:`information about the query syntax <basic-searching>`.
|
||||
|
||||
* Front end:
|
||||
|
||||
* Clickable tags, correspondents and types allow quick filtering for related documents.
|
||||
* Saved views are now editable.
|
||||
* Preview documents directly in the browser.
|
||||
* Navigation from the dashboard to saved views.
|
||||
|
||||
* Fixes:
|
||||
|
||||
* A severe error when trying to use post consume scripts.
|
||||
* The documentation now contains information about bare metal installs.
|
||||
* An error in the consumer that cause invalid messages of missing files to show up in the log.
|
||||
|
||||
* The documentation now contains information about bare metal installs and a section about
|
||||
how to setup the development environment.
|
||||
|
||||
paperless-ng 0.9.3
|
||||
##################
|
||||
@ -732,6 +800,7 @@ bulk of the work on this big change.
|
||||
|
||||
* Initial release
|
||||
|
||||
.. _jayme-github: http://github.com/jayme-github
|
||||
.. _Brian Conn: https://github.com/TheConnMan
|
||||
.. _Christopher Luu: https://github.com/nuudles
|
||||
.. _Florian Jung: https://github.com/the01
|
||||
|
51
docs/conf.py
51
docs/conf.py
@ -1,48 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Paperless documentation build configuration file, created by
|
||||
# sphinx-quickstart on Mon Oct 26 18:36:52 2015.
|
||||
#
|
||||
# This file is execfile()d with the current directory set to its
|
||||
# containing dir.
|
||||
#
|
||||
# Note that not all possible configuration values are present in this
|
||||
# autogenerated file.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
import sphinx_rtd_theme
|
||||
|
||||
|
||||
__version__ = None
|
||||
exec(open("../src/paperless/version.py").read())
|
||||
|
||||
|
||||
# Believe it or not, this is the officially sanctioned way to add custom CSS.
|
||||
def setup(app):
|
||||
app.add_stylesheet("custom.css")
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
# -- General configuration ------------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
#needs_sphinx = '1.0'
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.todo',
|
||||
'sphinx.ext.imgmath',
|
||||
'sphinx.ext.viewcode',
|
||||
'sphinx_rtd_theme',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
# templates_path = ['_templates']
|
||||
|
||||
# The suffix of source filenames.
|
||||
source_suffix = '.rst'
|
||||
@ -115,7 +88,7 @@ pygments_style = 'sphinx'
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
html_theme = 'default'
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
@ -195,20 +168,6 @@ html_static_path = ['_static']
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'paperless'
|
||||
|
||||
|
||||
#
|
||||
# Attempt to use the ReadTheDocs theme. If it's not installed, fallback to
|
||||
# the default.
|
||||
#
|
||||
|
||||
try:
|
||||
import sphinx_rtd_theme
|
||||
html_theme = "sphinx_rtd_theme"
|
||||
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
|
||||
except ImportError as e:
|
||||
print("error " + str(e))
|
||||
pass
|
||||
|
||||
# -- Options for LaTeX output ---------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
|
@ -152,6 +152,122 @@ PAPERLESS_AUTO_LOGIN_USERNAME=<username>
|
||||
|
||||
Defaults to none, which disables this feature.
|
||||
|
||||
.. _configuration-ocr:
|
||||
|
||||
OCR settings
|
||||
############
|
||||
|
||||
Paperless uses `OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/>`_ for
|
||||
performing OCR on documents and images. Paperless uses sensible defaults for
|
||||
most settings, but all of them can be configured to your needs.
|
||||
|
||||
|
||||
PAPERLESS_OCR_LANGUAGE=<lang>
|
||||
Customize the language that paperless will attempt to use when
|
||||
parsing documents.
|
||||
|
||||
It should be a 3-letter language code consistent with ISO
|
||||
639: https://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
|
||||
Set this to the language most of your documents are written in.
|
||||
|
||||
This can be a combination of multiple languages such as ``deu+eng``,
|
||||
in which case tesseract will use whatever language matches best.
|
||||
Keep in mind that tesseract uses much more cpu time with multiple
|
||||
languages enabled.
|
||||
|
||||
Defaults to "eng".
|
||||
|
||||
PAPERLESS_OCR_MODE=<mode>
|
||||
Tell paperless when and how to perform ocr on your documents. Four modes
|
||||
are available:
|
||||
|
||||
* ``skip``: Paperless skips all pages and will perform ocr only on pages
|
||||
where no text is present. This is the safest option.
|
||||
* ``skip_noarchive``: In addition to skip, paperless won't create an
|
||||
archived version of your documents when it finds any text in them.
|
||||
This is useful if you don't want to have two almost-identical versions
|
||||
of your digital documents in the media folder. This is the fastest option.
|
||||
* ``redo``: Paperless will OCR all pages of your documents and attempt to
|
||||
replace any existing text layers with new text. This will be useful for
|
||||
documents from scanners that already performed OCR with insufficient
|
||||
results. It will also perform OCR on purely digital documents.
|
||||
|
||||
This option may fail on some documents that have features that cannot
|
||||
be removed, such as forms. In this case, the text from the document is
|
||||
used instead.
|
||||
* ``force``: Paperless rasterizes your documents, converting any text
|
||||
into images and puts the OCRed text on top. This works for all documents,
|
||||
however, the resulting document may be significantly larger and text
|
||||
won't appear as sharp when zoomed in.
|
||||
|
||||
The default is ``skip``, which only performs OCR when necessary and always
|
||||
creates archived documents.
|
||||
|
||||
PAPERLESS_OCR_OUTPUT_TYPE=<type>
|
||||
Specify the the type of PDF documents that paperless should produce.
|
||||
|
||||
* ``pdf``: Modify the PDF document as little as possible.
|
||||
* ``pdfa``: Convert PDF documents into PDF/A-2b documents, which is a
|
||||
subset of the entire PDF specification and meant for storing
|
||||
documents long term.
|
||||
* ``pdfa-1``, ``pdfa-2``, ``pdfa-3`` to specify the exact version of
|
||||
PDF/A you wish to use.
|
||||
|
||||
If not specified, ``pdfa`` is used. Remember that paperless also keeps
|
||||
the original input file as well as the archived version.
|
||||
|
||||
|
||||
PAPERLESS_OCR_PAGES=<num>
|
||||
Tells paperless to use only the specified amount of pages for OCR. Documents
|
||||
with less than the specified amount of pages get OCR'ed completely.
|
||||
|
||||
Specifying 1 here will only use the first page.
|
||||
|
||||
When combined with ``PAPERLESS_OCR_MODE=redo`` or ``PAPERLESS_OCR_MODE=force``,
|
||||
paperless will not modify any text it finds on excluded pages and copy it
|
||||
verbatim.
|
||||
|
||||
Defaults to 0, which disables this feature and always uses all pages.
|
||||
|
||||
|
||||
PAPERLESS_OCR_IMAGE_DPI=<num>
|
||||
Paperless will OCR any images you put into the system and convert them
|
||||
into PDF documents. This is useful if your scanner produces images.
|
||||
In order to do so, paperless needs to know the DPI of the image.
|
||||
Most images from scanners will have this information embedded and
|
||||
paperless will detect and use that information. In case this fails, it
|
||||
uses this value as a fallback.
|
||||
|
||||
Set this to the DPI your scanner produces images at.
|
||||
|
||||
Default is none, which causes paperless to fail if no DPI information is
|
||||
present in an image.
|
||||
|
||||
|
||||
PAPERLESS_OCR_USER_ARG=<json>
|
||||
OCRmyPDF offers many more options. Use this parameter to specify any
|
||||
additional arguments you wish to pass to OCRmyPDF. Since Paperless uses
|
||||
the API of OCRmyPDF, you have to specify these in a format that can be
|
||||
passed to the API. See `the API reference of OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/api.html#reference>`_
|
||||
for valid parameters. All command line options are supported, but they
|
||||
use underscores instead of dashed.
|
||||
|
||||
.. caution::
|
||||
|
||||
Paperless has been tested to work with the OCR options provided
|
||||
above. There are many options that are incompatible with each other,
|
||||
so specifying invalid options may prevent paperless from consuming
|
||||
any documents.
|
||||
|
||||
Specify arguments as a JSON dictionary. Keep note of lower case booleans
|
||||
and double quoted parameter names and strings. Examples:
|
||||
|
||||
.. code:: json
|
||||
|
||||
{"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}
|
||||
|
||||
|
||||
Software tweaks
|
||||
###############
|
||||
|
||||
@ -160,6 +276,7 @@ PAPERLESS_TASK_WORKERS=<num>
|
||||
maintain the automatic matching algorithm, check emails, consume documents,
|
||||
etc. This variable specifies how many things it will do in parallel.
|
||||
|
||||
|
||||
PAPERLESS_THREADS_PER_WORKER=<num>
|
||||
Furthermore, paperless uses multiple threads when consuming documents to
|
||||
speed up OCR. This variable specifies how many pages paperless will process
|
||||
@ -184,7 +301,6 @@ PAPERLESS_THREADS_PER_WORKER=<num>
|
||||
PAPERLESS_THREADS_PER_WORKER automatically.
|
||||
|
||||
|
||||
|
||||
PAPERLESS_TIME_ZONE=<timezone>
|
||||
Set the time zone here.
|
||||
See https://docs.djangoproject.com/en/3.1/ref/settings/#std:setting-TIME_ZONE
|
||||
@ -193,37 +309,6 @@ PAPERLESS_TIME_ZONE=<timezone>
|
||||
Defaults to UTC.
|
||||
|
||||
|
||||
|
||||
PAPERLESS_OCR_PAGES=<num>
|
||||
Tells paperless to use only the specified amount of pages for OCR. Documents
|
||||
with less than the specified amount of pages get OCR'ed completely.
|
||||
|
||||
Specifying 1 here will only use the first page.
|
||||
|
||||
Defaults to 0, which disables this feature and always uses all pages.
|
||||
|
||||
|
||||
|
||||
PAPERLESS_OCR_LANGUAGE=<lang>
|
||||
Customize the default language that tesseract will attempt to use when
|
||||
parsing documents. The default language is used whenever
|
||||
|
||||
* No language could be detected on a document
|
||||
* No tesseract data files are available for the detected language
|
||||
|
||||
It should be a 3-letter language code consistent with ISO
|
||||
639: https://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
|
||||
Set this to the language most of your documents are written in.
|
||||
|
||||
Defaults to "eng".
|
||||
|
||||
PAPERLESS_OCR_ALWAYS=<bool>
|
||||
By default Paperless does not OCR a document if the text can be retrieved from
|
||||
the document directly. Set to true to always OCR documents.
|
||||
|
||||
Defaults to false.
|
||||
|
||||
PAPERLESS_CONSUMER_POLLING=<num>
|
||||
If paperless won't find documents added to your consume folder, it might
|
||||
not be able to automatically detect filesystem changes. In that case,
|
||||
@ -232,12 +317,32 @@ PAPERLESS_CONSUMER_POLLING=<num>
|
||||
|
||||
Defaults to 0, which disables polling and uses filesystem notifications.
|
||||
|
||||
|
||||
PAPERLESS_CONSUMER_DELETE_DUPLICATES=<bool>
|
||||
When the consumer detects a duplicate document, it will not touch the
|
||||
original document. This default behavior can be changed here.
|
||||
|
||||
Defaults to false.
|
||||
|
||||
|
||||
PAPERLESS_CONSUMER_RECURSIVE=<bool>
|
||||
Enable recursive watching of the consumption directory. Paperless will
|
||||
then pickup files from files in subdirectories within your consumption
|
||||
directory as well.
|
||||
|
||||
Defaults to false.
|
||||
|
||||
|
||||
PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS=<bool>
|
||||
Set the names of subdirectories as tags for consumed files.
|
||||
E.g. <CONSUMPTION_DIR>/foo/bar/file.pdf will add the tags "foo" and "bar" to
|
||||
the consumed file. Paperless will create any tags that don't exist yet.
|
||||
|
||||
PAPERLESS_CONSUMER_RECURSIVE must be enabled for this to work.
|
||||
|
||||
Defaults to false.
|
||||
|
||||
|
||||
PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
|
||||
On smaller systems, or even in the case of Very Large Documents, the consumer
|
||||
may explode, complaining about how it's "unable to extend pixel cache". In
|
||||
@ -261,18 +366,6 @@ PAPERLESS_CONVERT_TMPDIR=<path>
|
||||
|
||||
Default is none, which disables the temporary directory.
|
||||
|
||||
PAPERLESS_CONVERT_DENSITY=<num>
|
||||
This setting has a high impact on the physical size of tmp page files,
|
||||
the speed of document conversion, and can affect the accuracy of OCR
|
||||
results. Individual results can vary and this setting should be tested
|
||||
thoroughly against the documents you are importing to see if it has any
|
||||
impacts either negative or positive.
|
||||
Testing on limited document sets has shown a setting of 200 can cut the
|
||||
size of tmp files by 1/3, and speed up conversion by up to 4x
|
||||
with little impact to OCR accuracy.
|
||||
|
||||
Default is 300.
|
||||
|
||||
PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
|
||||
Use optipng to optimize thumbnails. This usually reduces the size of
|
||||
thumbnails by about 20%, but uses considerable compute time during
|
||||
@ -319,8 +412,5 @@ PAPERLESS_CONVERT_BINARY=<path>
|
||||
PAPERLESS_GS_BINARY=<path>
|
||||
Defaults to "/usr/bin/gs".
|
||||
|
||||
PAPERLESS_UNPAPER_BINARY=<path>
|
||||
Defaults to "/usr/bin/unpaper".
|
||||
|
||||
PAPERLESS_OPTIPNG_BINARY=<path>
|
||||
Defaults to "/usr/bin/optipng".
|
||||
|
47
docs/faq.rst
47
docs/faq.rst
@ -3,6 +3,18 @@
|
||||
Frequently asked questions
|
||||
**************************
|
||||
|
||||
**Q:** *What's the general plan for Paperless-ng?*
|
||||
|
||||
**A:** Paperless-ng is already almost feature-complete. This project will remain
|
||||
as simple as it is right now. It will see improvements to features that are already there.
|
||||
If you need advanced features such as document versions,
|
||||
workflows or multi-user with customizable access to individual files, this is
|
||||
not the tool for you.
|
||||
|
||||
Features that *are* planned are some more quality of life extensions for the searching
|
||||
(i.e., search for similar documents, group results by correspondents with "more from this"
|
||||
links, etc), bulk editing and hierarchical tags.
|
||||
|
||||
**Q:** *I'm using docker. Where are my documents?*
|
||||
|
||||
**A:** Your documents are stored inside the docker volume ``paperless_media``.
|
||||
@ -21,11 +33,23 @@ is
|
||||
files around manually. This folder is meant to be entirely managed by docker
|
||||
and paperless.
|
||||
|
||||
**Q:** *Let's say you don't support this project anymore in a year. Can I easily move to other systems?*
|
||||
|
||||
**A:** Your documents are stored as plain files inside the media folder. You can always drag those files
|
||||
out of that folder to use them elsewhere. Here are a couple notes about that.
|
||||
|
||||
* Paperless never modifies your original documents. It keeps checksums of all documents and uses a
|
||||
scheduled sanity checker to check that they remain the same.
|
||||
* By default, paperless uses the internal ID of each document as its filename. This might not be very
|
||||
convenient for export. However, you can adjust the way files are stored in paperless by
|
||||
:ref:`configuring the filename format <advanced-file_name_handling>`.
|
||||
* :ref:`The exporter <utilities-exporter>` is another easy way to get your files out of paperless with reasonable file names.
|
||||
|
||||
**Q:** *What file types does paperless-ng support?*
|
||||
|
||||
**A:** Currently, the following files are supported:
|
||||
|
||||
* PDF documents, PNG images and JPEG images are processed with OCR.
|
||||
* PDF documents, PNG images, JPEG images, TIFF images and GIF images are processed with OCR and converted into PDF documents.
|
||||
* Plain text documents are supported as well and are added verbatim
|
||||
to paperless.
|
||||
|
||||
@ -53,3 +77,24 @@ in your browser and paperless has to do much less work to serve the data.
|
||||
that automatically, I'm all ears. For now, you have to grab the latest release
|
||||
archive from the project page and build the image yourself. The release comes
|
||||
with the front end already compiled, so you don't have to do this on the Pi.
|
||||
|
||||
**Q:** *How do I run this on my toaster?*
|
||||
|
||||
**A:** I honestly don't know! As for all other devices that might be able
|
||||
to run paperless, you're a bit on your own. If you can't run the docker image,
|
||||
the documentation has instructions for bare metal installs. I'm running
|
||||
paperless on an i3 processor from 2015 or so. This is also what I use to test
|
||||
new releases with. Apart from that, I also have a Raspberry Pi, which I
|
||||
occasionally build the image on and see if it works.
|
||||
|
||||
**Q:** *How do I proxy this with NGINX?*
|
||||
|
||||
.. code::
|
||||
|
||||
location / {
|
||||
proxy_pass http://localhost:8000/
|
||||
}
|
||||
|
||||
And that's about it. Paperless serves everything, including static files by itself
|
||||
when running the docker image. If you want to do anything fancy, you have to
|
||||
install paperless bare metal.
|
||||
|
@ -42,6 +42,9 @@ resources in the documentation:
|
||||
learn about how paperless automates all tagging using machine learning.
|
||||
* Paperless now comes with a :ref:`proper email consumer <usage-email>`
|
||||
that's fully tested and production ready.
|
||||
* Paperless creates searchable PDF/A documents from whatever you you put into
|
||||
the consumption directory. This means that you can select text in
|
||||
image-only documents coming from your scanner.
|
||||
* See :ref:`this note <utilities-encyption>` about GnuPG encryption in
|
||||
paperless-ng.
|
||||
* Paperless is now integrated with a
|
||||
|
@ -220,16 +220,25 @@ writing. Windows is not and will never be supported.
|
||||
* ``python3-dev``
|
||||
|
||||
* ``imagemagick`` >= 6 for PDF conversion
|
||||
* ``unpaper`` for cleaning documents before OCR
|
||||
* ``ghostscript``
|
||||
* ``optipng`` for optimising thumbnails
|
||||
* ``tesseract-ocr`` >= 4.0.0 for OCR
|
||||
* ``tesseract-ocr`` language packs (``tesseract-ocr-eng``, ``tesseract-ocr-deu``, etc)
|
||||
* ``gnupg`` for handling encrypted documents
|
||||
* ``libpoppler-cpp-dev`` for PDF to text conversion
|
||||
* ``libmagic-dev`` for mime type detection
|
||||
* ``libpq-dev`` for PostgreSQL
|
||||
|
||||
These dependencies are required for OCRmyPDF, which is used for text recognition.
|
||||
|
||||
* ``unpaper``
|
||||
* ``ghostscript``
|
||||
* ``icc-profiles-free``
|
||||
* ``qpdf``
|
||||
* ``liblept5``
|
||||
* ``libxml2``
|
||||
* ``pngquant``
|
||||
* ``zlib1g``
|
||||
* ``tesseract-ocr`` >= 4.0.0 for OCR
|
||||
* ``tesseract-ocr`` language packs (``tesseract-ocr-eng``, ``tesseract-ocr-deu``, etc)
|
||||
|
||||
You will also need ``build-essential``, ``python3-setuptools`` and ``python3-wheel``
|
||||
for installing some of the python dependencies. You can remove that
|
||||
again after installation.
|
||||
@ -404,7 +413,14 @@ Migration to paperless-ng is then performed in a few simple steps:
|
||||
``docker-compose.env`` to your needs.
|
||||
See `docker route`_ for details on which edits are advised.
|
||||
|
||||
6. In order to find your existing documents with the new search feature, you need
|
||||
6. Since ``docker-compose`` would just use the the old paperless image, we need to
|
||||
manually build a new image:
|
||||
|
||||
.. code:: shell-session
|
||||
|
||||
$ docker-compose build
|
||||
|
||||
7. In order to find your existing documents with the new search feature, you need
|
||||
to invoke a one-time operation that will create the search index:
|
||||
|
||||
.. code:: shell-session
|
||||
@ -414,7 +430,7 @@ Migration to paperless-ng is then performed in a few simple steps:
|
||||
This will migrate your database and create the search index. After that,
|
||||
paperless will take care of maintaining the index by itself.
|
||||
|
||||
7. Start paperless-ng.
|
||||
8. Start paperless-ng.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
@ -422,11 +438,11 @@ Migration to paperless-ng is then performed in a few simple steps:
|
||||
|
||||
This will run paperless in the background and automatically start it on system boot.
|
||||
|
||||
8. Paperless installed a permanent redirect to ``admin/`` in your browser. This
|
||||
9. Paperless installed a permanent redirect to ``admin/`` in your browser. This
|
||||
redirect is still in place and prevents access to the new UI. Clear
|
||||
browsing cache in order to fix this.
|
||||
|
||||
9. Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
|
||||
10. Optionally, follow the instructions below to migrate your existing data to PostgreSQL.
|
||||
|
||||
|
||||
.. _setup-sqlite_to_psql:
|
||||
@ -545,12 +561,10 @@ configuring some options in paperless can help improve performance immensely:
|
||||
sluggish response times during consumption, so you might want to lower these
|
||||
settings (example: 2 workers and 1 thread to always have some computing power
|
||||
left for other tasks).
|
||||
* Keep ``PAPERLESS_OCR_ALWAYS`` at its default value 'false' and consider OCR'ing
|
||||
* Keep ``PAPERLESS_OCR_MODE`` at its default value ``skip`` and consider OCR'ing
|
||||
your documents before feeding them into paperless. Some scanners are able to
|
||||
do this!
|
||||
* Lower ``PAPERLESS_CONVERT_DENSITY`` from its default value 300 to 200. This
|
||||
will still result in rather accurate OCR, but will decrease consumption time
|
||||
by quite a bit.
|
||||
do this! You might want to even specify ``skip_noarchive`` to skip archive
|
||||
file generation for already ocr'ed documents entirely.
|
||||
* Set ``PAPERLESS_OPTIMIZE_THUMBNAILS`` to 'false' if you want faster consumption
|
||||
times. Thumbnails will be about 20% larger.
|
||||
|
||||
|
@ -29,75 +29,23 @@ Check for the following issues:
|
||||
Consumer fails to pickup any new files
|
||||
######################################
|
||||
|
||||
If you notice, that the consumer will only pickup files in the consumption
|
||||
If you notice that the consumer will only pickup files in the consumption
|
||||
directory at startup, but won't find any other files added later, check out
|
||||
the configuration file and enable filesystem polling with the setting
|
||||
``PAPERLESS_CONSUMER_POLLING``.
|
||||
|
||||
Operation not permitted
|
||||
#######################
|
||||
|
||||
Consumer warns ``OCR for XX failed``
|
||||
####################################
|
||||
You might see errors such as:
|
||||
|
||||
If you find the OCR accuracy to be too low, and/or the document consumer warns
|
||||
that ``OCR for XX failed, but we're going to stick with what we've got since
|
||||
FORGIVING_OCR is enabled``, then you might need to install the
|
||||
`Tesseract language files <http://packages.ubuntu.com/search?keywords=tesseract-ocr>`_
|
||||
marching your document's languages.
|
||||
.. code::
|
||||
|
||||
As an example, if you are running Paperless from any Ubuntu or Debian
|
||||
box, and your documents are written in Spanish you may need to run::
|
||||
chown: changing ownership of '../export': Operation not permitted
|
||||
|
||||
apt-get install -y tesseract-ocr-spa
|
||||
The container tries to set file ownership on the listed directories. This is
|
||||
required so that the user running paperless inside docker has write permissions
|
||||
to these folders. This happens when pointing these directories to NFS shares,
|
||||
for example.
|
||||
|
||||
|
||||
|
||||
Consumer dies with ``convert: unable to extent pixel cache``
|
||||
############################################################
|
||||
|
||||
During the consumption process, Paperless invokes ImageMagick's ``convert``
|
||||
program to translate the source document into something that the OCR engine can
|
||||
understand and this can burn a Very Large amount of memory if the original
|
||||
document is rather long. Similarly, if your system doesn't have a lot of
|
||||
memory to begin with (ie. a Raspberry Pi), then this can happen for even
|
||||
medium-sized documents.
|
||||
|
||||
The solution is to tell ImageMagick *not* to Use All The RAM, as is its
|
||||
default, and instead tell it to used a fixed amount. ``convert`` will then
|
||||
break up the job into hundreds of individual files and use them to slowly
|
||||
compile the finished image. Simply set ``PAPERLESS_CONVERT_MEMORY_LIMIT`` in
|
||||
``/etc/paperless.conf`` to something like ``32000000`` and you'll limit
|
||||
``convert`` to 32MB. Fiddle with this value as you like.
|
||||
|
||||
**HOWEVER**: Simply setting this value may not be enough on system where
|
||||
``/tmp`` is mounted as tmpfs, as this is where ``convert`` will write its
|
||||
temporary files. In these cases (most Systemd machines), you need to tell
|
||||
ImageMagick to use a different space for its scratch work. You do this by
|
||||
setting ``PAPERLESS_CONVERT_TMPDIR`` in ``/etc/paperless.conf`` to somewhere
|
||||
that's actually on a physical disk (and writable by the user running
|
||||
Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch.
|
||||
|
||||
|
||||
DecompressionBombWarning and/or no text in the OCR output
|
||||
#########################################################
|
||||
|
||||
Some users have had issues using Paperless to consume PDFs that were created
|
||||
by merging Very Large Scanned Images into one PDF. If this happens to you,
|
||||
it's likely because the PDF you've created contains some very large pages
|
||||
(millions of pixels) and the process of converting the PDF to a OCR-friendly
|
||||
image is exploding.
|
||||
|
||||
Typically, this happens because the scanned images are created with a high
|
||||
DPI and then rolled into the PDF with an assumed DPI of 72 (the default).
|
||||
The best solution then is to specify the DPI used in the scan in the
|
||||
conversion-to-PDF step. So for example, if you scanned the original image
|
||||
with a DPI of 300, then merging the images into the single PDF with
|
||||
``convert`` should look like this:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ convert -density 300 *.jpg finished.pdf
|
||||
|
||||
For more information on this and situations like it, you should take a look
|
||||
at `Issue #118`_ as that's where this tip originated.
|
||||
|
||||
.. _Issue #118: https://github.com/the-paperless-project/paperless/issues/118
|
||||
Ensure that `chown` is possible on these directories.
|
||||
|
@ -60,6 +60,31 @@ Once you've got Paperless setup, you need to start feeding documents into it.
|
||||
Currently, there are three options: the consumption directory, IMAP (email), and
|
||||
HTTP POST.
|
||||
|
||||
When adding documents to paperless, it will perform the following operations on
|
||||
your documents:
|
||||
|
||||
1. OCR the document, if it has no text. Digital documents usually have text,
|
||||
and this step will be skipped for those documents.
|
||||
2. Paperless will create an archiveable PDF/A document from your document.
|
||||
If this document is coming from your scanner, it will have embedded selectable text.
|
||||
3. Paperless performs automatic matching of tags, correspondents and types on the
|
||||
document before storing it in the database.
|
||||
|
||||
.. hint::
|
||||
|
||||
This process can be configured to fit your needs. If you don't want paperless
|
||||
to create archived versions for digital documents, you can configure that by
|
||||
configuring ``PAPERLESS_OCR_MODE=skip_noarchive``. Please read the
|
||||
:ref:`relevant section in the documentation <configuration-ocr>`.
|
||||
|
||||
.. note::
|
||||
|
||||
No matter which options you choose, Paperless will always store the original
|
||||
document that it found in the consumption directory or in the mail and
|
||||
will never overwrite that document. Archived versions are stored alongside the
|
||||
digital versions.
|
||||
|
||||
|
||||
|
||||
The consumption directory
|
||||
=========================
|
||||
@ -156,6 +181,62 @@ REST API
|
||||
|
||||
You can also submit a document using the REST API, see :ref:`api-file_uploads` for details.
|
||||
|
||||
.. _basic-searching:
|
||||
|
||||
Searching
|
||||
#########
|
||||
|
||||
Paperless offers an extensive searching mechanism that is designed to allow you to quickly
|
||||
find a document you're looking for (for example, that thing that just broke and you bought
|
||||
a couple months ago, that contract you signed 8 years ago).
|
||||
|
||||
When you search paperless for a document, it tries to match this query against your documents.
|
||||
Paperless will look for matching documents by inspecting their content, title, correspondent,
|
||||
type and tags. Paperless returns a scored list of results, so that documents matching your query
|
||||
better will appear further up in the search results.
|
||||
|
||||
By default, paperless returns only documents which contain all words typed in the search bar.
|
||||
However, paperless also offers advanced search syntax if you want to drill down the results
|
||||
further.
|
||||
|
||||
Matching documents with logical expressions:
|
||||
|
||||
.. code::
|
||||
|
||||
shopname AND (product1 OR product2)
|
||||
|
||||
Matching specific tags, correspondents or types:
|
||||
|
||||
.. code::
|
||||
|
||||
type:invoice tag:unpaid
|
||||
correspondent:university certificate
|
||||
|
||||
Matching dates:
|
||||
|
||||
.. code::
|
||||
|
||||
created:[2005 to 2009]
|
||||
added:yesterday
|
||||
modified:today
|
||||
|
||||
Matching inexact words:
|
||||
|
||||
.. code::
|
||||
|
||||
produ*name
|
||||
|
||||
.. note::
|
||||
|
||||
Inexact terms are hard for search indexes. These queries might take a while to execute. That's why paperless offers
|
||||
auto complete and query correction.
|
||||
|
||||
All of these constructs can be combined as you see fit.
|
||||
If you want to learn more about the query language used by paperless, paperless uses Whoosh's default query language.
|
||||
Head over to `Whoosh query language <https://whoosh.readthedocs.io/en/latest/querylang.html>`_.
|
||||
For details on what date parsing utilities are available, see
|
||||
`Date parsing <https://whoosh.readthedocs.io/en/latest/dates.html#parsing-date-queries>`_.
|
||||
|
||||
|
||||
.. _usage-recommended_workflow:
|
||||
|
||||
|
@ -31,19 +31,24 @@
|
||||
#PAPERLESS_STATIC_URL=/static/
|
||||
#PAPERLESS_AUTO_LOGIN_USERNAME=
|
||||
|
||||
# OCR settings
|
||||
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
#PAPERLESS_OCR_MODE=skip
|
||||
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
|
||||
#PAPERLESS_OCR_PAGES=1
|
||||
#PAPERLESS_OCR_IMAGE_DPI=300
|
||||
#PAPERLESS_OCR_USER_ARG={}
|
||||
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
|
||||
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
|
||||
|
||||
# Software tweaks
|
||||
|
||||
#PAPERLESS_TASK_WORKERS=1
|
||||
#PAPERLESS_THREADS_PER_WORKER=1
|
||||
#PAPERLESS_TIME_ZONE=UTC
|
||||
#PAPERLESS_OCR_PAGES=1
|
||||
#PAPERLESS_OCR_LANGUAGE=eng
|
||||
#PAPERLESS_OCR_ALWAYS=false
|
||||
#PAPERLESS_CONSUMER_POLLING=10
|
||||
#PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
|
||||
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
|
||||
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
|
||||
#PAPERLESS_CONVERT_DENSITY=300
|
||||
#PAPERLESS_OPTIMIZE_THUMBNAILS=true
|
||||
#PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
|
||||
#PAPERLESS_FILENAME_DATE_ORDER=YMD
|
||||
@ -53,5 +58,4 @@
|
||||
|
||||
#PAPERLESS_CONVERT_BINARY=/usr/bin/convert
|
||||
#PAPERLESS_GS_BINARY=/usr/bin/gs
|
||||
#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
|
||||
#PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng
|
||||
|
@ -45,6 +45,7 @@ import { SavedViewWidgetComponent } from './components/dashboard/widgets/saved-v
|
||||
import { StatisticsWidgetComponent } from './components/dashboard/widgets/statistics-widget/statistics-widget.component';
|
||||
import { UploadFileWidgetComponent } from './components/dashboard/widgets/upload-file-widget/upload-file-widget.component';
|
||||
import { WidgetFrameComponent } from './components/dashboard/widgets/widget-frame/widget-frame.component';
|
||||
import { WelcomeWidgetComponent } from './components/dashboard/widgets/welcome-widget/welcome-widget.component';
|
||||
|
||||
@NgModule({
|
||||
declarations: [
|
||||
@ -82,7 +83,8 @@ import { WidgetFrameComponent } from './components/dashboard/widgets/widget-fram
|
||||
SavedViewWidgetComponent,
|
||||
StatisticsWidgetComponent,
|
||||
UploadFileWidgetComponent,
|
||||
WidgetFrameComponent
|
||||
WidgetFrameComponent,
|
||||
WelcomeWidgetComponent
|
||||
],
|
||||
imports: [
|
||||
BrowserModule,
|
||||
|
@ -50,6 +50,7 @@
|
||||
|
||||
.sidebar .nav-link.active {
|
||||
color: $primary;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.sidebar .nav-link:hover .sidebaricon,
|
||||
|
@ -90,7 +90,9 @@ export class AppFrameComponent implements OnInit, OnDestroy {
|
||||
}
|
||||
|
||||
ngOnDestroy() {
|
||||
this.openDocumentsSubscription.unsubscribe()
|
||||
if (this.openDocumentsSubscription) {
|
||||
this.openDocumentsSubscription.unsubscribe()
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -3,23 +3,19 @@
|
||||
</app-page-header>
|
||||
|
||||
<div class='row'>
|
||||
<div class="col-lg">
|
||||
<app-widget-frame title="Saved views" *ngIf="savedViews.length == 0">
|
||||
<p class="card-text">This space is reserved to display your saved views. Go to your documents and save a view
|
||||
to have it displayed
|
||||
here!</p>
|
||||
</app-widget-frame>
|
||||
<div class="col-lg-8">
|
||||
<app-welcome-widget *ngIf="savedViews.length == 0"></app-welcome-widget>
|
||||
|
||||
<ng-container *ngFor="let v of savedViews">
|
||||
<app-saved-view-widget [savedView]="v"></app-saved-view-widget>
|
||||
</ng-container>
|
||||
|
||||
</div>
|
||||
<div class="col-lg">
|
||||
<div class="col-lg-4">
|
||||
|
||||
<app-statistics-widget></app-statistics-widget>
|
||||
|
||||
<app-upload-file-widget></app-upload-file-widget>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -1,6 +1,9 @@
|
||||
<app-widget-frame [title]="savedView.title">
|
||||
|
||||
<table class="table table-sm table-hover table-borderless">
|
||||
<a header-buttons [routerLink]="" (click)="showAll()">Show all</a>
|
||||
|
||||
|
||||
<table content class="table table-sm table-hover table-borderless">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Created</th>
|
||||
@ -10,7 +13,7 @@
|
||||
<tbody>
|
||||
<tr *ngFor="let doc of documents" routerLink="/documents/{{doc.id}}">
|
||||
<td>{{doc.created | date}}</td>
|
||||
<td>{{doc.title}}<app-tag [tag]="t" *ngFor="let t of doc.tags" class="ml-1"></app-tag>
|
||||
<td>{{doc.title}}<app-tag [tag]="t" *ngFor="let t of doc.tags$ | async" class="ml-1"></app-tag>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
@ -1,6 +1,8 @@
|
||||
import { Component, Input, OnInit } from '@angular/core';
|
||||
import { Router } from '@angular/router';
|
||||
import { PaperlessDocument } from 'src/app/data/paperless-document';
|
||||
import { SavedViewConfig } from 'src/app/data/saved-view-config';
|
||||
import { DocumentListViewService } from 'src/app/services/document-list-view.service';
|
||||
import { DocumentService } from 'src/app/services/rest/document.service';
|
||||
|
||||
@Component({
|
||||
@ -10,7 +12,10 @@ import { DocumentService } from 'src/app/services/rest/document.service';
|
||||
})
|
||||
export class SavedViewWidgetComponent implements OnInit {
|
||||
|
||||
constructor(private documentService: DocumentService) { }
|
||||
constructor(
|
||||
private documentService: DocumentService,
|
||||
private router: Router,
|
||||
private list: DocumentListViewService) { }
|
||||
|
||||
@Input()
|
||||
savedView: SavedViewConfig
|
||||
@ -23,4 +28,9 @@ export class SavedViewWidgetComponent implements OnInit {
|
||||
})
|
||||
}
|
||||
|
||||
showAll() {
|
||||
this.list.load(this.savedView)
|
||||
this.router.navigate(["documents"])
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,4 +1,6 @@
|
||||
<app-widget-frame title="Statistics">
|
||||
<p class="card-text">Documents in inbox: {{statistics.documents_inbox}}</p>
|
||||
<p class="card-text">Total documents: {{statistics.documents_total}}</p>
|
||||
<ng-container content>
|
||||
<p class="card-text">Documents in inbox: {{statistics.documents_inbox}}</p>
|
||||
<p class="card-text">Total documents: {{statistics.documents_total}}</p>
|
||||
</ng-container>
|
||||
</app-widget-frame>
|
@ -1,6 +1,6 @@
|
||||
<app-widget-frame title="Upload new documents">
|
||||
|
||||
<form>
|
||||
<form content>
|
||||
<ngx-file-drop
|
||||
dropZoneLabel="Drop documents here or" (onFileDrop)="dropped($event)"
|
||||
(onFileOver)="fileOver($event)" (onFileLeave)="fileLeave($event)"
|
||||
|
@ -0,0 +1,16 @@
|
||||
<app-widget-frame title="First steps">
|
||||
|
||||
<ng-container content>
|
||||
<img src="assets/save-filter.png" class="float-right">
|
||||
<p>Paperless is running! :)</p>
|
||||
<p>You can start uploading documents by dropping them in the file upload box to the right or by dropping them in the configured consumption folder and they'll start showing up in the documents list.
|
||||
After you've added some metadata to your documents, use the filtering mechanisms of paperless to create custom views (such as 'Recently added', 'Tagged TODO') and have them displayed on the dashboard instead of this message.</p>
|
||||
<p>Paperless offers some more features that try to make your life easier, such as:</p>
|
||||
<ul>
|
||||
<li>Once you've got a couple documents in paperless and added metadata to them, paperless can assign that metadata to new documents automatically.</li>
|
||||
<li>You can configure paperless to read your mails and add documents from attached files.</li>
|
||||
</ul>
|
||||
<p>Consult the documentation on how to use these features. The section on basic usage also has some information on how to use paperless in general.</p>
|
||||
</ng-container>
|
||||
|
||||
</app-widget-frame>
|
@ -0,0 +1,25 @@
|
||||
import { ComponentFixture, TestBed } from '@angular/core/testing';
|
||||
|
||||
import { WelcomeWidgetComponent } from './welcome-widget.component';
|
||||
|
||||
describe('WelcomeWidgetComponent', () => {
|
||||
let component: WelcomeWidgetComponent;
|
||||
let fixture: ComponentFixture<WelcomeWidgetComponent>;
|
||||
|
||||
beforeEach(async () => {
|
||||
await TestBed.configureTestingModule({
|
||||
declarations: [ WelcomeWidgetComponent ]
|
||||
})
|
||||
.compileComponents();
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
fixture = TestBed.createComponent(WelcomeWidgetComponent);
|
||||
component = fixture.componentInstance;
|
||||
fixture.detectChanges();
|
||||
});
|
||||
|
||||
it('should create', () => {
|
||||
expect(component).toBeTruthy();
|
||||
});
|
||||
});
|
@ -0,0 +1,15 @@
|
||||
import { Component, OnInit } from '@angular/core';
|
||||
|
||||
@Component({
|
||||
selector: 'app-welcome-widget',
|
||||
templateUrl: './welcome-widget.component.html',
|
||||
styleUrls: ['./welcome-widget.component.scss']
|
||||
})
|
||||
export class WelcomeWidgetComponent implements OnInit {
|
||||
|
||||
constructor() { }
|
||||
|
||||
ngOnInit(): void {
|
||||
}
|
||||
|
||||
}
|
@ -1,8 +1,12 @@
|
||||
<div class="card mb-3 shadow">
|
||||
<div class="card-header">
|
||||
<h5 class="card-title mb-0">{{title}}</h5>
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<h5 class="card-title mb-0">{{title}}</h5>
|
||||
<ng-content select ="[header-buttons]"></ng-content>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div class="card-body text-dark">
|
||||
<ng-content></ng-content>
|
||||
<ng-content select ="[content]"></ng-content>
|
||||
</div>
|
||||
</div>
|
@ -5,12 +5,26 @@
|
||||
</svg>
|
||||
<span class="d-none d-lg-inline"> Delete</span>
|
||||
</button>
|
||||
<a [href]="downloadUrl" class="btn btn-sm btn-outline-primary mr-2">
|
||||
<svg class="buttonicon" fill="currentColor">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#download" />
|
||||
</svg>
|
||||
<span class="d-none d-lg-inline"> Download</span>
|
||||
</a>
|
||||
|
||||
<div class="btn-group mr-2">
|
||||
|
||||
<a [href]="downloadUrl" class="btn btn-sm btn-outline-primary">
|
||||
<svg class="buttonicon" fill="currentColor">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#download" />
|
||||
</svg>
|
||||
<span class="d-none d-lg-inline"> Download</span>
|
||||
</a>
|
||||
|
||||
<div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version">
|
||||
<button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button>
|
||||
<div class="dropdown-menu" ngbDropdownMenu>
|
||||
<a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<button type="button" class="btn btn-sm btn-outline-primary" (click)="close()">
|
||||
<svg class="buttonicon" fill="currentColor">
|
||||
<use xlink:href="assets/bootstrap-icons.svg#x" />
|
||||
@ -39,11 +53,11 @@
|
||||
<textarea class="form-control" id="content" rows="5" formControlName='content'></textarea>
|
||||
</div>
|
||||
|
||||
<app-input-select [items]="correspondents" title="Correspondent" formControlName="correspondent_id" allowNull="true" (createNew)="createCorrespondent()"></app-input-select>
|
||||
<app-input-select [items]="correspondents" title="Correspondent" formControlName="correspondent" allowNull="true" (createNew)="createCorrespondent()"></app-input-select>
|
||||
|
||||
<app-input-select [items]="documentTypes" title="Document type" formControlName="document_type_id" allowNull="true" (createNew)="createDocumentType()"></app-input-select>
|
||||
<app-input-select [items]="documentTypes" title="Document type" formControlName="document_type" allowNull="true" (createNew)="createDocumentType()"></app-input-select>
|
||||
|
||||
<app-input-tags formControlName="tags_id" title="Tags"></app-input-tags>
|
||||
<app-input-tags formControlName="tags" title="Tags"></app-input-tags>
|
||||
|
||||
<button type="button" class="btn btn-outline-secondary" (click)="discard()">Discard</button>
|
||||
<button type="button" class="btn btn-outline-primary" (click)="saveEditNext()" *ngIf="hasNext()">Save & edit next</button>
|
||||
|
@ -4,6 +4,7 @@ import { ActivatedRoute, Router } from '@angular/router';
|
||||
import { NgbModal } from '@ng-bootstrap/ng-bootstrap';
|
||||
import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent';
|
||||
import { PaperlessDocument } from 'src/app/data/paperless-document';
|
||||
import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata';
|
||||
import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
|
||||
import { DocumentListViewService } from 'src/app/services/document-list-view.service';
|
||||
import { OpenDocumentsService } from 'src/app/services/open-documents.service';
|
||||
@ -23,9 +24,11 @@ export class DocumentDetailComponent implements OnInit {
|
||||
|
||||
documentId: number
|
||||
document: PaperlessDocument
|
||||
metadata: PaperlessDocumentMetadata
|
||||
title: string
|
||||
previewUrl: string
|
||||
downloadUrl: string
|
||||
downloadOriginalUrl: string
|
||||
|
||||
correspondents: PaperlessCorrespondent[]
|
||||
documentTypes: PaperlessDocumentType[]
|
||||
@ -34,10 +37,10 @@ export class DocumentDetailComponent implements OnInit {
|
||||
title: new FormControl(''),
|
||||
content: new FormControl(''),
|
||||
created: new FormControl(),
|
||||
correspondent_id: new FormControl(),
|
||||
document_type_id: new FormControl(),
|
||||
correspondent: new FormControl(),
|
||||
document_type: new FormControl(),
|
||||
archive_serial_number: new FormControl(),
|
||||
tags_id: new FormControl([])
|
||||
tags: new FormControl([])
|
||||
})
|
||||
|
||||
constructor(
|
||||
@ -62,6 +65,7 @@ export class DocumentDetailComponent implements OnInit {
|
||||
this.documentId = +paramMap.get('id')
|
||||
this.previewUrl = this.documentsService.getPreviewUrl(this.documentId)
|
||||
this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId)
|
||||
this.downloadOriginalUrl = this.documentsService.getDownloadUrl(this.documentId, true)
|
||||
if (this.openDocumentService.getOpenDocument(this.documentId)) {
|
||||
this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId))
|
||||
} else {
|
||||
@ -76,6 +80,9 @@ export class DocumentDetailComponent implements OnInit {
|
||||
|
||||
updateComponent(doc: PaperlessDocument) {
|
||||
this.document = doc
|
||||
this.documentsService.getMetadata(doc.id).subscribe(result => {
|
||||
this.metadata = result
|
||||
})
|
||||
this.title = doc.title
|
||||
this.documentForm.patchValue(doc)
|
||||
}
|
||||
@ -86,7 +93,7 @@ export class DocumentDetailComponent implements OnInit {
|
||||
modal.componentInstance.success.subscribe(newDocumentType => {
|
||||
this.documentTypeService.listAll().subscribe(documentTypes => {
|
||||
this.documentTypes = documentTypes.results
|
||||
this.documentForm.get('document_type_id').setValue(newDocumentType.id)
|
||||
this.documentForm.get('document_type').setValue(newDocumentType.id)
|
||||
})
|
||||
})
|
||||
}
|
||||
@ -97,7 +104,7 @@ export class DocumentDetailComponent implements OnInit {
|
||||
modal.componentInstance.success.subscribe(newCorrespondent => {
|
||||
this.correspondentService.listAll().subscribe(correspondents => {
|
||||
this.correspondents = correspondents.results
|
||||
this.documentForm.get('correspondent_id').setValue(newCorrespondent.id)
|
||||
this.documentForm.get('correspondent').setValue(newCorrespondent.id)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
@ -9,9 +9,11 @@
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<h5 class="card-title">
|
||||
<ng-container *ngIf="document.correspondent">
|
||||
<a [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{document.correspondent.name}}</a>:
|
||||
<a *ngIf="clickCorrespondent.observers.length ; else nolink" [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{(document.correspondent$ | async)?.name}}</a>
|
||||
<ng-template #nolink>{{(document.correspondent$ | async)?.name}}</ng-template>:
|
||||
</ng-container>
|
||||
{{document.title}}<app-tag [tag]="t" linkTitle="Filter by tag" *ngFor="let t of document.tags" class="ml-1" (click)="clickTag.emit(t)" [clickable]="true"></app-tag>
|
||||
{{document.title}}
|
||||
<app-tag [tag]="t" linkTitle="Filter by tag" *ngFor="let t of document.tags$ | async" class="ml-1" (click)="clickTag.emit(t.id)" [clickable]="clickTag.observers.length"></app-tag>
|
||||
</h5>
|
||||
<h5 class="card-title" *ngIf="document.archive_serial_number">#{{document.archive_serial_number}}</h5>
|
||||
</div>
|
||||
|
@ -20,10 +20,10 @@ export class DocumentCardLargeComponent implements OnInit {
|
||||
details: any
|
||||
|
||||
@Output()
|
||||
clickTag = new EventEmitter<PaperlessTag>()
|
||||
clickTag = new EventEmitter<number>()
|
||||
|
||||
@Output()
|
||||
clickCorrespondent = new EventEmitter<PaperlessDocument>()
|
||||
clickCorrespondent = new EventEmitter<number>()
|
||||
|
||||
ngOnInit(): void {
|
||||
}
|
||||
|
@ -1,15 +1,15 @@
|
||||
<div class="col p-2 h-100" style="width: 16rem;">
|
||||
<div class="card h-100 shadow-sm">
|
||||
<div class=" border-bottom doc-img pr-1" [ngStyle]="{'background-image': 'url(' + getThumbUrl() + ')'}">
|
||||
<div class="row" *ngFor="let t of document.tags">
|
||||
<app-tag style="font-size: large;" [tag]="t" class="col text-right" (click)="clickTag.emit(t)" [clickable]="true" linkTitle="Filter by tag"></app-tag>
|
||||
<div class="row" *ngFor="let t of document.tags$ | async">
|
||||
<app-tag style="font-size: large;" [tag]="t" class="col text-right" (click)="clickTag.emit(t.id)" [clickable]="true" linkTitle="Filter by tag"></app-tag>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card-body p-2">
|
||||
<p class="card-text">
|
||||
<ng-container *ngIf="document.correspondent">
|
||||
<a [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{document.correspondent.name}}</a>:
|
||||
<a [routerLink]="" title="Filter by correspondent" (click)="clickCorrespondent.emit(document.correspondent)" class="font-weight-bold">{{(document.correspondent$ | async)?.name}}</a>:
|
||||
</ng-container>
|
||||
{{document.title}}
|
||||
</p>
|
||||
|
@ -16,10 +16,10 @@ export class DocumentCardSmallComponent implements OnInit {
|
||||
document: PaperlessDocument
|
||||
|
||||
@Output()
|
||||
clickTag = new EventEmitter<PaperlessTag>()
|
||||
clickTag = new EventEmitter<number>()
|
||||
|
||||
@Output()
|
||||
clickCorrespondent = new EventEmitter<PaperlessDocument>()
|
||||
clickCorrespondent = new EventEmitter<number>()
|
||||
|
||||
ngOnInit(): void {
|
||||
}
|
||||
|
@ -96,11 +96,12 @@
|
||||
<div class="card w-100 mb-3" [hidden]="!showFilter">
|
||||
<div class="card-body">
|
||||
<h5 class="card-title">Filter</h5>
|
||||
<app-filter-editor [(filterRules)]="filterRules" (apply)="applyFilterRules()"></app-filter-editor>
|
||||
<app-filter-editor [(filterRules)]="filterRules" (apply)="applyFilterRules()" (clear)="clearFilterRules()"></app-filter-editor>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row m-0 justify-content-end">
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<p>{{list.collectionSize || 0}} document(s)</p>
|
||||
<ngb-pagination [pageSize]="list.currentPageSize" [collectionSize]="list.collectionSize" [(page)]="list.currentPage" [maxSize]="5"
|
||||
[rotate]="true" (pageChange)="list.reload()" aria-label="Default pagination"></ngb-pagination>
|
||||
</div>
|
||||
@ -126,16 +127,16 @@
|
||||
</td>
|
||||
<td class="d-none d-md-table-cell">
|
||||
<ng-container *ngIf="d.correspondent">
|
||||
<a [routerLink]="" (click)="filterByCorrespondent(d.correspondent)" title="Filter by correspondent">{{d.correspondent.name}}</a>
|
||||
<a [routerLink]="" (click)="filterByCorrespondent(d.correspondent)" title="Filter by correspondent">{{(d.correspondent$ | async)?.name}}</a>
|
||||
</ng-container>
|
||||
</td>
|
||||
<td>
|
||||
<a routerLink="/documents/{{d.id}}" title="Edit document">{{d.title}}</a>
|
||||
<app-tag [tag]="t" *ngFor="let t of d.tags" class="ml-1" clickable="true" linkTitle="Filter by tag" (click)="filterByTag(t)"></app-tag>
|
||||
<app-tag [tag]="t" *ngFor="let t of d.tags$ | async" class="ml-1" clickable="true" linkTitle="Filter by tag" (click)="filterByTag(t.id)"></app-tag>
|
||||
</td>
|
||||
<td class="d-none d-xl-table-cell">
|
||||
<ng-container *ngIf="d.document_type">
|
||||
<a [routerLink]="" (click)="filterByDocumentType(d.document_type)" title="Filter by document type">{{d.document_type.name}}</a>
|
||||
<a [routerLink]="" (click)="filterByDocumentType(d.document_type)" title="Filter by document type">{{(d.document_type$ | async)?.name}}</a>
|
||||
</ng-container>
|
||||
</td>
|
||||
<td>
|
||||
@ -152,5 +153,3 @@
|
||||
<div class=" m-n2 row" *ngIf="displayMode == 'smallCards'">
|
||||
<app-document-card-small [document]="d" *ngFor="let d of list.documents" (clickTag)="filterByTag($event)" (clickCorrespondent)="filterByCorrespondent($event)"></app-document-card-small>
|
||||
</div>
|
||||
|
||||
<p *ngIf="list.documents.length == 0" class="mx-auto">No results</p>
|
||||
|
@ -3,9 +3,6 @@ import { ActivatedRoute } from '@angular/router';
|
||||
import { NgbModal } from '@ng-bootstrap/ng-bootstrap';
|
||||
import { cloneFilterRules, FilterRule } from 'src/app/data/filter-rule';
|
||||
import { FILTER_CORRESPONDENT, FILTER_DOCUMENT_TYPE, FILTER_HAS_TAG, FILTER_RULE_TYPES } from 'src/app/data/filter-rule-type';
|
||||
import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent';
|
||||
import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
|
||||
import { PaperlessTag } from 'src/app/data/paperless-tag';
|
||||
import { SavedViewConfig } from 'src/app/data/saved-view-config';
|
||||
import { DocumentListViewService } from 'src/app/services/document-list-view.service';
|
||||
import { DOCUMENT_SORT_FIELDS } from 'src/app/services/rest/document.service';
|
||||
@ -51,13 +48,14 @@ export class DocumentListComponent implements OnInit {
|
||||
this.route.paramMap.subscribe(params => {
|
||||
if (params.has('id')) {
|
||||
this.list.savedView = this.savedViewConfigService.getConfig(params.get('id'))
|
||||
this.filterRules = this.list.filterRules
|
||||
this.showFilter = false
|
||||
} else {
|
||||
this.list.savedView = null
|
||||
this.filterRules = this.list.filterRules
|
||||
this.showFilter = this.filterRules.length > 0
|
||||
}
|
||||
this.filterRules = this.list.filterRules
|
||||
//this.showFilter = this.filterRules.length > 0
|
||||
// prevents temporarily visible results from previous views
|
||||
this.list.documents = []
|
||||
this.list.clear()
|
||||
this.list.reload()
|
||||
})
|
||||
}
|
||||
@ -66,6 +64,11 @@ export class DocumentListComponent implements OnInit {
|
||||
this.list.filterRules = this.filterRules
|
||||
}
|
||||
|
||||
clearFilterRules() {
|
||||
this.list.filterRules = this.filterRules
|
||||
this.showFilter = false
|
||||
}
|
||||
|
||||
loadViewConfig(config: SavedViewConfig) {
|
||||
this.filterRules = cloneFilterRules(config.filterRules)
|
||||
this.list.load(config)
|
||||
@ -91,32 +94,42 @@ export class DocumentListComponent implements OnInit {
|
||||
})
|
||||
}
|
||||
|
||||
filterByTag(t: PaperlessTag) {
|
||||
if (this.filterRules.find(rule => rule.type.id == FILTER_HAS_TAG && rule.value == t.id)) {
|
||||
filterByTag(tag_id: number) {
|
||||
let filterRules = this.list.filterRules
|
||||
if (filterRules.find(rule => rule.type.id == FILTER_HAS_TAG && rule.value == tag_id)) {
|
||||
return
|
||||
}
|
||||
|
||||
this.filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_HAS_TAG), value: t.id})
|
||||
filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_HAS_TAG), value: tag_id})
|
||||
this.filterRules = filterRules
|
||||
this.applyFilterRules()
|
||||
}
|
||||
|
||||
filterByCorrespondent(c: PaperlessCorrespondent) {
|
||||
let existing_rule = this.filterRules.find(rule => rule.type.id == FILTER_CORRESPONDENT)
|
||||
if (existing_rule) {
|
||||
existing_rule.value = c.id
|
||||
filterByCorrespondent(correspondent_id: number) {
|
||||
let filterRules = this.list.filterRules
|
||||
let existing_rule = filterRules.find(rule => rule.type.id == FILTER_CORRESPONDENT)
|
||||
if (existing_rule && existing_rule.value == correspondent_id) {
|
||||
return
|
||||
} else if (existing_rule) {
|
||||
existing_rule.value = correspondent_id
|
||||
} else {
|
||||
this.filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_CORRESPONDENT), value: c.id})
|
||||
filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_CORRESPONDENT), value: correspondent_id})
|
||||
}
|
||||
this.filterRules = filterRules
|
||||
this.applyFilterRules()
|
||||
}
|
||||
|
||||
filterByDocumentType(dt: PaperlessDocumentType) {
|
||||
let existing_rule = this.filterRules.find(rule => rule.type.id == FILTER_DOCUMENT_TYPE)
|
||||
if (existing_rule) {
|
||||
existing_rule.value = dt.id
|
||||
filterByDocumentType(document_type_id: number) {
|
||||
let filterRules = this.list.filterRules
|
||||
let existing_rule = filterRules.find(rule => rule.type.id == FILTER_DOCUMENT_TYPE)
|
||||
if (existing_rule && existing_rule.value == document_type_id) {
|
||||
return
|
||||
} else if (existing_rule) {
|
||||
existing_rule.value = document_type_id
|
||||
} else {
|
||||
this.filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_DOCUMENT_TYPE), value: dt.id})
|
||||
filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_DOCUMENT_TYPE), value: document_type_id})
|
||||
}
|
||||
this.filterRules = filterRules
|
||||
this.applyFilterRules()
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,9 @@ export class FilterEditorComponent implements OnInit {
|
||||
|
||||
constructor(private documentTypeService: DocumentTypeService, private tagService: TagService, private correspondentService: CorrespondentService) { }
|
||||
|
||||
@Output()
|
||||
clear = new EventEmitter()
|
||||
|
||||
@Input()
|
||||
filterRules: FilterRule[] = []
|
||||
|
||||
@ -48,7 +51,7 @@ export class FilterEditorComponent implements OnInit {
|
||||
|
||||
clearClicked() {
|
||||
this.filterRules.splice(0,this.filterRules.length)
|
||||
this.apply.next()
|
||||
this.clear.next()
|
||||
}
|
||||
|
||||
ngOnInit(): void {
|
||||
|
@ -1,13 +1,21 @@
|
||||
<app-page-header title="Search results">
|
||||
</app-page-header>
|
||||
|
||||
<p>Search string: <i>{{query}}</i></p>
|
||||
<div *ngIf="errorMessage" class="alert alert-danger">Invalid search query: {{errorMessage}}</div>
|
||||
|
||||
<div [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()">
|
||||
<p>
|
||||
Search string: <i>{{query}}</i>
|
||||
<ng-container *ngIf="correctedQuery">
|
||||
- Did you mean "<a [routerLink]="" (click)="searchCorrectedQuery()">{{correctedQuery}}</a>"?
|
||||
</ng-container>
|
||||
|
||||
</p>
|
||||
|
||||
<div *ngIf="!errorMessage" [class.result-content-searching]="searching" infiniteScroll (scrolled)="onScroll()">
|
||||
<p>{{resultCount}} result(s)</p>
|
||||
<app-document-card-large *ngFor="let result of results"
|
||||
[document]="result.document"
|
||||
[details]="result.highlights">
|
||||
|
||||
</app-document-card-large>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { Component, OnInit } from '@angular/core';
|
||||
import { ActivatedRoute } from '@angular/router';
|
||||
import { ActivatedRoute, Router } from '@angular/router';
|
||||
import { SearchHit } from 'src/app/data/search-result';
|
||||
import { SearchService } from 'src/app/services/rest/search.service';
|
||||
|
||||
@ -9,7 +9,7 @@ import { SearchService } from 'src/app/services/rest/search.service';
|
||||
styleUrls: ['./search.component.scss']
|
||||
})
|
||||
export class SearchComponent implements OnInit {
|
||||
|
||||
|
||||
results: SearchHit[] = []
|
||||
|
||||
query: string = ""
|
||||
@ -22,7 +22,11 @@ export class SearchComponent implements OnInit {
|
||||
|
||||
resultCount
|
||||
|
||||
constructor(private searchService: SearchService, private route: ActivatedRoute) { }
|
||||
correctedQuery: string = null
|
||||
|
||||
errorMessage: string
|
||||
|
||||
constructor(private searchService: SearchService, private route: ActivatedRoute, private router: Router) { }
|
||||
|
||||
ngOnInit(): void {
|
||||
this.route.queryParamMap.subscribe(paramMap => {
|
||||
@ -31,10 +35,16 @@ export class SearchComponent implements OnInit {
|
||||
this.currentPage = 1
|
||||
this.loadPage()
|
||||
})
|
||||
|
||||
|
||||
}
|
||||
|
||||
searchCorrectedQuery() {
|
||||
this.router.navigate(["search"], {queryParams: {query: this.correctedQuery}})
|
||||
}
|
||||
|
||||
loadPage(append: boolean = false) {
|
||||
this.errorMessage = null
|
||||
this.correctedQuery = null
|
||||
this.searchService.search(this.query, this.currentPage).subscribe(result => {
|
||||
if (append) {
|
||||
this.results.push(...result.results)
|
||||
@ -44,12 +54,17 @@ export class SearchComponent implements OnInit {
|
||||
this.pageCount = result.page_count
|
||||
this.searching = false
|
||||
this.resultCount = result.count
|
||||
this.correctedQuery = result.corrected_query
|
||||
}, error => {
|
||||
this.searching = false
|
||||
this.resultCount = 1
|
||||
this.pageCount = 1
|
||||
this.results = []
|
||||
this.errorMessage = error.error
|
||||
})
|
||||
}
|
||||
|
||||
onScroll() {
|
||||
console.log(this.currentPage)
|
||||
console.log(this.pageCount)
|
||||
if (this.currentPage < this.pageCount) {
|
||||
this.currentPage += 1
|
||||
this.loadPage(true)
|
||||
|
11
src-ui/src/app/data/paperless-document-metadata.ts
Normal file
11
src-ui/src/app/data/paperless-document-metadata.ts
Normal file
@ -0,0 +1,11 @@
|
||||
export interface PaperlessDocumentMetadata {
|
||||
|
||||
paperless__checksum?: string
|
||||
|
||||
paperless__mime_type?: string
|
||||
|
||||
paperless__filename?: string
|
||||
|
||||
paperless__has_archive_version?: boolean
|
||||
|
||||
}
|
@ -2,16 +2,17 @@ import { PaperlessCorrespondent } from './paperless-correspondent'
|
||||
import { ObjectWithId } from './object-with-id'
|
||||
import { PaperlessTag } from './paperless-tag'
|
||||
import { PaperlessDocumentType } from './paperless-document-type'
|
||||
import { Observable } from 'rxjs'
|
||||
|
||||
export interface PaperlessDocument extends ObjectWithId {
|
||||
|
||||
correspondent?: PaperlessCorrespondent
|
||||
correspondent$?: Observable<PaperlessCorrespondent>
|
||||
|
||||
correspondent_id?: number
|
||||
correspondent?: number
|
||||
|
||||
document_type?: PaperlessDocumentType
|
||||
document_type$?: Observable<PaperlessDocumentType>
|
||||
|
||||
document_type_id?: number
|
||||
document_type?: number
|
||||
|
||||
title?: string
|
||||
|
||||
@ -19,9 +20,9 @@ export interface PaperlessDocument extends ObjectWithId {
|
||||
|
||||
file_type?: string
|
||||
|
||||
tags?: PaperlessTag[]
|
||||
tags$?: Observable<PaperlessTag[]>
|
||||
|
||||
tags_id?: number[]
|
||||
tags?: number[]
|
||||
|
||||
checksum?: string
|
||||
|
||||
|
@ -21,7 +21,9 @@ export interface SearchResult {
|
||||
page?: number
|
||||
page_count?: number
|
||||
|
||||
corrected_query?: string
|
||||
|
||||
results?: SearchHit[]
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -82,6 +82,12 @@ export class DocumentListViewService {
|
||||
this.reload()
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.collectionSize = null
|
||||
this.documents = []
|
||||
this.currentPage = 1
|
||||
}
|
||||
|
||||
reload(onFinish?) {
|
||||
this.isReloading = true
|
||||
this.documentService.list(
|
||||
|
@ -1,5 +1,6 @@
|
||||
import { HttpClient, HttpParams } from '@angular/common/http'
|
||||
import { Observable } from 'rxjs'
|
||||
import { Observable, of, Subject } from 'rxjs'
|
||||
import { map, publishReplay, refCount } from 'rxjs/operators'
|
||||
import { ObjectWithId } from 'src/app/data/object-with-id'
|
||||
import { Results } from 'src/app/data/results'
|
||||
import { environment } from 'src/environments/environment'
|
||||
@ -51,8 +52,28 @@ export abstract class AbstractPaperlessService<T extends ObjectWithId> {
|
||||
return this.http.get<Results<T>>(this.getResourceUrl(), {params: httpParams})
|
||||
}
|
||||
|
||||
private _listAll: Observable<Results<T>>
|
||||
|
||||
listAll(ordering?: string, extraParams?): Observable<Results<T>> {
|
||||
return this.list(1, 100000, ordering, extraParams)
|
||||
if (!this._listAll) {
|
||||
this._listAll = this.list(1, 100000, ordering, extraParams).pipe(
|
||||
publishReplay(1),
|
||||
refCount()
|
||||
)
|
||||
}
|
||||
return this._listAll
|
||||
}
|
||||
|
||||
getCached(id: number): Observable<T> {
|
||||
return this.listAll().pipe(
|
||||
map(list => list.results.find(o => o.id == id))
|
||||
)
|
||||
}
|
||||
|
||||
getCachedMany(ids: number[]): Observable<T[]> {
|
||||
return this.listAll().pipe(
|
||||
map(list => ids.map(id => list.results.find(o => o.id == id)))
|
||||
)
|
||||
}
|
||||
|
||||
get(id: number): Observable<T> {
|
||||
@ -60,14 +81,17 @@ export abstract class AbstractPaperlessService<T extends ObjectWithId> {
|
||||
}
|
||||
|
||||
create(o: T): Observable<T> {
|
||||
this._listAll = null
|
||||
return this.http.post<T>(this.getResourceUrl(), o)
|
||||
}
|
||||
|
||||
delete(o: T): Observable<any> {
|
||||
this._listAll = null
|
||||
return this.http.delete(this.getResourceUrl(o.id))
|
||||
}
|
||||
|
||||
update(o: T): Observable<T> {
|
||||
this._listAll = null
|
||||
return this.http.put<T>(this.getResourceUrl(o.id), o)
|
||||
}
|
||||
}
|
@ -1,10 +1,15 @@
|
||||
import { Injectable } from '@angular/core';
|
||||
import { PaperlessDocument } from 'src/app/data/paperless-document';
|
||||
import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata';
|
||||
import { AbstractPaperlessService } from './abstract-paperless-service';
|
||||
import { HttpClient } from '@angular/common/http';
|
||||
import { Observable } from 'rxjs';
|
||||
import { Results } from 'src/app/data/results';
|
||||
import { FilterRule } from 'src/app/data/filter-rule';
|
||||
import { map } from 'rxjs/operators';
|
||||
import { CorrespondentService } from './correspondent.service';
|
||||
import { DocumentTypeService } from './document-type.service';
|
||||
import { TagService } from './tag.service';
|
||||
|
||||
|
||||
export const DOCUMENT_SORT_FIELDS = [
|
||||
@ -26,7 +31,7 @@ export const SORT_DIRECTION_DESCENDING = "des"
|
||||
})
|
||||
export class DocumentService extends AbstractPaperlessService<PaperlessDocument> {
|
||||
|
||||
constructor(http: HttpClient) {
|
||||
constructor(http: HttpClient, private correspondentService: CorrespondentService, private documentTypeService: DocumentTypeService, private tagService: TagService) {
|
||||
super(http, 'documents')
|
||||
}
|
||||
|
||||
@ -46,26 +51,56 @@ export class DocumentService extends AbstractPaperlessService<PaperlessDocument>
|
||||
}
|
||||
}
|
||||
|
||||
list(page?: number, pageSize?: number, sortField?: string, sortDirection?: string, filterRules?: FilterRule[]): Observable<Results<PaperlessDocument>> {
|
||||
return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules))
|
||||
addObservablesToDocument(doc: PaperlessDocument) {
|
||||
if (doc.correspondent) {
|
||||
doc.correspondent$ = this.correspondentService.getCached(doc.correspondent)
|
||||
}
|
||||
if (doc.document_type) {
|
||||
doc.document_type$ = this.documentTypeService.getCached(doc.document_type)
|
||||
}
|
||||
if (doc.tags) {
|
||||
doc.tags$ = this.tagService.getCachedMany(doc.tags)
|
||||
}
|
||||
return doc
|
||||
}
|
||||
|
||||
getPreviewUrl(id: number): string {
|
||||
return this.getResourceUrl(id, 'preview')
|
||||
list(page?: number, pageSize?: number, sortField?: string, sortDirection?: string, filterRules?: FilterRule[]): Observable<Results<PaperlessDocument>> {
|
||||
return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)).pipe(
|
||||
map(results => {
|
||||
results.results.forEach(doc => this.addObservablesToDocument(doc))
|
||||
return results
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
getPreviewUrl(id: number, original: boolean = false): string {
|
||||
let url = this.getResourceUrl(id, 'preview')
|
||||
if (original) {
|
||||
url += "?original=true"
|
||||
}
|
||||
return url
|
||||
}
|
||||
|
||||
getThumbUrl(id: number): string {
|
||||
return this.getResourceUrl(id, 'thumb')
|
||||
}
|
||||
|
||||
getDownloadUrl(id: number): string {
|
||||
return this.getResourceUrl(id, 'download')
|
||||
getDownloadUrl(id: number, original: boolean = false): string {
|
||||
let url = this.getResourceUrl(id, 'download')
|
||||
if (original) {
|
||||
url += "?original=true"
|
||||
}
|
||||
return url
|
||||
}
|
||||
|
||||
uploadDocument(formData) {
|
||||
return this.http.post(this.getResourceUrl(null, 'post_document'), formData)
|
||||
}
|
||||
|
||||
getMetadata(id: number): Observable<PaperlessDocumentMetadata> {
|
||||
return this.http.get<PaperlessDocumentMetadata>(this.getResourceUrl(id, 'metadata'))
|
||||
}
|
||||
|
||||
bulk_edit(ids: number[], method: string, args: any[]) {
|
||||
return this.http.post(this.getResourceUrl(null, 'bulk_edit'), {
|
||||
'ids': ids,
|
||||
|
@ -1,9 +1,11 @@
|
||||
import { HttpClient, HttpParams } from '@angular/common/http';
|
||||
import { Injectable } from '@angular/core';
|
||||
import { Observable } from 'rxjs';
|
||||
import { map } from 'rxjs/operators';
|
||||
import { PaperlessDocument } from 'src/app/data/paperless-document';
|
||||
import { SearchResult } from 'src/app/data/search-result';
|
||||
import { environment } from 'src/environments/environment';
|
||||
import { DocumentService } from './document.service';
|
||||
|
||||
|
||||
@Injectable({
|
||||
@ -11,14 +13,19 @@ import { environment } from 'src/environments/environment';
|
||||
})
|
||||
export class SearchService {
|
||||
|
||||
constructor(private http: HttpClient) { }
|
||||
constructor(private http: HttpClient, private documentService: DocumentService) { }
|
||||
|
||||
search(query: string, page?: number): Observable<SearchResult> {
|
||||
let httpParams = new HttpParams().set('query', query)
|
||||
if (page) {
|
||||
httpParams = httpParams.set('page', page.toString())
|
||||
}
|
||||
return this.http.get<SearchResult>(`${environment.apiBaseUrl}search/`, {params: httpParams})
|
||||
return this.http.get<SearchResult>(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe(
|
||||
map(result => {
|
||||
result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document))
|
||||
return result
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
autocomplete(term: string): Observable<string[]> {
|
||||
|
BIN
src-ui/src/assets/save-filter.png
Normal file
BIN
src-ui/src/assets/save-filter.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 8.1 KiB |
@ -6,13 +6,15 @@ import os
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.db import transaction
|
||||
from django.db.models import Q
|
||||
from django.utils import timezone
|
||||
|
||||
from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
|
||||
from .file_handling import generate_filename, create_source_path_directory
|
||||
from .file_handling import create_source_path_directory
|
||||
from .loggers import LoggingMixin
|
||||
from .models import Document, FileInfo, Correspondent, DocumentType, Tag
|
||||
from .parsers import ParseError, get_parser_class_for_mime_type
|
||||
from .parsers import ParseError, get_parser_class_for_mime_type, \
|
||||
get_supported_file_extensions, parse_date
|
||||
from .signals import (
|
||||
document_consumption_finished,
|
||||
document_consumption_started
|
||||
@ -42,7 +44,7 @@ class Consumer(LoggingMixin):
|
||||
def pre_check_duplicate(self):
|
||||
with open(self.path, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
if Document.objects.filter(checksum=checksum).exists():
|
||||
if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists(): # NOQA: E501
|
||||
if settings.CONSUMER_DELETE_DUPLICATES:
|
||||
os.unlink(self.path)
|
||||
raise ConsumerError(
|
||||
@ -53,6 +55,7 @@ class Consumer(LoggingMixin):
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
|
||||
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
|
||||
os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)
|
||||
|
||||
def try_consume_file(self,
|
||||
path,
|
||||
@ -107,7 +110,7 @@ class Consumer(LoggingMixin):
|
||||
|
||||
# This doesn't parse the document yet, but gives us a parser.
|
||||
|
||||
document_parser = parser_class(self.path, self.logging_group)
|
||||
document_parser = parser_class(self.logging_group)
|
||||
|
||||
# However, this already created working directories which we have to
|
||||
# clean up.
|
||||
@ -115,13 +118,24 @@ class Consumer(LoggingMixin):
|
||||
# Parse the document. This may take some time.
|
||||
|
||||
try:
|
||||
self.log("debug", f"Generating thumbnail for {self.filename}...")
|
||||
thumbnail = document_parser.get_optimised_thumbnail()
|
||||
self.log("debug", "Parsing {}...".format(self.filename))
|
||||
document_parser.parse(self.path, mime_type)
|
||||
|
||||
self.log("debug", f"Generating thumbnail for {self.filename}...")
|
||||
thumbnail = document_parser.get_optimised_thumbnail(
|
||||
self.path, mime_type)
|
||||
|
||||
text = document_parser.get_text()
|
||||
date = document_parser.get_date()
|
||||
if not date:
|
||||
date = parse_date(self.filename, text)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
|
||||
except ParseError as e:
|
||||
document_parser.cleanup()
|
||||
self.log(
|
||||
"error",
|
||||
f"Error while consuming document {self.filename}: {e}")
|
||||
raise ConsumerError(e)
|
||||
|
||||
# Prepare the document classifier.
|
||||
@ -163,9 +177,24 @@ class Consumer(LoggingMixin):
|
||||
# After everything is in the database, copy the files into
|
||||
# place. If this fails, we'll also rollback the transaction.
|
||||
|
||||
# TODO: not required, since this is done by the file handling
|
||||
# logic
|
||||
create_source_path_directory(document.source_path)
|
||||
self._write(document, self.path, document.source_path)
|
||||
self._write(document, thumbnail, document.thumbnail_path)
|
||||
|
||||
self._write(document.storage_type,
|
||||
self.path, document.source_path)
|
||||
|
||||
self._write(document.storage_type,
|
||||
thumbnail, document.thumbnail_path)
|
||||
|
||||
if archive_path and os.path.isfile(archive_path):
|
||||
self._write(document.storage_type,
|
||||
archive_path, document.archive_path)
|
||||
|
||||
with open(archive_path, 'rb') as f:
|
||||
document.archive_checksum = hashlib.md5(
|
||||
f.read()).hexdigest()
|
||||
document.save()
|
||||
|
||||
# Afte performing all database operations and moving files
|
||||
# into place, tell paperless where the file is.
|
||||
@ -178,6 +207,11 @@ class Consumer(LoggingMixin):
|
||||
self.log("debug", "Deleting file {}".format(self.path))
|
||||
os.unlink(self.path)
|
||||
except Exception as e:
|
||||
self.log(
|
||||
"error",
|
||||
f"The following error occured while consuming "
|
||||
f"{self.filename}: {e}"
|
||||
)
|
||||
raise ConsumerError(e)
|
||||
finally:
|
||||
document_parser.cleanup()
|
||||
@ -242,7 +276,7 @@ class Consumer(LoggingMixin):
|
||||
for tag_id in self.override_tag_ids:
|
||||
document.tags.add(Tag.objects.get(pk=tag_id))
|
||||
|
||||
def _write(self, document, source, target):
|
||||
def _write(self, storage_type, source, target):
|
||||
with open(source, "rb") as read_file:
|
||||
with open(target, "wb") as write_file:
|
||||
write_file.write(read_file.read())
|
||||
|
@ -10,10 +10,13 @@ def create_source_path_directory(source_path):
|
||||
os.makedirs(os.path.dirname(source_path), exist_ok=True)
|
||||
|
||||
|
||||
def delete_empty_directories(directory):
|
||||
def delete_empty_directories(directory, root):
|
||||
if not os.path.isdir(directory):
|
||||
return
|
||||
|
||||
# Go up in the directory hierarchy and try to delete all directories
|
||||
directory = os.path.normpath(directory)
|
||||
root = os.path.normpath(settings.ORIGINALS_DIR)
|
||||
root = os.path.normpath(root)
|
||||
|
||||
if not directory.startswith(root + os.path.sep):
|
||||
# don't do anything outside our originals folder.
|
||||
@ -101,3 +104,8 @@ def generate_filename(doc):
|
||||
filename += ".gpg"
|
||||
|
||||
return filename
|
||||
|
||||
|
||||
def archive_name_from_filename(filename):
|
||||
|
||||
return os.path.splitext(filename)[0] + ".pdf"
|
||||
|
@ -1,59 +0,0 @@
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
import magic
|
||||
from django import forms
|
||||
from django.conf import settings
|
||||
from django_q.tasks import async_task
|
||||
from pathvalidate import validate_filename, ValidationError
|
||||
|
||||
from documents.parsers import is_mime_type_supported
|
||||
|
||||
|
||||
class UploadForm(forms.Form):
|
||||
|
||||
document = forms.FileField()
|
||||
|
||||
def clean_document(self):
|
||||
document_name = self.cleaned_data.get("document").name
|
||||
|
||||
try:
|
||||
validate_filename(document_name)
|
||||
except ValidationError:
|
||||
raise forms.ValidationError("That filename is suspicious.")
|
||||
|
||||
document_data = self.cleaned_data.get("document").read()
|
||||
|
||||
mime_type = magic.from_buffer(document_data, mime=True)
|
||||
|
||||
if not is_mime_type_supported(mime_type):
|
||||
raise forms.ValidationError("This mime type is not supported.")
|
||||
|
||||
return document_name, document_data
|
||||
|
||||
def save(self):
|
||||
"""
|
||||
Since the consumer already does a lot of work, it's easier just to save
|
||||
to-be-consumed files to the consumption directory rather than have the
|
||||
form do that as well. Think of it as a poor-man's queue server.
|
||||
"""
|
||||
|
||||
original_filename, data = self.cleaned_data.get("document")
|
||||
|
||||
t = int(mktime(datetime.now().timetuple()))
|
||||
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
|
||||
with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
|
||||
dir=settings.SCRATCH_DIR,
|
||||
delete=False) as f:
|
||||
|
||||
f.write(data)
|
||||
os.utime(f.name, times=(t, t))
|
||||
|
||||
async_task("documents.tasks.consume_file",
|
||||
f.name,
|
||||
override_filename=original_filename,
|
||||
task_name=os.path.basename(original_filename)[:100])
|
@ -4,10 +4,11 @@ from contextlib import contextmanager
|
||||
|
||||
from django.conf import settings
|
||||
from whoosh import highlight
|
||||
from whoosh.fields import Schema, TEXT, NUMERIC
|
||||
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
|
||||
from whoosh.highlight import Formatter, get_text
|
||||
from whoosh.index import create_in, exists_in, open_dir
|
||||
from whoosh.qparser import MultifieldParser
|
||||
from whoosh.qparser.dateparse import DateParserPlugin
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
|
||||
@ -59,14 +60,19 @@ def get_schema():
|
||||
id=NUMERIC(stored=True, unique=True, numtype=int),
|
||||
title=TEXT(stored=True),
|
||||
content=TEXT(),
|
||||
correspondent=TEXT(stored=True)
|
||||
correspondent=TEXT(stored=True),
|
||||
tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
|
||||
type=TEXT(stored=True),
|
||||
created=DATETIME(stored=True, sortable=True),
|
||||
modified=DATETIME(stored=True, sortable=True),
|
||||
added=DATETIME(stored=True, sortable=True),
|
||||
)
|
||||
|
||||
|
||||
def open_index(recreate=False):
|
||||
try:
|
||||
if exists_in(settings.INDEX_DIR) and not recreate:
|
||||
return open_dir(settings.INDEX_DIR)
|
||||
return open_dir(settings.INDEX_DIR, schema=get_schema())
|
||||
except Exception as e:
|
||||
logger.error(f"Error while opening the index: {e}, recreating.")
|
||||
|
||||
@ -76,16 +82,27 @@ def open_index(recreate=False):
|
||||
|
||||
|
||||
def update_document(writer, doc):
|
||||
# TODO: this line caused many issues all around, since:
|
||||
# We need to make sure that this method does not get called with
|
||||
# deserialized documents (i.e, document objects that don't come from
|
||||
# Django's ORM interfaces directly.
|
||||
logger.debug("Indexing {}...".format(doc))
|
||||
tags = ",".join([t.name for t in doc.tags.all()])
|
||||
writer.update_document(
|
||||
id=doc.pk,
|
||||
title=doc.title,
|
||||
content=doc.content,
|
||||
correspondent=doc.correspondent.name if doc.correspondent else None
|
||||
correspondent=doc.correspondent.name if doc.correspondent else None,
|
||||
tag=tags if tags else None,
|
||||
type=doc.document_type.name if doc.document_type else None,
|
||||
created=doc.created,
|
||||
added=doc.added,
|
||||
modified=doc.modified,
|
||||
)
|
||||
|
||||
|
||||
def remove_document(writer, doc):
|
||||
# TODO: see above.
|
||||
logger.debug("Removing {} from index...".format(doc))
|
||||
writer.delete_by_term('id', doc.pk)
|
||||
|
||||
@ -103,16 +120,27 @@ def remove_document_from_index(document):
|
||||
|
||||
|
||||
@contextmanager
|
||||
def query_page(ix, query, page):
|
||||
def query_page(ix, querystring, page):
|
||||
searcher = ix.searcher()
|
||||
try:
|
||||
query_parser = MultifieldParser(["content", "title", "correspondent"],
|
||||
ix.schema).parse(query)
|
||||
result_page = searcher.search_page(query_parser, page)
|
||||
qp = MultifieldParser(
|
||||
["content", "title", "correspondent", "tag", "type"],
|
||||
ix.schema)
|
||||
qp.add_plugin(DateParserPlugin())
|
||||
|
||||
q = qp.parse(querystring)
|
||||
result_page = searcher.search_page(q, page)
|
||||
result_page.results.fragmenter = highlight.ContextFragmenter(
|
||||
surround=50)
|
||||
result_page.results.formatter = JsonFormatter()
|
||||
yield result_page
|
||||
|
||||
corrected = searcher.correct_query(q, querystring)
|
||||
if corrected.query != q:
|
||||
corrected_query = corrected.string
|
||||
else:
|
||||
corrected_query = None
|
||||
|
||||
yield result_page, corrected_query
|
||||
finally:
|
||||
searcher.close()
|
||||
|
||||
|
@ -28,10 +28,10 @@ class LoggingMixin:
|
||||
def renew_logging_group(self):
|
||||
self.logging_group = uuid.uuid4()
|
||||
|
||||
def log(self, level, message):
|
||||
def log(self, level, message, **kwargs):
|
||||
target = ".".join([self.__class__.__module__, self.__class__.__name__])
|
||||
logger = logging.getLogger(target)
|
||||
|
||||
getattr(logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
})
|
||||
}, **kwargs)
|
||||
|
128
src/documents/management/commands/document_archiver.py
Normal file
128
src/documents/management/commands/document_archiver.py
Normal file
@ -0,0 +1,128 @@
|
||||
import hashlib
|
||||
import multiprocessing
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
|
||||
import tqdm
|
||||
from django import db
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db import transaction
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents.models import Document
|
||||
from ... import index
|
||||
from ...file_handling import create_source_path_directory
|
||||
from ...mixins import Renderable
|
||||
from ...parsers import get_parser_class_for_mime_type
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def handle_document(document_id):
|
||||
document = Document.objects.get(id=document_id)
|
||||
|
||||
mime_type = document.mime_type
|
||||
|
||||
parser_class = get_parser_class_for_mime_type(mime_type)
|
||||
|
||||
parser = parser_class(logging_group=uuid.uuid4())
|
||||
|
||||
try:
|
||||
parser.parse(document.source_path, mime_type)
|
||||
|
||||
if parser.get_archive_path():
|
||||
with transaction.atomic():
|
||||
with open(parser.get_archive_path(), 'rb') as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
# i'm going to save first so that in case the file move
|
||||
# fails, the database is rolled back.
|
||||
# we also don't use save() since that triggers the filehandling
|
||||
# logic, and we don't want that yet (file not yet in place)
|
||||
Document.objects.filter(pk=document.pk).update(
|
||||
archive_checksum=checksum,
|
||||
content=parser.get_text()
|
||||
)
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.move(parser.get_archive_path(), document.archive_path)
|
||||
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, document)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error while parsing document {document}: {str(e)}")
|
||||
finally:
|
||||
parser.cleanup()
|
||||
|
||||
|
||||
class Command(Renderable, BaseCommand):
|
||||
|
||||
help = """
|
||||
Using the current classification model, assigns correspondents, tags
|
||||
and document types to all documents, effectively allowing you to
|
||||
back-tag all previously indexed documents with metadata created (or
|
||||
modified) after their initial import.
|
||||
""".replace(" ", "")
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.verbosity = 0
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
"-f", "--overwrite",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Recreates the archived document for documents that already "
|
||||
"have an archived version."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d", "--document",
|
||||
default=None,
|
||||
type=int,
|
||||
required=False,
|
||||
help="Specify the ID of a document, and this command will only "
|
||||
"run on this specific document."
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
|
||||
overwrite = options["overwrite"]
|
||||
|
||||
if options['document']:
|
||||
documents = Document.objects.filter(pk=options['document'])
|
||||
else:
|
||||
documents = Document.objects.all()
|
||||
|
||||
document_ids = list(map(
|
||||
lambda doc: doc.id,
|
||||
filter(
|
||||
lambda d: overwrite or not d.archive_checksum,
|
||||
documents
|
||||
)
|
||||
))
|
||||
|
||||
# Note to future self: this prevents django from reusing database
|
||||
# conncetions between processes, which is bad and does not work
|
||||
# with postgres.
|
||||
db.connections.close_all()
|
||||
|
||||
try:
|
||||
|
||||
logging.getLogger().handlers[0].level = logging.ERROR
|
||||
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
|
||||
list(tqdm.tqdm(
|
||||
pool.imap_unordered(
|
||||
handle_document,
|
||||
document_ids
|
||||
),
|
||||
total=len(document_ids)
|
||||
))
|
||||
except KeyboardInterrupt:
|
||||
print("Aborting...")
|
@ -1,31 +1,69 @@
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.utils.text import slugify
|
||||
from django_q.tasks import async_task
|
||||
from watchdog.events import FileSystemEventHandler
|
||||
from watchdog.observers.polling import PollingObserver
|
||||
|
||||
from documents.models import Tag
|
||||
from documents.parsers import is_file_ext_supported
|
||||
|
||||
try:
|
||||
from inotify_simple import INotify, flags
|
||||
from inotifyrecursive import INotify, flags
|
||||
except ImportError:
|
||||
INotify = flags = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _consume(file):
|
||||
try:
|
||||
if os.path.isfile(file):
|
||||
async_task("documents.tasks.consume_file",
|
||||
file,
|
||||
task_name=os.path.basename(file)[:100])
|
||||
else:
|
||||
logger.debug(
|
||||
f"Not consuming file {file}: File has moved.")
|
||||
def _tags_from_path(filepath):
|
||||
"""Walk up the directory tree from filepath to CONSUMPTION_DIr
|
||||
and get or create Tag IDs for every directory.
|
||||
"""
|
||||
tag_ids = set()
|
||||
path_parts = Path(filepath).relative_to(
|
||||
settings.CONSUMPTION_DIR).parent.parts
|
||||
for part in path_parts:
|
||||
tag_ids.add(Tag.objects.get_or_create(
|
||||
slug=slugify(part),
|
||||
defaults={"name": part},
|
||||
)[0].pk)
|
||||
|
||||
return tag_ids
|
||||
|
||||
|
||||
def _consume(filepath):
|
||||
if os.path.isdir(filepath):
|
||||
return
|
||||
|
||||
if not os.path.isfile(filepath):
|
||||
logger.debug(
|
||||
f"Not consuming file {filepath}: File has moved.")
|
||||
return
|
||||
|
||||
if not is_file_ext_supported(os.path.splitext(filepath)[1]):
|
||||
logger.debug(
|
||||
f"Not consuming file {filepath}: Unknown file extension.")
|
||||
return
|
||||
|
||||
tag_ids = None
|
||||
try:
|
||||
if settings.CONSUMER_SUBDIRS_AS_TAGS:
|
||||
tag_ids = _tags_from_path(filepath)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Error creating tags from path: {}".format(e))
|
||||
|
||||
try:
|
||||
async_task("documents.tasks.consume_file",
|
||||
filepath,
|
||||
override_tag_ids=tag_ids if tag_ids else None,
|
||||
task_name=os.path.basename(filepath)[:100])
|
||||
except Exception as e:
|
||||
# Catch all so that the consumer won't crash.
|
||||
# This is also what the test case is listening for to check for
|
||||
@ -94,6 +132,7 @@ class Command(BaseCommand):
|
||||
|
||||
def handle(self, *args, **options):
|
||||
directory = options["directory"]
|
||||
recursive = settings.CONSUMER_RECURSIVE
|
||||
|
||||
if not directory:
|
||||
raise CommandError(
|
||||
@ -104,24 +143,30 @@ class Command(BaseCommand):
|
||||
raise CommandError(
|
||||
f"Consumption directory {directory} does not exist")
|
||||
|
||||
for entry in os.scandir(directory):
|
||||
_consume(entry.path)
|
||||
if recursive:
|
||||
for dirpath, _, filenames in os.walk(directory):
|
||||
for filename in filenames:
|
||||
filepath = os.path.join(dirpath, filename)
|
||||
_consume(filepath)
|
||||
else:
|
||||
for entry in os.scandir(directory):
|
||||
_consume(entry.path)
|
||||
|
||||
if options["oneshot"]:
|
||||
return
|
||||
|
||||
if settings.CONSUMER_POLLING == 0 and INotify:
|
||||
self.handle_inotify(directory)
|
||||
self.handle_inotify(directory, recursive)
|
||||
else:
|
||||
self.handle_polling(directory)
|
||||
self.handle_polling(directory, recursive)
|
||||
|
||||
logger.debug("Consumer exiting.")
|
||||
|
||||
def handle_polling(self, directory):
|
||||
def handle_polling(self, directory, recursive):
|
||||
logging.getLogger(__name__).info(
|
||||
f"Polling directory for changes: {directory}")
|
||||
self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
|
||||
self.observer.schedule(Handler(), directory, recursive=False)
|
||||
self.observer.schedule(Handler(), directory, recursive=recursive)
|
||||
self.observer.start()
|
||||
try:
|
||||
while self.observer.is_alive():
|
||||
@ -132,18 +177,26 @@ class Command(BaseCommand):
|
||||
self.observer.stop()
|
||||
self.observer.join()
|
||||
|
||||
def handle_inotify(self, directory):
|
||||
def handle_inotify(self, directory, recursive):
|
||||
logging.getLogger(__name__).info(
|
||||
f"Using inotify to watch directory for changes: {directory}")
|
||||
|
||||
inotify = INotify()
|
||||
descriptor = inotify.add_watch(
|
||||
directory, flags.CLOSE_WRITE | flags.MOVED_TO)
|
||||
inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO
|
||||
if recursive:
|
||||
descriptor = inotify.add_watch_recursive(directory, inotify_flags)
|
||||
else:
|
||||
descriptor = inotify.add_watch(directory, inotify_flags)
|
||||
|
||||
try:
|
||||
while not self.stop_flag:
|
||||
for event in inotify.read(timeout=1000, read_delay=1000):
|
||||
file = os.path.join(directory, event.name)
|
||||
_consume(file)
|
||||
for event in inotify.read(timeout=1000):
|
||||
if recursive:
|
||||
path = inotify.get_path(event.wd)
|
||||
else:
|
||||
path = directory
|
||||
filepath = os.path.join(path, event.name)
|
||||
_consume(filepath)
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
|
@ -7,7 +7,8 @@ from django.core import serializers
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from documents.models import Document, Correspondent, Tag, DocumentType
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
|
||||
EXPORTER_ARCHIVE_NAME
|
||||
from paperless.db import GnuPG
|
||||
from ...mixins import Renderable
|
||||
|
||||
@ -54,7 +55,6 @@ class Command(Renderable, BaseCommand):
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
unique_filename = f"{document.pk:07}_{document.file_name}"
|
||||
|
||||
file_target = os.path.join(self.target, unique_filename)
|
||||
|
||||
thumbnail_name = unique_filename + "-thumbnail.png"
|
||||
@ -63,6 +63,14 @@ class Command(Renderable, BaseCommand):
|
||||
document_dict[EXPORTER_FILE_NAME] = unique_filename
|
||||
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
|
||||
|
||||
if os.path.exists(document.archive_path):
|
||||
archive_name = \
|
||||
f"{document.pk:07}_archive_{document.archive_file_name}"
|
||||
archive_target = os.path.join(self.target, archive_name)
|
||||
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
|
||||
else:
|
||||
archive_target = None
|
||||
|
||||
print(f"Exporting: {file_target}")
|
||||
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
@ -76,11 +84,18 @@ class Command(Renderable, BaseCommand):
|
||||
f.write(GnuPG.decrypted(document.thumbnail_file))
|
||||
os.utime(thumbnail_target, times=(t, t))
|
||||
|
||||
if archive_target:
|
||||
with open(archive_target, "wb") as f:
|
||||
f.write(GnuPG.decrypted(document.archive_path))
|
||||
os.utime(archive_target, times=(t, t))
|
||||
else:
|
||||
|
||||
shutil.copy(document.source_path, file_target)
|
||||
shutil.copy(document.thumbnail_path, thumbnail_target)
|
||||
|
||||
if archive_target:
|
||||
shutil.copy(document.archive_path, archive_target)
|
||||
|
||||
manifest += json.loads(
|
||||
serializers.serialize("json", Correspondent.objects.all()))
|
||||
|
||||
|
@ -7,8 +7,8 @@ from django.core.management import call_command
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
|
||||
from documents.models import Document
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
|
||||
from paperless.db import GnuPG
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
|
||||
EXPORTER_ARCHIVE_NAME
|
||||
from ...file_handling import generate_filename, create_source_path_directory
|
||||
from ...mixins import Renderable
|
||||
|
||||
@ -79,23 +79,41 @@ class Command(Renderable, BaseCommand):
|
||||
'appear to be in the source directory.'.format(doc_file)
|
||||
)
|
||||
|
||||
if EXPORTER_ARCHIVE_NAME in record:
|
||||
archive_file = record[EXPORTER_ARCHIVE_NAME]
|
||||
if not os.path.exists(os.path.join(self.source, archive_file)):
|
||||
raise CommandError(
|
||||
f"The manifest file refers to {archive_file} which "
|
||||
f"does not appear to be in the source directory."
|
||||
)
|
||||
|
||||
def _import_files_from_manifest(self):
|
||||
|
||||
storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
|
||||
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
|
||||
os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)
|
||||
|
||||
for record in self.manifest:
|
||||
|
||||
if not record["model"] == "documents.document":
|
||||
continue
|
||||
|
||||
doc_file = record[EXPORTER_FILE_NAME]
|
||||
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
|
||||
document = Document.objects.get(pk=record["pk"])
|
||||
|
||||
doc_file = record[EXPORTER_FILE_NAME]
|
||||
document_path = os.path.join(self.source, doc_file)
|
||||
|
||||
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
|
||||
thumbnail_path = os.path.join(self.source, thumb_file)
|
||||
|
||||
document.storage_type = storage_type
|
||||
if EXPORTER_ARCHIVE_NAME in record:
|
||||
archive_file = record[EXPORTER_ARCHIVE_NAME]
|
||||
archive_path = os.path.join(self.source, archive_file)
|
||||
else:
|
||||
archive_path = None
|
||||
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
document.filename = generate_filename(document)
|
||||
|
||||
if os.path.isfile(document.source_path):
|
||||
@ -106,5 +124,7 @@ class Command(Renderable, BaseCommand):
|
||||
print(f"Moving {document_path} to {document.source_path}")
|
||||
shutil.copy(document_path, document.source_path)
|
||||
shutil.copy(thumbnail_path, document.thumbnail_path)
|
||||
if archive_path:
|
||||
shutil.copy(archive_path, document.archive_path)
|
||||
|
||||
document.save()
|
||||
|
23
src/documents/migrations/1005_checksums.py
Normal file
23
src/documents/migrations/1005_checksums.py
Normal file
@ -0,0 +1,23 @@
|
||||
# Generated by Django 3.1.3 on 2020-11-29 00:48
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '1004_sanity_check_schedule'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='document',
|
||||
name='archive_checksum',
|
||||
field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='document',
|
||||
name='checksum',
|
||||
field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
|
||||
),
|
||||
]
|
@ -1,7 +1,6 @@
|
||||
# coding=utf-8
|
||||
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
@ -12,6 +11,9 @@ from django.db import models
|
||||
from django.utils import timezone
|
||||
from django.utils.text import slugify
|
||||
|
||||
from documents.file_handling import archive_name_from_filename
|
||||
from documents.parsers import get_default_file_extension
|
||||
|
||||
|
||||
class MatchingModel(models.Model):
|
||||
|
||||
@ -157,9 +159,15 @@ class Document(models.Model):
|
||||
max_length=32,
|
||||
editable=False,
|
||||
unique=True,
|
||||
help_text="The checksum of the original document (before it was "
|
||||
"encrypted). We use this to prevent duplicate document "
|
||||
"imports."
|
||||
help_text="The checksum of the original document."
|
||||
)
|
||||
|
||||
archive_checksum = models.CharField(
|
||||
max_length=32,
|
||||
editable=False,
|
||||
blank=True,
|
||||
null=True,
|
||||
help_text="The checksum of the archived document."
|
||||
)
|
||||
|
||||
created = models.DateTimeField(
|
||||
@ -198,7 +206,7 @@ class Document(models.Model):
|
||||
ordering = ("correspondent", "title")
|
||||
|
||||
def __str__(self):
|
||||
created = self.created.strftime("%Y%m%d%H%M%S")
|
||||
created = self.created.strftime("%Y%m%d")
|
||||
if self.correspondent and self.title:
|
||||
return "{}: {} - {}".format(
|
||||
created, self.correspondent, self.title)
|
||||
@ -224,14 +232,33 @@ class Document(models.Model):
|
||||
def source_file(self):
|
||||
return open(self.source_path, "rb")
|
||||
|
||||
@property
|
||||
def archive_path(self):
|
||||
if self.filename:
|
||||
fname = archive_name_from_filename(self.filename)
|
||||
else:
|
||||
fname = "{:07}.pdf".format(self.pk)
|
||||
|
||||
return os.path.join(
|
||||
settings.ARCHIVE_DIR,
|
||||
fname
|
||||
)
|
||||
|
||||
@property
|
||||
def archive_file(self):
|
||||
return open(self.archive_path, "rb")
|
||||
|
||||
@property
|
||||
def file_name(self):
|
||||
return slugify(str(self)) + self.file_type
|
||||
|
||||
@property
|
||||
def archive_file_name(self):
|
||||
return slugify(str(self)) + ".pdf"
|
||||
|
||||
@property
|
||||
def file_type(self):
|
||||
# TODO: this is not stable across python versions
|
||||
return mimetypes.guess_extension(str(self.mime_type))
|
||||
return get_default_file_extension(self.mime_type)
|
||||
|
||||
@property
|
||||
def thumbnail_path(self):
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
@ -42,6 +43,40 @@ def is_mime_type_supported(mime_type):
|
||||
return get_parser_class_for_mime_type(mime_type) is not None
|
||||
|
||||
|
||||
def get_default_file_extension(mime_type):
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
if mime_type in supported_mime_types:
|
||||
return supported_mime_types[mime_type]
|
||||
|
||||
ext = mimetypes.guess_extension(mime_type)
|
||||
if ext:
|
||||
return ext
|
||||
else:
|
||||
return ""
|
||||
|
||||
|
||||
def is_file_ext_supported(ext):
|
||||
if ext:
|
||||
return ext.lower() in get_supported_file_extensions()
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def get_supported_file_extensions():
|
||||
extensions = set()
|
||||
for response in document_consumer_declaration.send(None):
|
||||
parser_declaration = response[1]
|
||||
supported_mime_types = parser_declaration["mime_types"]
|
||||
|
||||
for mime_type in supported_mime_types:
|
||||
extensions.update(mimetypes.guess_all_extensions(mime_type))
|
||||
|
||||
return extensions
|
||||
|
||||
|
||||
def get_parser_class_for_mime_type(mime_type):
|
||||
|
||||
options = []
|
||||
@ -107,21 +142,59 @@ def run_convert(input_file,
|
||||
raise ParseError("Convert failed at {}".format(args))
|
||||
|
||||
|
||||
def run_unpaper(pnm, logging_group=None):
|
||||
pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
|
||||
def parse_date(filename, text):
|
||||
"""
|
||||
Returns the date of the document.
|
||||
"""
|
||||
|
||||
command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
|
||||
pnm_out)
|
||||
def __parser(ds, date_order):
|
||||
"""
|
||||
Call dateparser.parse with a particular date ordering
|
||||
"""
|
||||
return dateparser.parse(
|
||||
ds,
|
||||
settings={
|
||||
"DATE_ORDER": date_order,
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"RETURN_AS_TIMEZONE_AWARE":
|
||||
True
|
||||
}
|
||||
)
|
||||
|
||||
logger.debug(f"Execute: {' '.join(command_args)}",
|
||||
extra={'group': logging_group})
|
||||
date = None
|
||||
|
||||
if not subprocess.Popen(command_args,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL).wait() == 0:
|
||||
raise ParseError(f"Unpaper failed at {command_args}")
|
||||
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
|
||||
|
||||
return pnm_out
|
||||
# if filename date parsing is enabled, search there first:
|
||||
if settings.FILENAME_DATE_ORDER:
|
||||
for m in re.finditer(DATE_REGEX, filename):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
|
||||
except (TypeError, ValueError):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
return date
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, settings.DATE_ORDER)
|
||||
except (TypeError, ValueError):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
break
|
||||
else:
|
||||
date = None
|
||||
|
||||
return date
|
||||
|
||||
|
||||
class ParseError(Exception):
|
||||
@ -134,26 +207,35 @@ class DocumentParser(LoggingMixin):
|
||||
`paperless_tesseract.parsers` for inspiration.
|
||||
"""
|
||||
|
||||
def __init__(self, path, logging_group):
|
||||
def __init__(self, logging_group):
|
||||
super().__init__()
|
||||
self.logging_group = logging_group
|
||||
self.document_path = path
|
||||
self.tempdir = tempfile.mkdtemp(
|
||||
prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||
|
||||
def get_thumbnail(self):
|
||||
self.archive_path = None
|
||||
self.text = None
|
||||
self.date = None
|
||||
|
||||
def parse(self, document_path, mime_type):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_archive_path(self):
|
||||
return self.archive_path
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type):
|
||||
"""
|
||||
Returns the path to a file we can use as a thumbnail for this document.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def optimise_thumbnail(self, in_path):
|
||||
|
||||
def get_optimised_thumbnail(self, document_path, mime_type):
|
||||
thumbnail = self.get_thumbnail(document_path, mime_type)
|
||||
if settings.OPTIMIZE_THUMBNAILS:
|
||||
out_path = os.path.join(self.tempdir, "optipng.png")
|
||||
out_path = os.path.join(self.tempdir, "thumb_optipng.png")
|
||||
|
||||
args = (settings.OPTIPNG_BINARY,
|
||||
"-silent", "-o5", in_path, "-out", out_path)
|
||||
"-silent", "-o5", thumbnail, "-out", out_path)
|
||||
|
||||
self.log('debug', f"Execute: {' '.join(args)}")
|
||||
|
||||
@ -162,97 +244,13 @@ class DocumentParser(LoggingMixin):
|
||||
|
||||
return out_path
|
||||
else:
|
||||
return in_path
|
||||
|
||||
def get_optimised_thumbnail(self):
|
||||
return self.optimise_thumbnail(self.get_thumbnail())
|
||||
return thumbnail
|
||||
|
||||
def get_text(self):
|
||||
"""
|
||||
Returns the text from the document and only the text.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
return self.text
|
||||
|
||||
def get_date(self):
|
||||
"""
|
||||
Returns the date of the document.
|
||||
"""
|
||||
|
||||
def __parser(ds, date_order):
|
||||
"""
|
||||
Call dateparser.parse with a particular date ordering
|
||||
"""
|
||||
return dateparser.parse(
|
||||
ds,
|
||||
settings={
|
||||
"DATE_ORDER": date_order,
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"RETURN_AS_TIMEZONE_AWARE":
|
||||
True
|
||||
}
|
||||
)
|
||||
|
||||
date = None
|
||||
date_string = None
|
||||
|
||||
next_year = timezone.now().year + 5 # Arbitrary 5 year future limit
|
||||
title = os.path.basename(self.document_path)
|
||||
|
||||
# if filename date parsing is enabled, search there first:
|
||||
if settings.FILENAME_DATE_ORDER:
|
||||
self.log("info", "Checking document title for date")
|
||||
for m in re.finditer(DATE_REGEX, title):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
|
||||
except (TypeError, ValueError):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
self.log(
|
||||
"info",
|
||||
"Detected document date {} based on string {} "
|
||||
"from document title"
|
||||
"".format(date.isoformat(), date_string)
|
||||
)
|
||||
return date
|
||||
|
||||
try:
|
||||
# getting text after checking filename will save time if only
|
||||
# looking at the filename instead of the whole text
|
||||
text = self.get_text()
|
||||
except ParseError:
|
||||
return None
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
for m in re.finditer(DATE_REGEX, text):
|
||||
date_string = m.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, settings.DATE_ORDER)
|
||||
except (TypeError, ValueError):
|
||||
# Skip all matches that do not parse to a proper date
|
||||
continue
|
||||
|
||||
if date is not None and next_year > date.year > 1900:
|
||||
break
|
||||
else:
|
||||
date = None
|
||||
|
||||
if date is not None:
|
||||
self.log(
|
||||
"info",
|
||||
"Detected document date {} based on string {}".format(
|
||||
date.isoformat(),
|
||||
date_string
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.log("info", "Unable to detect date for document")
|
||||
|
||||
return date
|
||||
return self.date
|
||||
|
||||
def cleanup(self):
|
||||
self.log("debug", "Deleting directory {}".format(self.tempdir))
|
||||
|
@ -47,7 +47,7 @@ def check_sanity():
|
||||
present_files.append(os.path.normpath(os.path.join(root, f)))
|
||||
|
||||
for doc in Document.objects.all():
|
||||
# Check thumbnail
|
||||
# Check sanity of the thumbnail
|
||||
if not os.path.isfile(doc.thumbnail_path):
|
||||
messages.append(SanityError(
|
||||
f"Thumbnail of document {doc.pk} does not exist."))
|
||||
@ -61,26 +61,49 @@ def check_sanity():
|
||||
f"Cannot read thumbnail file of document {doc.pk}: {e}"
|
||||
))
|
||||
|
||||
# Check document
|
||||
# Check sanity of the original file
|
||||
# TODO: extract method
|
||||
if not os.path.isfile(doc.source_path):
|
||||
messages.append(SanityError(
|
||||
f"Original of document {doc.pk} does not exist."))
|
||||
else:
|
||||
present_files.remove(os.path.normpath(doc.source_path))
|
||||
checksum = None
|
||||
try:
|
||||
with doc.source_file as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.append(SanityError(
|
||||
f"Cannot read original file of document {doc.pk}: {e}"))
|
||||
else:
|
||||
if not checksum == doc.checksum:
|
||||
messages.append(SanityError(
|
||||
f"Checksum mismatch of document {doc.pk}. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}."
|
||||
))
|
||||
|
||||
if checksum and not checksum == doc.checksum:
|
||||
# Check sanity of the archive file.
|
||||
if doc.archive_checksum:
|
||||
if not os.path.isfile(doc.archive_path):
|
||||
messages.append(SanityError(
|
||||
f"Checksum mismatch of document {doc.pk}. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}."
|
||||
f"Archived version of document {doc.pk} does not exist."
|
||||
))
|
||||
else:
|
||||
present_files.remove(os.path.normpath(doc.archive_path))
|
||||
try:
|
||||
with doc.archive_file as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.append(SanityError(
|
||||
f"Cannot read archive file of document {doc.pk}: {e}"
|
||||
))
|
||||
else:
|
||||
if not checksum == doc.archive_checksum:
|
||||
messages.append(SanityError(
|
||||
f"Checksum mismatch of archive {doc.pk}. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}."
|
||||
))
|
||||
|
||||
# other document checks
|
||||
if not doc.content:
|
||||
messages.append(SanityWarning(
|
||||
f"Document {doc.pk} has no content."
|
||||
|
@ -1,6 +1,9 @@
|
||||
import magic
|
||||
from pathvalidate import validate_filename, ValidationError
|
||||
from rest_framework import serializers
|
||||
|
||||
from .models import Correspondent, Tag, Document, Log, DocumentType
|
||||
from .parsers import is_mime_type_supported
|
||||
|
||||
|
||||
class CorrespondentSerializer(serializers.HyperlinkedModelSerializer):
|
||||
@ -76,11 +79,9 @@ class DocumentTypeField(serializers.PrimaryKeyRelatedField):
|
||||
|
||||
class DocumentSerializer(serializers.ModelSerializer):
|
||||
|
||||
correspondent_id = CorrespondentField(
|
||||
allow_null=True, source='correspondent')
|
||||
tags_id = TagsField(many=True, source='tags')
|
||||
document_type_id = DocumentTypeField(
|
||||
allow_null=True, source='document_type')
|
||||
correspondent = CorrespondentField(allow_null=True)
|
||||
tags = TagsField(many=True)
|
||||
document_type = DocumentTypeField(allow_null=True)
|
||||
|
||||
class Meta:
|
||||
model = Document
|
||||
@ -88,13 +89,10 @@ class DocumentSerializer(serializers.ModelSerializer):
|
||||
fields = (
|
||||
"id",
|
||||
"correspondent",
|
||||
"correspondent_id",
|
||||
"document_type",
|
||||
"document_type_id",
|
||||
"title",
|
||||
"content",
|
||||
"tags",
|
||||
"tags_id",
|
||||
"created",
|
||||
"modified",
|
||||
"added",
|
||||
@ -113,3 +111,84 @@ class LogSerializer(serializers.ModelSerializer):
|
||||
"group",
|
||||
"level"
|
||||
)
|
||||
|
||||
|
||||
class PostDocumentSerializer(serializers.Serializer):
|
||||
|
||||
document = serializers.FileField(
|
||||
label="Document",
|
||||
write_only=True,
|
||||
)
|
||||
|
||||
title = serializers.CharField(
|
||||
label="Title",
|
||||
write_only=True,
|
||||
required=False,
|
||||
)
|
||||
|
||||
correspondent = serializers.PrimaryKeyRelatedField(
|
||||
queryset=Correspondent.objects.all(),
|
||||
label="Correspondent",
|
||||
allow_null=True,
|
||||
write_only=True,
|
||||
required=False,
|
||||
)
|
||||
|
||||
document_type = serializers.PrimaryKeyRelatedField(
|
||||
queryset=DocumentType.objects.all(),
|
||||
label="Document type",
|
||||
allow_null=True,
|
||||
write_only=True,
|
||||
required=False,
|
||||
)
|
||||
|
||||
tags = serializers.PrimaryKeyRelatedField(
|
||||
many=True,
|
||||
queryset=Tag.objects.all(),
|
||||
label="Tags",
|
||||
write_only=True,
|
||||
required=False,
|
||||
)
|
||||
|
||||
def validate(self, attrs):
|
||||
document = attrs.get('document')
|
||||
|
||||
try:
|
||||
validate_filename(document.name)
|
||||
except ValidationError:
|
||||
raise serializers.ValidationError("Invalid filename.")
|
||||
|
||||
document_data = document.file.read()
|
||||
mime_type = magic.from_buffer(document_data, mime=True)
|
||||
|
||||
if not is_mime_type_supported(mime_type):
|
||||
raise serializers.ValidationError(
|
||||
"This mime type is not supported.")
|
||||
|
||||
attrs['document_data'] = document_data
|
||||
|
||||
title = attrs.get('title')
|
||||
|
||||
if not title:
|
||||
attrs['title'] = None
|
||||
|
||||
correspondent = attrs.get('correspondent')
|
||||
if correspondent:
|
||||
attrs['correspondent_id'] = correspondent.id
|
||||
else:
|
||||
attrs['correspondent_id'] = None
|
||||
|
||||
document_type = attrs.get('document_type')
|
||||
if document_type:
|
||||
attrs['document_type_id'] = document_type.id
|
||||
else:
|
||||
attrs['document_type_id'] = None
|
||||
|
||||
tags = attrs.get('tags')
|
||||
if tags:
|
||||
tag_ids = [tag.id for tag in tags]
|
||||
attrs['tag_ids'] = tag_ids
|
||||
else:
|
||||
attrs['tag_ids'] = None
|
||||
|
||||
return attrs
|
||||
|
@ -2,3 +2,4 @@
|
||||
# for exporting/importing commands
|
||||
EXPORTER_FILE_NAME = "__exported_file_name__"
|
||||
EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
|
||||
EXPORTER_ARCHIVE_NAME = "__exported_archive_name__"
|
||||
|
@ -13,7 +13,7 @@ from rest_framework.reverse import reverse
|
||||
|
||||
from .. import index, matching
|
||||
from ..file_handling import delete_empty_directories, generate_filename, \
|
||||
create_source_path_directory
|
||||
create_source_path_directory, archive_name_from_filename
|
||||
from ..models import Document, Tag
|
||||
|
||||
|
||||
@ -169,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs):
|
||||
|
||||
@receiver(models.signals.post_delete, sender=Document)
|
||||
def cleanup_document_deletion(sender, instance, using, **kwargs):
|
||||
for f in (instance.source_path, instance.thumbnail_path):
|
||||
try:
|
||||
os.unlink(f)
|
||||
except FileNotFoundError:
|
||||
pass # The file's already gone, so we're cool with it.
|
||||
for f in (instance.source_path,
|
||||
instance.archive_path,
|
||||
instance.thumbnail_path):
|
||||
if os.path.isfile(f):
|
||||
try:
|
||||
os.unlink(f)
|
||||
logging.getLogger(__name__).debug(
|
||||
f"Deleted file {f}.")
|
||||
except OSError as e:
|
||||
logging.getLogger(__name__).warning(
|
||||
f"While deleting document {instance.file_name}, the file "
|
||||
f"{f} could not be deleted: {e}"
|
||||
)
|
||||
|
||||
delete_empty_directories(os.path.dirname(instance.source_path))
|
||||
delete_empty_directories(
|
||||
os.path.dirname(instance.source_path),
|
||||
root=settings.ORIGINALS_DIR
|
||||
)
|
||||
|
||||
delete_empty_directories(
|
||||
os.path.dirname(instance.archive_path),
|
||||
root=settings.ARCHIVE_DIR
|
||||
)
|
||||
|
||||
|
||||
def validate_move(instance, old_path, new_path):
|
||||
if not os.path.isfile(old_path):
|
||||
# Can't do anything if the old file does not exist anymore.
|
||||
logging.getLogger(__name__).fatal(
|
||||
f"Document {str(instance)}: File {old_path} has gone.")
|
||||
return False
|
||||
|
||||
if os.path.isfile(new_path):
|
||||
# Can't do anything if the new file already exists. Skip updating file.
|
||||
logging.getLogger(__name__).warning(
|
||||
f"Document {str(instance)}: Cannot rename file "
|
||||
f"since target path {new_path} already exists.")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
|
||||
@ -183,55 +216,91 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
|
||||
def update_filename_and_move_files(sender, instance, **kwargs):
|
||||
|
||||
if not instance.filename:
|
||||
# Can't update the filename if there is not filename to begin with
|
||||
# This happens after the consumer creates a new document.
|
||||
# The PK needs to be set first by saving the document once. When this
|
||||
# happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
|
||||
# renamed anyway. In all other cases, instance.filename will be set.
|
||||
# Can't update the filename if there is no filename to begin with
|
||||
# This happens when the consumer creates a new document.
|
||||
# The document is modified and saved multiple times, and only after
|
||||
# everything is done (i.e., the generated filename is final),
|
||||
# filename will be set to the location where the consumer has put
|
||||
# the file.
|
||||
#
|
||||
# This will in turn cause this logic to move the file where it belongs.
|
||||
return
|
||||
|
||||
old_filename = instance.filename
|
||||
old_path = instance.source_path
|
||||
new_filename = generate_filename(instance)
|
||||
|
||||
if new_filename == instance.filename:
|
||||
# Don't do anything if its the same.
|
||||
return
|
||||
|
||||
new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
|
||||
old_source_path = instance.source_path
|
||||
new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
|
||||
|
||||
if not os.path.isfile(old_path):
|
||||
# Can't do anything if the old file does not exist anymore.
|
||||
logging.getLogger(__name__).fatal(
|
||||
f"Document {str(instance)}: File {old_path} has gone.")
|
||||
if not validate_move(instance, old_source_path, new_source_path):
|
||||
return
|
||||
|
||||
if os.path.isfile(new_path):
|
||||
# Can't do anything if the new file already exists. Skip updating file.
|
||||
logging.getLogger(__name__).warning(
|
||||
f"Document {str(instance)}: Cannot rename file "
|
||||
f"since target path {new_path} already exists.")
|
||||
return
|
||||
# archive files are optional, archive checksum tells us if we have one,
|
||||
# since this is None for documents without archived files.
|
||||
if instance.archive_checksum:
|
||||
new_archive_filename = archive_name_from_filename(new_filename)
|
||||
old_archive_path = instance.archive_path
|
||||
new_archive_path = os.path.join(settings.ARCHIVE_DIR,
|
||||
new_archive_filename)
|
||||
|
||||
create_source_path_directory(new_path)
|
||||
if not validate_move(instance, old_archive_path, new_archive_path):
|
||||
return
|
||||
|
||||
create_source_path_directory(new_archive_path)
|
||||
else:
|
||||
old_archive_path = None
|
||||
new_archive_path = None
|
||||
|
||||
create_source_path_directory(new_source_path)
|
||||
|
||||
try:
|
||||
os.rename(old_path, new_path)
|
||||
os.rename(old_source_path, new_source_path)
|
||||
if instance.archive_checksum:
|
||||
os.rename(old_archive_path, new_archive_path)
|
||||
instance.filename = new_filename
|
||||
# Don't save here to prevent infinite recursion.
|
||||
Document.objects.filter(pk=instance.pk).update(filename=new_filename)
|
||||
|
||||
logging.getLogger(__name__).debug(
|
||||
f"Moved file {old_path} to {new_path}.")
|
||||
f"Moved file {old_source_path} to {new_source_path}.")
|
||||
|
||||
if instance.archive_checksum:
|
||||
logging.getLogger(__name__).debug(
|
||||
f"Moved file {old_archive_path} to {new_archive_path}.")
|
||||
|
||||
except OSError as e:
|
||||
instance.filename = old_filename
|
||||
# this happens when we can't move a file. If that's the case for the
|
||||
# archive file, we try our best to revert the changes.
|
||||
try:
|
||||
os.rename(new_source_path, old_source_path)
|
||||
os.rename(new_archive_path, old_archive_path)
|
||||
except Exception as e:
|
||||
# This is fine, since:
|
||||
# A: if we managed to move source from A to B, we will also manage
|
||||
# to move it from B to A. If not, we have a serious issue
|
||||
# that's going to get caught by the santiy checker.
|
||||
# all files remain in place and will never be overwritten,
|
||||
# so this is not the end of the world.
|
||||
# B: if moving the orignal file failed, nothing has changed anyway.
|
||||
pass
|
||||
except DatabaseError as e:
|
||||
os.rename(new_path, old_path)
|
||||
os.rename(new_source_path, old_source_path)
|
||||
if instance.archive_checksum:
|
||||
os.rename(new_archive_path, old_archive_path)
|
||||
instance.filename = old_filename
|
||||
|
||||
if not os.path.isfile(old_path):
|
||||
delete_empty_directories(os.path.dirname(old_path))
|
||||
if not os.path.isfile(old_source_path):
|
||||
delete_empty_directories(os.path.dirname(old_source_path),
|
||||
root=settings.ORIGINALS_DIR)
|
||||
|
||||
if old_archive_path and not os.path.isfile(old_archive_path):
|
||||
delete_empty_directories(os.path.dirname(old_archive_path),
|
||||
root=settings.ARCHIVE_DIR)
|
||||
|
||||
|
||||
def set_log_entry(sender, document=None, logging_group=None, **kwargs):
|
||||
|
@ -13,8 +13,8 @@ from documents.sanity_checker import SanityFailedError
|
||||
|
||||
def index_optimize():
|
||||
ix = index.open_index()
|
||||
with AsyncWriter(ix) as writer:
|
||||
writer.commit(optimize=True)
|
||||
writer = AsyncWriter(ix)
|
||||
writer.commit(optimize=True)
|
||||
|
||||
|
||||
def index_reindex():
|
||||
|
Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB |
BIN
src/documents/tests/samples/documents/archive/0000001.pdf
Normal file
BIN
src/documents/tests/samples/documents/archive/0000001.pdf
Normal file
Binary file not shown.
@ -12,10 +12,10 @@ from documents.models import Document, Correspondent, DocumentType, Tag
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
class TestDocumentApi(DirectoriesMixin, APITestCase):
|
||||
|
||||
def setUp(self):
|
||||
super(DocumentApiTest, self).setUp()
|
||||
super(TestDocumentApi, self).setUp()
|
||||
|
||||
user = User.objects.create_superuser(username="temp_admin")
|
||||
self.client.force_login(user=user)
|
||||
@ -41,20 +41,13 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
returned_doc = response.data['results'][0]
|
||||
self.assertEqual(returned_doc['id'], doc.id)
|
||||
self.assertEqual(returned_doc['title'], doc.title)
|
||||
self.assertEqual(returned_doc['correspondent']['name'], c.name)
|
||||
self.assertEqual(returned_doc['document_type']['name'], dt.name)
|
||||
self.assertEqual(returned_doc['correspondent']['id'], c.id)
|
||||
self.assertEqual(returned_doc['document_type']['id'], dt.id)
|
||||
self.assertEqual(returned_doc['correspondent']['id'], returned_doc['correspondent_id'])
|
||||
self.assertEqual(returned_doc['document_type']['id'], returned_doc['document_type_id'])
|
||||
self.assertEqual(len(returned_doc['tags']), 1)
|
||||
self.assertEqual(returned_doc['tags'][0]['name'], tag.name)
|
||||
self.assertEqual(returned_doc['tags'][0]['id'], tag.id)
|
||||
self.assertListEqual(returned_doc['tags_id'], [tag.id])
|
||||
self.assertEqual(returned_doc['correspondent'], c.id)
|
||||
self.assertEqual(returned_doc['document_type'], dt.id)
|
||||
self.assertListEqual(returned_doc['tags'], [tag.id])
|
||||
|
||||
c2 = Correspondent.objects.create(name="c2")
|
||||
|
||||
returned_doc['correspondent_id'] = c2.pk
|
||||
returned_doc['correspondent'] = c2.pk
|
||||
returned_doc['title'] = "the new title"
|
||||
|
||||
response = self.client.put('/api/documents/{}/'.format(doc.pk), returned_doc, format='json')
|
||||
@ -100,6 +93,44 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.content, content_thumbnail)
|
||||
|
||||
def test_download_with_archive(self):
|
||||
|
||||
_, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)
|
||||
|
||||
content = b"This is a test"
|
||||
content_archive = b"This is the same test but archived"
|
||||
|
||||
with open(filename, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
filename = os.path.basename(filename)
|
||||
|
||||
doc = Document.objects.create(title="none", filename=filename,
|
||||
mime_type="application/pdf")
|
||||
|
||||
with open(doc.archive_path, "wb") as f:
|
||||
f.write(content_archive)
|
||||
|
||||
response = self.client.get('/api/documents/{}/download/'.format(doc.pk))
|
||||
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.content, content_archive)
|
||||
|
||||
response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk))
|
||||
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.content, content)
|
||||
|
||||
response = self.client.get('/api/documents/{}/preview/'.format(doc.pk))
|
||||
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.content, content_archive)
|
||||
|
||||
response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk))
|
||||
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(response.content, content)
|
||||
|
||||
def test_document_actions_not_existing_file(self):
|
||||
|
||||
doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
|
||||
@ -289,6 +320,22 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 200)
|
||||
self.assertEqual(len(response.data), 10)
|
||||
|
||||
def test_search_spelling_correction(self):
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
for i in range(55):
|
||||
doc = Document.objects.create(checksum=str(i), pk=i+1, title=f"Document {i+1}", content=f"Things document {i+1}")
|
||||
index.update_document(writer, doc)
|
||||
|
||||
response = self.client.get("/api/search/?query=thing")
|
||||
correction = response.data['corrected_query']
|
||||
|
||||
self.assertEqual(correction, "things")
|
||||
|
||||
response = self.client.get("/api/search/?query=things")
|
||||
correction = response.data['corrected_query']
|
||||
|
||||
self.assertEqual(correction, None)
|
||||
|
||||
def test_statistics(self):
|
||||
|
||||
doc1 = Document.objects.create(title="none1", checksum="A")
|
||||
@ -304,7 +351,7 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.data['documents_total'], 3)
|
||||
self.assertEqual(response.data['documents_inbox'], 1)
|
||||
|
||||
@mock.patch("documents.forms.async_task")
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload(self, m):
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
@ -316,8 +363,12 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
|
||||
args, kwargs = m.call_args
|
||||
self.assertEqual(kwargs['override_filename'], "simple.pdf")
|
||||
self.assertIsNone(kwargs['override_title'])
|
||||
self.assertIsNone(kwargs['override_correspondent_id'])
|
||||
self.assertIsNone(kwargs['override_document_type_id'])
|
||||
self.assertIsNone(kwargs['override_tag_ids'])
|
||||
|
||||
@mock.patch("documents.forms.async_task")
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_invalid_form(self, m):
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
@ -325,7 +376,7 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 400)
|
||||
m.assert_not_called()
|
||||
|
||||
@mock.patch("documents.forms.async_task")
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_invalid_file(self, m):
|
||||
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.zip"), "rb") as f:
|
||||
@ -333,8 +384,8 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 400)
|
||||
m.assert_not_called()
|
||||
|
||||
@mock.patch("documents.forms.async_task")
|
||||
@mock.patch("documents.forms.validate_filename")
|
||||
@mock.patch("documents.views.async_task")
|
||||
@mock.patch("documents.serialisers.validate_filename")
|
||||
def test_upload_invalid_filename(self, validate_filename, async_task):
|
||||
validate_filename.side_effect = ValidationError()
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
@ -342,3 +393,85 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, 400)
|
||||
|
||||
async_task.assert_not_called()
|
||||
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_with_title(self, async_task):
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
response = self.client.post("/api/documents/post_document/", {"document": f, "title": "my custom title"})
|
||||
self.assertEqual(response.status_code, 200)
|
||||
|
||||
async_task.assert_called_once()
|
||||
|
||||
args, kwargs = async_task.call_args
|
||||
|
||||
self.assertEqual(kwargs['override_title'], "my custom title")
|
||||
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_with_correspondent(self, async_task):
|
||||
c = Correspondent.objects.create(name="test-corres")
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": c.id})
|
||||
self.assertEqual(response.status_code, 200)
|
||||
|
||||
async_task.assert_called_once()
|
||||
|
||||
args, kwargs = async_task.call_args
|
||||
|
||||
self.assertEqual(kwargs['override_correspondent_id'], c.id)
|
||||
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_with_invalid_correspondent(self, async_task):
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": 3456})
|
||||
self.assertEqual(response.status_code, 400)
|
||||
|
||||
async_task.assert_not_called()
|
||||
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_with_document_type(self, async_task):
|
||||
dt = DocumentType.objects.create(name="invoice")
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": dt.id})
|
||||
self.assertEqual(response.status_code, 200)
|
||||
|
||||
async_task.assert_called_once()
|
||||
|
||||
args, kwargs = async_task.call_args
|
||||
|
||||
self.assertEqual(kwargs['override_document_type_id'], dt.id)
|
||||
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_with_invalid_document_type(self, async_task):
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": 34578})
|
||||
self.assertEqual(response.status_code, 400)
|
||||
|
||||
async_task.assert_not_called()
|
||||
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_with_tags(self, async_task):
|
||||
t1 = Tag.objects.create(name="tag1")
|
||||
t2 = Tag.objects.create(name="tag2")
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
response = self.client.post(
|
||||
"/api/documents/post_document/",
|
||||
{"document": f, "tags": [t2.id, t1.id]})
|
||||
self.assertEqual(response.status_code, 200)
|
||||
|
||||
async_task.assert_called_once()
|
||||
|
||||
args, kwargs = async_task.call_args
|
||||
|
||||
self.assertCountEqual(kwargs['override_tag_ids'], [t1.id, t2.id])
|
||||
|
||||
@mock.patch("documents.views.async_task")
|
||||
def test_upload_with_invalid_tags(self, async_task):
|
||||
t1 = Tag.objects.create(name="tag1")
|
||||
t2 = Tag.objects.create(name="tag2")
|
||||
with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f:
|
||||
response = self.client.post(
|
||||
"/api/documents/post_document/",
|
||||
{"document": f, "tags": [t2.id, t1.id, 734563]})
|
||||
self.assertEqual(response.status_code, 400)
|
||||
|
||||
async_task.assert_not_called()
|
||||
|
@ -1,5 +1,6 @@
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock
|
||||
@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase):
|
||||
|
||||
class DummyParser(DocumentParser):
|
||||
|
||||
def get_thumbnail(self):
|
||||
def get_thumbnail(self, document_path, mime_type):
|
||||
# not important during tests
|
||||
raise NotImplementedError()
|
||||
|
||||
def __init__(self, path, logging_group, scratch_dir):
|
||||
super(DummyParser, self).__init__(path, logging_group)
|
||||
def __init__(self, logging_group, scratch_dir, archive_path):
|
||||
super(DummyParser, self).__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
|
||||
self.archive_path = archive_path
|
||||
|
||||
def get_optimised_thumbnail(self):
|
||||
def get_optimised_thumbnail(self, document_path, mime_type):
|
||||
return self.fake_thumb
|
||||
|
||||
def get_text(self):
|
||||
return "The Text"
|
||||
def parse(self, document_path, mime_type):
|
||||
self.text = "The Text"
|
||||
|
||||
|
||||
class FaultyParser(DocumentParser):
|
||||
|
||||
def get_thumbnail(self):
|
||||
def get_thumbnail(self, document_path, mime_type):
|
||||
# not important during tests
|
||||
raise NotImplementedError()
|
||||
|
||||
def __init__(self, path, logging_group, scratch_dir):
|
||||
super(FaultyParser, self).__init__(path, logging_group)
|
||||
def __init__(self, logging_group, scratch_dir):
|
||||
super(FaultyParser, self).__init__(logging_group)
|
||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
|
||||
|
||||
def get_optimised_thumbnail(self):
|
||||
def get_optimised_thumbnail(self, document_path, mime_type):
|
||||
return self.fake_thumb
|
||||
|
||||
def get_text(self):
|
||||
def parse(self, document_path, mime_type):
|
||||
raise ParseError("Does not compute.")
|
||||
|
||||
|
||||
@ -410,11 +412,11 @@ def fake_magic_from_file(file, mime=False):
|
||||
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
|
||||
class TestConsumer(DirectoriesMixin, TestCase):
|
||||
|
||||
def make_dummy_parser(self, path, logging_group):
|
||||
return DummyParser(path, logging_group, self.dirs.scratch_dir)
|
||||
def make_dummy_parser(self, logging_group):
|
||||
return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())
|
||||
|
||||
def make_faulty_parser(self, path, logging_group):
|
||||
return FaultyParser(path, logging_group, self.dirs.scratch_dir)
|
||||
def make_faulty_parser(self, logging_group):
|
||||
return FaultyParser(logging_group, self.dirs.scratch_dir)
|
||||
|
||||
def setUp(self):
|
||||
super(TestConsumer, self).setUp()
|
||||
@ -423,7 +425,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
m = patcher.start()
|
||||
m.return_value = [(None, {
|
||||
"parser": self.make_dummy_parser,
|
||||
"mime_types": ["application/pdf"],
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0
|
||||
})]
|
||||
|
||||
@ -432,9 +434,18 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
self.consumer = Consumer()
|
||||
|
||||
def get_test_file(self):
|
||||
fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir)
|
||||
return f
|
||||
src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf")
|
||||
dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
|
||||
shutil.copy(src, dst)
|
||||
return dst
|
||||
|
||||
def get_test_archive_file(self):
|
||||
src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf")
|
||||
dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf")
|
||||
shutil.copy(src, dst)
|
||||
return dst
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
|
||||
def testNormalOperation(self):
|
||||
|
||||
filename = self.get_test_file()
|
||||
@ -454,6 +465,13 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
document.thumbnail_path
|
||||
))
|
||||
|
||||
self.assertTrue(os.path.isfile(
|
||||
document.archive_path
|
||||
))
|
||||
|
||||
self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
|
||||
self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
|
||||
|
||||
self.assertFalse(os.path.isfile(filename))
|
||||
|
||||
def testOverrideFilename(self):
|
||||
@ -501,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
|
||||
self.fail("Should throw exception")
|
||||
|
||||
def testDuplicates(self):
|
||||
def testDuplicates1(self):
|
||||
self.consumer.try_consume_file(self.get_test_file())
|
||||
|
||||
try:
|
||||
@ -512,6 +530,21 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
|
||||
self.fail("Should throw exception")
|
||||
|
||||
def testDuplicates2(self):
|
||||
self.consumer.try_consume_file(self.get_test_file())
|
||||
|
||||
try:
|
||||
self.consumer.try_consume_file(self.get_test_archive_file())
|
||||
except ConsumerError as e:
|
||||
self.assertTrue(str(e).endswith("It is a duplicate."))
|
||||
return
|
||||
|
||||
self.fail("Should throw exception")
|
||||
|
||||
def testDuplicates3(self):
|
||||
self.consumer.try_consume_file(self.get_test_archive_file())
|
||||
self.consumer.try_consume_file(self.get_test_file())
|
||||
|
||||
@mock.patch("documents.parsers.document_consumer_declaration.send")
|
||||
def testNoParsers(self, m):
|
||||
m.return_value = []
|
||||
@ -519,7 +552,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
try:
|
||||
self.consumer.try_consume_file(self.get_test_file())
|
||||
except ConsumerError as e:
|
||||
self.assertTrue(str(e).startswith("No parsers abvailable"))
|
||||
self.assertTrue("No parsers abvailable for" in str(e))
|
||||
return
|
||||
|
||||
self.fail("Should throw exception")
|
||||
@ -528,7 +561,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
def testFaultyParser(self, m):
|
||||
m.return_value = [(None, {
|
||||
"parser": self.make_faulty_parser,
|
||||
"mime_types": ["application/pdf"],
|
||||
"mime_types": {"application/pdf": ".pdf"},
|
||||
"weight": 0
|
||||
})]
|
||||
|
||||
|
140
src/documents/tests/test_date_parsing.py
Normal file
140
src/documents/tests/test_date_parsing.py
Normal file
@ -0,0 +1,140 @@
|
||||
import datetime
|
||||
import os
|
||||
import shutil
|
||||
from unittest import mock
|
||||
from uuid import uuid4
|
||||
|
||||
from dateutil import tz
|
||||
from django.conf import settings
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from documents.parsers import parse_date
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestDate(TestCase):
|
||||
|
||||
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples")
|
||||
SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
|
||||
|
||||
def setUp(self):
|
||||
os.makedirs(self.SCRATCH, exist_ok=True)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.SCRATCH)
|
||||
|
||||
def test_date_format_1(self):
|
||||
text = "lorem ipsum 130218 lorem ipsum"
|
||||
self.assertEqual(parse_date("", text), None)
|
||||
|
||||
def test_date_format_2(self):
|
||||
text = "lorem ipsum 2018 lorem ipsum"
|
||||
self.assertEqual(parse_date("", text), None)
|
||||
|
||||
def test_date_format_3(self):
|
||||
text = "lorem ipsum 20180213 lorem ipsum"
|
||||
self.assertEqual(parse_date("", text), None)
|
||||
|
||||
def test_date_format_4(self):
|
||||
text = "lorem ipsum 13.02.2018 lorem ipsum"
|
||||
date = parse_date("", text)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
def test_date_format_5(self):
|
||||
text = (
|
||||
"lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
|
||||
"ipsum"
|
||||
)
|
||||
date = parse_date("", text)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2018, 2, 13, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
def test_date_format_6(self):
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
self.assertEqual(parse_date("", text), None)
|
||||
|
||||
def test_date_format_7(self):
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"März 2019\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
date = parse_date("", text)
|
||||
self.assertEqual(
|
||||
date,
|
||||
datetime.datetime(
|
||||
2019, 3, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
def test_date_format_8(self):
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum\n"
|
||||
"März 2020"
|
||||
)
|
||||
self.assertEqual(
|
||||
parse_date("", text),
|
||||
datetime.datetime(
|
||||
2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
@override_settings(SCRATCH_DIR=SCRATCH)
|
||||
def test_date_format_9(self):
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"27. Nullmonth 2020\n"
|
||||
"März 2020\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
self.assertEqual(
|
||||
parse_date("", text),
|
||||
datetime.datetime(
|
||||
2020, 3, 1, 0, 0,
|
||||
tzinfo=tz.gettz(settings.TIME_ZONE)
|
||||
)
|
||||
)
|
||||
|
||||
def test_crazy_date_past(self, *args):
|
||||
self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
|
||||
|
||||
def test_crazy_date_future(self, *args):
|
||||
self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))
|
||||
|
||||
def test_crazy_date_with_spaces(self, *args):
|
||||
self.assertIsNone(parse_date("", "20 408000l 2475"))
|
||||
|
||||
@override_settings(FILENAME_DATE_ORDER="YMD")
|
||||
def test_filename_date_parse_invalid(self, *args):
|
||||
self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
|
@ -1,12 +1,29 @@
|
||||
import shutil
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from ..models import Document, Correspondent
|
||||
|
||||
|
||||
class TestDocument(TestCase):
|
||||
|
||||
def setUp(self) -> None:
|
||||
self.originals_dir = tempfile.mkdtemp()
|
||||
self.thumb_dir = tempfile.mkdtemp()
|
||||
|
||||
override_settings(
|
||||
ORIGINALS_DIR=self.originals_dir,
|
||||
THUMBNAIL_DIR=self.thumb_dir,
|
||||
).enable()
|
||||
|
||||
def tearDown(self) -> None:
|
||||
shutil.rmtree(self.originals_dir)
|
||||
shutil.rmtree(self.thumb_dir)
|
||||
|
||||
def test_file_deletion(self):
|
||||
document = Document.objects.create(
|
||||
correspondent=Correspondent.objects.create(name="Test0"),
|
||||
@ -19,8 +36,31 @@ class TestDocument(TestCase):
|
||||
file_path = document.source_path
|
||||
thumb_path = document.thumbnail_path
|
||||
|
||||
Path(file_path).touch()
|
||||
Path(thumb_path).touch()
|
||||
|
||||
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
|
||||
document.delete()
|
||||
mock_unlink.assert_any_call(file_path)
|
||||
mock_unlink.assert_any_call(thumb_path)
|
||||
self.assertEqual(mock_unlink.call_count, 2)
|
||||
|
||||
def test_file_name(self):
|
||||
|
||||
doc = Document(mime_type="application/pdf", title="test", created=datetime(2020, 12, 25))
|
||||
self.assertEqual(doc.file_name, "20201225-test.pdf")
|
||||
|
||||
def test_file_name_jpg(self):
|
||||
|
||||
doc = Document(mime_type="image/jpeg", title="test", created=datetime(2020, 12, 25))
|
||||
self.assertEqual(doc.file_name, "20201225-test.jpg")
|
||||
|
||||
def test_file_name_unknown(self):
|
||||
|
||||
doc = Document(mime_type="application/zip", title="test", created=datetime(2020, 12, 25))
|
||||
self.assertEqual(doc.file_name, "20201225-test.zip")
|
||||
|
||||
def test_file_name_invalid(self):
|
||||
|
||||
doc = Document(mime_type="image/jpegasd", title="test", created=datetime(2020, 12, 25))
|
||||
self.assertEqual(doc.file_name, "20201225-test")
|
||||
|
@ -2,32 +2,17 @@ import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
from uuid import uuid4
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import DatabaseError
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from .utils import DirectoriesMixin
|
||||
from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
|
||||
from ..models import Document, Correspondent
|
||||
|
||||
|
||||
class TestDate(TestCase):
|
||||
deletion_list = []
|
||||
|
||||
def add_to_deletion_list(self, dirname):
|
||||
self.deletion_list.append(dirname)
|
||||
|
||||
def setUp(self):
|
||||
folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
|
||||
os.makedirs(folder + "/documents/originals")
|
||||
override_settings(MEDIA_ROOT=folder).enable()
|
||||
override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
|
||||
self.add_to_deletion_list(folder)
|
||||
|
||||
def tearDown(self):
|
||||
for dirname in self.deletion_list:
|
||||
shutil.rmtree(dirname, ignore_errors=True)
|
||||
class TestFileHandling(DirectoriesMixin, TestCase):
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="")
|
||||
def test_generate_source_filename(self):
|
||||
@ -104,7 +89,7 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
|
||||
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
|
||||
|
||||
os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
|
||||
@ -140,7 +125,7 @@ class TestDate(TestCase):
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertTrue(os.path.isfile(document.source_path))
|
||||
self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
|
||||
self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
|
||||
self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
|
||||
@ -196,8 +181,8 @@ class TestDate(TestCase):
|
||||
document.save()
|
||||
|
||||
# Check proper handling of files
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
|
||||
self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
|
||||
self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
|
||||
self.assertTrue(os.path.isfile(important_file))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
|
||||
@ -315,13 +300,12 @@ class TestDate(TestCase):
|
||||
# Create our working directory
|
||||
tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
|
||||
os.makedirs(tmp)
|
||||
self.add_to_deletion_list(tmp)
|
||||
|
||||
os.makedirs(os.path.join(tmp, "notempty"))
|
||||
Path(os.path.join(tmp, "notempty", "file")).touch()
|
||||
os.makedirs(os.path.join(tmp, "notempty", "empty"))
|
||||
|
||||
delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
|
||||
delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR)
|
||||
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
|
||||
self.assertEqual(os.path.isfile(
|
||||
os.path.join(tmp, "notempty", "file")), True)
|
||||
@ -345,3 +329,159 @@ class TestDate(TestCase):
|
||||
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
||||
|
||||
self.assertEqual(generate_filename(document), "0000001.pdf")
|
||||
|
||||
|
||||
class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT=None)
|
||||
def test_create_no_format(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_create_with_format(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertFalse(os.path.isfile(original))
|
||||
self.assertFalse(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf"))
|
||||
self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf"))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_move_archive_gone(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
#Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertFalse(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertFalse(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_move_archive_exists(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
|
||||
Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@mock.patch("documents.signals.handlers.os.rename")
|
||||
def test_move_archive_error(self, m):
|
||||
|
||||
def fake_rename(src, dst):
|
||||
if "archive" in src:
|
||||
raise OSError()
|
||||
else:
|
||||
os.remove(src)
|
||||
Path(dst).touch()
|
||||
|
||||
m.side_effect = fake_rename
|
||||
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_move_file_gone(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
#Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertFalse(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertFalse(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
@mock.patch("documents.signals.handlers.os.rename")
|
||||
def test_move_file_error(self, m):
|
||||
|
||||
def fake_rename(src, dst):
|
||||
if "original" in src:
|
||||
raise OSError()
|
||||
else:
|
||||
os.remove(src)
|
||||
Path(dst).touch()
|
||||
|
||||
m.side_effect = fake_rename
|
||||
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
def test_archive_deleted(self):
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
||||
doc.delete()
|
||||
|
||||
self.assertFalse(os.path.isfile(original))
|
||||
self.assertFalse(os.path.isfile(archive))
|
||||
self.assertFalse(os.path.isfile(doc.source_path))
|
||||
self.assertFalse(os.path.isfile(doc.archive_path))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
|
||||
def test_database_error(self):
|
||||
|
||||
original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
|
||||
archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
|
||||
Path(original).touch()
|
||||
Path(archive).touch()
|
||||
doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
|
||||
with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
|
||||
m.side_effect = DatabaseError()
|
||||
doc.save()
|
||||
|
||||
self.assertTrue(os.path.isfile(original))
|
||||
self.assertTrue(os.path.isfile(archive))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
|
42
src/documents/tests/test_management_archiver.py
Normal file
42
src/documents/tests/test_management_archiver.py
Normal file
@ -0,0 +1,42 @@
|
||||
import filecmp
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from django.core.management import call_command
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.management.commands.document_archiver import handle_document
|
||||
from documents.models import Document
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
|
||||
|
||||
|
||||
class TestArchiver(DirectoriesMixin, TestCase):
|
||||
|
||||
def make_models(self):
|
||||
self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf")
|
||||
#self.d2 = Document.objects.create(checksum="B", title="B", content="second document")
|
||||
#self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document")
|
||||
|
||||
def test_archiver(self):
|
||||
|
||||
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
|
||||
self.make_models()
|
||||
|
||||
call_command('document_archiver')
|
||||
|
||||
def test_handle_document(self):
|
||||
|
||||
shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf"))
|
||||
self.make_models()
|
||||
|
||||
handle_document(self.d1.pk)
|
||||
|
||||
doc = Document.objects.get(id=self.d1.id)
|
||||
|
||||
self.assertIsNotNone(doc.checksum)
|
||||
self.assertTrue(os.path.isfile(doc.archive_path))
|
||||
self.assertTrue(os.path.isfile(doc.source_path))
|
||||
self.assertTrue(filecmp.cmp(sample_file, doc.source_path))
|
@ -7,8 +7,9 @@ from unittest import mock
|
||||
|
||||
from django.conf import settings
|
||||
from django.core.management import call_command, CommandError
|
||||
from django.test import override_settings, TestCase
|
||||
from django.test import override_settings, TransactionTestCase
|
||||
|
||||
from documents.models import Tag
|
||||
from documents.consumer import ConsumerError
|
||||
from documents.management.commands import document_consumer
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
@ -33,12 +34,12 @@ def chunked(size, source):
|
||||
yield source[i:i+size]
|
||||
|
||||
|
||||
class TestConsumer(DirectoriesMixin, TestCase):
|
||||
class ConsumerMixin:
|
||||
|
||||
sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
|
||||
|
||||
def setUp(self) -> None:
|
||||
super(TestConsumer, self).setUp()
|
||||
super(ConsumerMixin, self).setUp()
|
||||
self.t = None
|
||||
patcher = mock.patch("documents.management.commands.document_consumer.async_task")
|
||||
self.task_mock = patcher.start()
|
||||
@ -57,7 +58,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
# wait for the consumer to exit.
|
||||
self.t.join()
|
||||
|
||||
super(TestConsumer, self).tearDown()
|
||||
super(ConsumerMixin, self).tearDown()
|
||||
|
||||
def wait_for_task_mock_call(self):
|
||||
n = 0
|
||||
@ -68,7 +69,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
return
|
||||
n += 1
|
||||
sleep(0.1)
|
||||
self.fail("async_task was never called")
|
||||
|
||||
# A bogus async_task that will simply check the file for
|
||||
# completeness and raise an exception otherwise.
|
||||
@ -95,6 +95,9 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
sleep(0.1)
|
||||
print("file completed.")
|
||||
|
||||
|
||||
class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
|
||||
|
||||
def test_consume_file(self):
|
||||
self.t_start()
|
||||
|
||||
@ -108,9 +111,15 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
args, kwargs = self.task_mock.call_args
|
||||
self.assertEqual(args[1], f)
|
||||
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
def test_consume_file_polling(self):
|
||||
self.test_consume_file()
|
||||
def test_consume_file_invalid_ext(self):
|
||||
self.t_start()
|
||||
|
||||
f = os.path.join(self.dirs.consumption_dir, "my_file.wow")
|
||||
shutil.copy(self.sample_file, f)
|
||||
|
||||
self.wait_for_task_mock_call()
|
||||
|
||||
self.task_mock.assert_not_called()
|
||||
|
||||
def test_consume_existing_file(self):
|
||||
f = os.path.join(self.dirs.consumption_dir, "my_file.pdf")
|
||||
@ -122,10 +131,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
args, kwargs = self.task_mock.call_args
|
||||
self.assertEqual(args[1], f)
|
||||
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
def test_consume_existing_file_polling(self):
|
||||
self.test_consume_existing_file()
|
||||
|
||||
@mock.patch("documents.management.commands.document_consumer.logger.error")
|
||||
def test_slow_write_pdf(self, error_logger):
|
||||
|
||||
@ -146,10 +151,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
args, kwargs = self.task_mock.call_args
|
||||
self.assertEqual(args[1], fname)
|
||||
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
def test_slow_write_pdf_polling(self):
|
||||
self.test_slow_write_pdf()
|
||||
|
||||
@mock.patch("documents.management.commands.document_consumer.logger.error")
|
||||
def test_slow_write_and_move(self, error_logger):
|
||||
|
||||
@ -172,10 +173,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
|
||||
error_logger.assert_not_called()
|
||||
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
def test_slow_write_and_move_polling(self):
|
||||
self.test_slow_write_and_move()
|
||||
|
||||
@mock.patch("documents.management.commands.document_consumer.logger.error")
|
||||
def test_slow_write_incomplete(self, error_logger):
|
||||
|
||||
@ -195,10 +192,6 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
# assert that we have an error logged with this invalid file.
|
||||
error_logger.assert_called_once()
|
||||
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
def test_slow_write_incomplete_polling(self):
|
||||
self.test_slow_write_incomplete()
|
||||
|
||||
@override_settings(CONSUMPTION_DIR="does_not_exist")
|
||||
def test_consumption_directory_invalid(self):
|
||||
|
||||
@ -208,3 +201,62 @@ class TestConsumer(DirectoriesMixin, TestCase):
|
||||
def test_consumption_directory_unset(self):
|
||||
|
||||
self.assertRaises(CommandError, call_command, 'document_consumer', '--oneshot')
|
||||
|
||||
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
class TestConsumerPolling(TestConsumer):
|
||||
# just do all the tests with polling
|
||||
pass
|
||||
|
||||
|
||||
@override_settings(CONSUMER_RECURSIVE=True)
|
||||
class TestConsumerRecursive(TestConsumer):
|
||||
# just do all the tests with recursive
|
||||
pass
|
||||
|
||||
|
||||
@override_settings(CONSUMER_RECURSIVE=True)
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
class TestConsumerRecursivePolling(TestConsumer):
|
||||
# just do all the tests with polling and recursive
|
||||
pass
|
||||
|
||||
|
||||
class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
|
||||
|
||||
@override_settings(CONSUMER_RECURSIVE=True)
|
||||
@override_settings(CONSUMER_SUBDIRS_AS_TAGS=True)
|
||||
def test_consume_file_with_path_tags(self):
|
||||
|
||||
tag_names = ("existingTag", "Space Tag")
|
||||
# Create a Tag prior to consuming a file using it in path
|
||||
tag_ids = [Tag.objects.create(name=tag_names[0]).pk,]
|
||||
|
||||
self.t_start()
|
||||
|
||||
path = os.path.join(self.dirs.consumption_dir, *tag_names)
|
||||
os.makedirs(path, exist_ok=True)
|
||||
f = os.path.join(path, "my_file.pdf")
|
||||
# Wait at least inotify read_delay for recursive watchers
|
||||
# to be created for the new directories
|
||||
sleep(1)
|
||||
shutil.copy(self.sample_file, f)
|
||||
|
||||
self.wait_for_task_mock_call()
|
||||
|
||||
self.task_mock.assert_called_once()
|
||||
|
||||
# Add the pk of the Tag created by _consume()
|
||||
tag_ids.append(Tag.objects.get(name=tag_names[1]).pk)
|
||||
|
||||
args, kwargs = self.task_mock.call_args
|
||||
self.assertEqual(args[1], f)
|
||||
|
||||
# assertCountEqual has a bad name, but test that the first
|
||||
# sequence contains the same elements as second, regardless of
|
||||
# their order.
|
||||
self.assertCountEqual(kwargs["override_tag_ids"], tag_ids)
|
||||
|
||||
@override_settings(CONSUMER_POLLING=1)
|
||||
def test_consume_file_with_path_tags_polling(self):
|
||||
self.test_consume_file_with_path_tags()
|
||||
|
@ -17,7 +17,8 @@ class TestDecryptDocuments(TestCase):
|
||||
@override_settings(
|
||||
ORIGINALS_DIR=os.path.join(os.path.dirname(__file__), "samples", "originals"),
|
||||
THUMBNAIL_DIR=os.path.join(os.path.dirname(__file__), "samples", "thumb"),
|
||||
PASSPHRASE="test"
|
||||
PASSPHRASE="test",
|
||||
PAPERLESS_FILENAME_FORMAT=None
|
||||
)
|
||||
@mock.patch("documents.management.commands.decrypt_documents.input")
|
||||
def test_decrypt(self, m):
|
||||
|
@ -9,10 +9,11 @@ from django.test import TestCase, override_settings
|
||||
|
||||
from documents.management.commands import document_exporter
|
||||
from documents.models import Document, Tag, DocumentType, Correspondent
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.sanity_checker import check_sanity
|
||||
from documents.tests.utils import DirectoriesMixin, paperless_environment
|
||||
|
||||
|
||||
class TestExporter(DirectoriesMixin, TestCase):
|
||||
class TestExportImport(DirectoriesMixin, TestCase):
|
||||
|
||||
@override_settings(
|
||||
PASSPHRASE="test"
|
||||
@ -23,11 +24,8 @@ class TestExporter(DirectoriesMixin, TestCase):
|
||||
|
||||
file = os.path.join(self.dirs.originals_dir, "0000001.pdf")
|
||||
|
||||
with open(file, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
|
||||
Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
|
||||
Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
|
||||
Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
|
||||
Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
|
||||
Tag.objects.create(name="t")
|
||||
DocumentType.objects.create(name="dt")
|
||||
Correspondent.objects.create(name="c")
|
||||
@ -51,6 +49,23 @@ class TestExporter(DirectoriesMixin, TestCase):
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element['fields']['checksum'])
|
||||
|
||||
Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")
|
||||
if document_exporter.EXPORTER_ARCHIVE_NAME in element:
|
||||
fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
|
||||
self.assertTrue(os.path.exists(fname))
|
||||
|
||||
with open(fname, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element['fields']['archive_checksum'])
|
||||
|
||||
with paperless_environment() as dirs:
|
||||
call_command('document_importer', target)
|
||||
messages = check_sanity()
|
||||
# everything is alright after the test
|
||||
self.assertEqual(len(messages), 0, str([str(m) for m in messages]))
|
||||
|
||||
def test_export_missing_files(self):
|
||||
|
||||
target = tempfile.mkdtemp()
|
||||
call_command('document_exporter', target)
|
||||
Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")
|
||||
self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)
|
||||
|
@ -1,10 +1,15 @@
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import TestCase, override_settings
|
||||
|
||||
from documents.parsers import get_parser_class
|
||||
from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \
|
||||
get_parser_class_for_mime_type, DocumentParser, is_file_ext_supported
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_text.parsers import TextDocumentParser
|
||||
|
||||
|
||||
def fake_magic_from_file(file, mime=False):
|
||||
@ -27,7 +32,7 @@ class TestParserDiscovery(TestCase):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, {"weight": 0, "parser": DummyParser, "mime_types": ["application/pdf"]}),
|
||||
(None, {"weight": 0, "parser": DummyParser, "mime_types": {"application/pdf": ".pdf"}}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@ -45,8 +50,8 @@ class TestParserDiscovery(TestCase):
|
||||
pass
|
||||
|
||||
m.return_value = (
|
||||
(None, {"weight": 0, "parser": DummyParser1, "mime_types": ["application/pdf"]}),
|
||||
(None, {"weight": 1, "parser": DummyParser2, "mime_types": ["application/pdf"]}),
|
||||
(None, {"weight": 0, "parser": DummyParser1, "mime_types": {"application/pdf": ".pdf"}}),
|
||||
(None, {"weight": 1, "parser": DummyParser2, "mime_types": {"application/pdf": ".pdf"}}),
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@ -61,3 +66,57 @@ class TestParserDiscovery(TestCase):
|
||||
self.assertIsNone(
|
||||
get_parser_class("doc.pdf")
|
||||
)
|
||||
|
||||
|
||||
def fake_get_thumbnail(self, path, mimetype):
|
||||
return os.path.join(os.path.dirname(__file__), "examples", "no-text.png")
|
||||
|
||||
|
||||
class TestBaseParser(TestCase):
|
||||
|
||||
def setUp(self) -> None:
|
||||
|
||||
self.scratch = tempfile.mkdtemp()
|
||||
override_settings(
|
||||
SCRATCH_DIR=self.scratch
|
||||
).enable()
|
||||
|
||||
def tearDown(self) -> None:
|
||||
shutil.rmtree(self.scratch)
|
||||
|
||||
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
|
||||
@override_settings(OPTIMIZE_THUMBNAILS=True)
|
||||
def test_get_optimised_thumbnail(self):
|
||||
parser = DocumentParser(None)
|
||||
|
||||
parser.get_optimised_thumbnail("any", "not important")
|
||||
|
||||
@mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
|
||||
@override_settings(OPTIMIZE_THUMBNAILS=False)
|
||||
def test_get_optimised_thumb_disabled(self):
|
||||
parser = DocumentParser(None)
|
||||
|
||||
path = parser.get_optimised_thumbnail("any", "not important")
|
||||
self.assertEqual(path, fake_get_thumbnail(None, None, None))
|
||||
|
||||
|
||||
class TestParserAvailability(TestCase):
|
||||
|
||||
def test_file_extensions(self):
|
||||
|
||||
for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
|
||||
self.assertIn(ext, get_supported_file_extensions())
|
||||
self.assertEqual(get_default_file_extension('application/pdf'), ".pdf")
|
||||
self.assertEqual(get_default_file_extension('image/png'), ".png")
|
||||
self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg")
|
||||
self.assertEqual(get_default_file_extension('text/plain'), ".txt")
|
||||
self.assertEqual(get_default_file_extension('text/csv'), ".csv")
|
||||
self.assertEqual(get_default_file_extension('application/zip'), ".zip")
|
||||
self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")
|
||||
|
||||
self.assertEqual(get_parser_class_for_mime_type('application/pdf'), RasterisedDocumentParser)
|
||||
self.assertEqual(get_parser_class_for_mime_type('text/plain'), TextDocumentParser)
|
||||
self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)
|
||||
|
||||
self.assertTrue(is_file_ext_supported('.pdf'))
|
||||
self.assertFalse(is_file_ext_supported('.hsdfh'))
|
||||
|
@ -32,7 +32,7 @@ class PostConsumeTestCase(TestCase):
|
||||
|
||||
@mock.patch("documents.signals.handlers.Popen")
|
||||
@override_settings(POST_CONSUME_SCRIPT="script")
|
||||
def test_post_consume_script_simple(self, m):
|
||||
def test_post_consume_script_with_correspondent(self, m):
|
||||
c = Correspondent.objects.create(name="my_bank")
|
||||
doc = Document.objects.create(title="Test", mime_type="application/pdf", correspondent=c)
|
||||
tag1 = Tag.objects.create(name="a")
|
||||
@ -53,5 +53,4 @@ class PostConsumeTestCase(TestCase):
|
||||
self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/")
|
||||
self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/")
|
||||
self.assertEqual(command[7], "my_bank")
|
||||
# TODO: tags are unordered by default.
|
||||
self.assertEqual(command[8], "a,b")
|
||||
self.assertCountEqual(command[8].split(","), ["a", "b"])
|
||||
|
87
src/documents/tests/test_sanity_check.py
Normal file
87
src/documents/tests/test_sanity_check.py
Normal file
@ -0,0 +1,87 @@
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import Document
|
||||
from documents.sanity_checker import check_sanity, SanityFailedError
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
class TestSanityCheck(DirectoriesMixin, TestCase):
|
||||
|
||||
def make_test_data(self):
|
||||
|
||||
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf"), os.path.join(self.dirs.originals_dir, "0000001.pdf"))
|
||||
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf"), os.path.join(self.dirs.archive_dir, "0000001.pdf"))
|
||||
shutil.copy(os.path.join(os.path.dirname(__file__), "samples", "documents", "thumbnails", "0000001.png"), os.path.join(self.dirs.thumbnail_dir, "0000001.png"))
|
||||
|
||||
return Document.objects.create(title="test", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", content="test", pk=1, filename="0000001.pdf", mime_type="application/pdf")
|
||||
|
||||
def test_no_docs(self):
|
||||
self.assertEqual(len(check_sanity()), 0)
|
||||
|
||||
def test_success(self):
|
||||
self.make_test_data()
|
||||
self.assertEqual(len(check_sanity()), 0)
|
||||
|
||||
def test_no_thumbnail(self):
|
||||
doc = self.make_test_data()
|
||||
os.remove(doc.thumbnail_path)
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
|
||||
def test_thumbnail_no_access(self):
|
||||
doc = self.make_test_data()
|
||||
os.chmod(doc.thumbnail_path, 0o000)
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
os.chmod(doc.thumbnail_path, 0o777)
|
||||
|
||||
def test_no_original(self):
|
||||
doc = self.make_test_data()
|
||||
os.remove(doc.source_path)
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
|
||||
def test_original_no_access(self):
|
||||
doc = self.make_test_data()
|
||||
os.chmod(doc.source_path, 0o000)
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
os.chmod(doc.source_path, 0o777)
|
||||
|
||||
def test_original_checksum_mismatch(self):
|
||||
doc = self.make_test_data()
|
||||
doc.checksum = "WOW"
|
||||
doc.save()
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
|
||||
def test_no_archive(self):
|
||||
doc = self.make_test_data()
|
||||
os.remove(doc.archive_path)
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
|
||||
def test_archive_no_access(self):
|
||||
doc = self.make_test_data()
|
||||
os.chmod(doc.archive_path, 0o000)
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
os.chmod(doc.archive_path, 0o777)
|
||||
|
||||
def test_archive_checksum_mismatch(self):
|
||||
doc = self.make_test_data()
|
||||
doc.archive_checksum = "WOW"
|
||||
doc.save()
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
|
||||
def test_empty_content(self):
|
||||
doc = self.make_test_data()
|
||||
doc.content = ""
|
||||
doc.save()
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
|
||||
def test_orphaned_file(self):
|
||||
doc = self.make_test_data()
|
||||
Path(self.dirs.originals_dir, "orphaned").touch()
|
||||
self.assertEqual(len(check_sanity()), 1)
|
||||
|
||||
def test_all(self):
|
||||
Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf")
|
||||
string = str(SanityFailedError(check_sanity()))
|
24
src/documents/tests/test_tasks.py
Normal file
24
src/documents/tests/test_tasks.py
Normal file
@ -0,0 +1,24 @@
|
||||
from datetime import datetime
|
||||
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from documents import tasks
|
||||
from documents.models import Document
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
|
||||
|
||||
class TestTasks(DirectoriesMixin, TestCase):
|
||||
|
||||
def test_index_reindex(self):
|
||||
Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now())
|
||||
|
||||
tasks.index_reindex()
|
||||
|
||||
def test_index_optimize(self):
|
||||
Document.objects.create(title="test", content="my document", checksum="wow", added=timezone.now(), created=timezone.now(), modified=timezone.now())
|
||||
|
||||
tasks.index_optimize()
|
||||
|
||||
def test_train_classifier(self):
|
||||
tasks.train_classifier()
|
@ -2,6 +2,7 @@ import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from collections import namedtuple
|
||||
from contextlib import contextmanager
|
||||
|
||||
from django.test import override_settings
|
||||
|
||||
@ -17,22 +18,26 @@ def setup_directories():
|
||||
dirs.index_dir = os.path.join(dirs.data_dir, "index")
|
||||
dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
|
||||
dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
|
||||
dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")
|
||||
|
||||
os.makedirs(dirs.index_dir, exist_ok=True)
|
||||
os.makedirs(dirs.originals_dir, exist_ok=True)
|
||||
os.makedirs(dirs.thumbnail_dir, exist_ok=True)
|
||||
os.makedirs(dirs.archive_dir, exist_ok=True)
|
||||
|
||||
override_settings(
|
||||
dirs.settings_override = override_settings(
|
||||
DATA_DIR=dirs.data_dir,
|
||||
SCRATCH_DIR=dirs.scratch_dir,
|
||||
MEDIA_ROOT=dirs.media_dir,
|
||||
ORIGINALS_DIR=dirs.originals_dir,
|
||||
THUMBNAIL_DIR=dirs.thumbnail_dir,
|
||||
ARCHIVE_DIR=dirs.archive_dir,
|
||||
CONSUMPTION_DIR=dirs.consumption_dir,
|
||||
INDEX_DIR=dirs.index_dir,
|
||||
MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
|
||||
|
||||
).enable()
|
||||
)
|
||||
dirs.settings_override.enable()
|
||||
|
||||
return dirs
|
||||
|
||||
@ -42,6 +47,18 @@ def remove_dirs(dirs):
|
||||
shutil.rmtree(dirs.data_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.scratch_dir, ignore_errors=True)
|
||||
shutil.rmtree(dirs.consumption_dir, ignore_errors=True)
|
||||
dirs.settings_override.disable()
|
||||
|
||||
|
||||
@contextmanager
|
||||
def paperless_environment():
|
||||
dirs = None
|
||||
try:
|
||||
dirs = setup_directories()
|
||||
yield dirs
|
||||
finally:
|
||||
if dirs:
|
||||
remove_dirs(dirs)
|
||||
|
||||
|
||||
class DirectoriesMixin:
|
||||
|
@ -1,8 +1,16 @@
|
||||
import os
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from time import mktime
|
||||
|
||||
from django.conf import settings
|
||||
from django.db.models import Count, Max
|
||||
from django.http import HttpResponse, HttpResponseBadRequest, Http404
|
||||
from django.views.decorators.cache import cache_control
|
||||
from django.views.generic import TemplateView
|
||||
from django_filters.rest_framework import DjangoFilterBackend
|
||||
from django_q.tasks import async_task
|
||||
from rest_framework import parsers
|
||||
from rest_framework.decorators import action
|
||||
from rest_framework.filters import OrderingFilter, SearchFilter
|
||||
from rest_framework.mixins import (
|
||||
@ -31,14 +39,14 @@ from .filters import (
|
||||
DocumentTypeFilterSet,
|
||||
LogFilterSet
|
||||
)
|
||||
from .forms import UploadForm
|
||||
from .models import Correspondent, Document, Log, Tag, DocumentType
|
||||
from .serialisers import (
|
||||
CorrespondentSerializer,
|
||||
DocumentSerializer,
|
||||
LogSerializer,
|
||||
TagSerializer,
|
||||
DocumentTypeSerializer
|
||||
DocumentTypeSerializer,
|
||||
PostDocumentSerializer
|
||||
)
|
||||
|
||||
|
||||
@ -131,29 +139,32 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
index.remove_document_from_index(self.get_object())
|
||||
return super(DocumentViewSet, self).destroy(request, *args, **kwargs)
|
||||
|
||||
def file_response(self, pk, disposition):
|
||||
@staticmethod
|
||||
def original_requested(request):
|
||||
return (
|
||||
'original' in request.query_params and
|
||||
request.query_params['original'] == 'true'
|
||||
)
|
||||
|
||||
def file_response(self, pk, request, disposition):
|
||||
doc = Document.objects.get(id=pk)
|
||||
|
||||
if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
|
||||
if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
|
||||
file_handle = doc.archive_file
|
||||
filename = doc.archive_file_name
|
||||
mime_type = 'application/pdf'
|
||||
else:
|
||||
file_handle = doc.source_file
|
||||
else:
|
||||
file_handle = GnuPG.decrypted(doc.source_file)
|
||||
filename = doc.file_name
|
||||
mime_type = doc.mime_type
|
||||
|
||||
response = HttpResponse(file_handle, content_type=doc.mime_type)
|
||||
if doc.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
file_handle = GnuPG.decrypted(file_handle)
|
||||
|
||||
response = HttpResponse(file_handle, content_type=mime_type)
|
||||
response["Content-Disposition"] = '{}; filename="{}"'.format(
|
||||
disposition, doc.file_name)
|
||||
disposition, filename)
|
||||
return response
|
||||
|
||||
@action(methods=['post'], detail=False)
|
||||
def post_document(self, request, pk=None):
|
||||
# TODO: is this a good implementation?
|
||||
form = UploadForm(data=request.POST, files=request.FILES)
|
||||
if form.is_valid():
|
||||
form.save()
|
||||
return Response("OK")
|
||||
else:
|
||||
return HttpResponseBadRequest(str(form.errors))
|
||||
|
||||
@action(methods=['post'], detail=False)
|
||||
def bulk_edit(self, request, pk=None):
|
||||
try:
|
||||
@ -169,6 +180,8 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
"paperless__checksum": doc.checksum,
|
||||
"paperless__mime_type": doc.mime_type,
|
||||
"paperless__filename": doc.filename,
|
||||
"paperless__has_archive_version":
|
||||
os.path.isfile(doc.archive_path)
|
||||
})
|
||||
except Document.DoesNotExist:
|
||||
raise Http404()
|
||||
@ -176,7 +189,8 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
@action(methods=['get'], detail=True)
|
||||
def preview(self, request, pk=None):
|
||||
try:
|
||||
response = self.file_response(pk, "inline")
|
||||
response = self.file_response(
|
||||
pk, request, "inline")
|
||||
return response
|
||||
except (FileNotFoundError, Document.DoesNotExist):
|
||||
raise Http404()
|
||||
@ -193,7 +207,8 @@ class DocumentViewSet(RetrieveModelMixin,
|
||||
@action(methods=['get'], detail=True)
|
||||
def download(self, request, pk=None):
|
||||
try:
|
||||
return self.file_response(pk, "attachment")
|
||||
return self.file_response(
|
||||
pk, request, "attachment")
|
||||
except (FileNotFoundError, Document.DoesNotExist):
|
||||
raise Http404()
|
||||
|
||||
@ -210,6 +225,56 @@ class LogViewSet(ReadOnlyModelViewSet):
|
||||
ordering_fields = ("created",)
|
||||
|
||||
|
||||
class PostDocumentView(APIView):
|
||||
|
||||
permission_classes = (IsAuthenticated,)
|
||||
serializer_class = PostDocumentSerializer
|
||||
parser_classes = (parsers.MultiPartParser,)
|
||||
|
||||
def get_serializer_context(self):
|
||||
return {
|
||||
'request': self.request,
|
||||
'format': self.format_kwarg,
|
||||
'view': self
|
||||
}
|
||||
|
||||
def get_serializer(self, *args, **kwargs):
|
||||
kwargs['context'] = self.get_serializer_context()
|
||||
return self.serializer_class(*args, **kwargs)
|
||||
|
||||
def post(self, request, *args, **kwargs):
|
||||
|
||||
serializer = self.get_serializer(data=request.data)
|
||||
serializer.is_valid(raise_exception=True)
|
||||
|
||||
document = serializer.validated_data['document']
|
||||
document_data = serializer.validated_data['document_data']
|
||||
correspondent_id = serializer.validated_data['correspondent_id']
|
||||
document_type_id = serializer.validated_data['document_type_id']
|
||||
tag_ids = serializer.validated_data['tag_ids']
|
||||
title = serializer.validated_data['title']
|
||||
|
||||
t = int(mktime(datetime.now().timetuple()))
|
||||
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
|
||||
with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
|
||||
dir=settings.SCRATCH_DIR,
|
||||
delete=False) as f:
|
||||
f.write(document_data)
|
||||
os.utime(f.name, times=(t, t))
|
||||
|
||||
async_task("documents.tasks.consume_file",
|
||||
f.name,
|
||||
override_filename=document.name,
|
||||
override_title=title,
|
||||
override_correspondent_id=correspondent_id,
|
||||
override_document_type_id=document_type_id,
|
||||
override_tag_ids=tag_ids,
|
||||
task_name=os.path.basename(document.name)[:100])
|
||||
return Response("OK")
|
||||
|
||||
|
||||
class SearchView(APIView):
|
||||
|
||||
permission_classes = (IsAuthenticated,)
|
||||
@ -229,30 +294,34 @@ class SearchView(APIView):
|
||||
}
|
||||
|
||||
def get(self, request, format=None):
|
||||
if 'query' in request.query_params:
|
||||
query = request.query_params['query']
|
||||
try:
|
||||
page = int(request.query_params.get('page', 1))
|
||||
except (ValueError, TypeError):
|
||||
page = 1
|
||||
|
||||
if page < 1:
|
||||
page = 1
|
||||
|
||||
with index.query_page(self.ix, query, page) as result_page:
|
||||
return Response(
|
||||
{'count': len(result_page),
|
||||
'page': result_page.pagenum,
|
||||
'page_count': result_page.pagecount,
|
||||
'results': list(map(self.add_infos_to_hit, result_page))})
|
||||
|
||||
else:
|
||||
if 'query' not in request.query_params:
|
||||
return Response({
|
||||
'count': 0,
|
||||
'page': 0,
|
||||
'page_count': 0,
|
||||
'results': []})
|
||||
|
||||
query = request.query_params['query']
|
||||
try:
|
||||
page = int(request.query_params.get('page', 1))
|
||||
except (ValueError, TypeError):
|
||||
page = 1
|
||||
|
||||
if page < 1:
|
||||
page = 1
|
||||
|
||||
try:
|
||||
with index.query_page(self.ix, query, page) as (result_page,
|
||||
corrected_query):
|
||||
return Response(
|
||||
{'count': len(result_page),
|
||||
'page': result_page.pagenum,
|
||||
'page_count': result_page.pagecount,
|
||||
'corrected_query': corrected_query,
|
||||
'results': list(map(self.add_infos_to_hit, result_page))})
|
||||
except Exception as e:
|
||||
return HttpResponseBadRequest(str(e))
|
||||
|
||||
|
||||
class SearchAutoCompleteView(APIView):
|
||||
|
||||
|
@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs):
|
||||
binaries = (
|
||||
settings.CONVERT_BINARY,
|
||||
settings.OPTIPNG_BINARY,
|
||||
settings.UNPAPER_BINARY,
|
||||
"tesseract"
|
||||
)
|
||||
|
||||
|
@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta
|
||||
|
||||
MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media"))
|
||||
ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals")
|
||||
ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive")
|
||||
THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
|
||||
|
||||
DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data"))
|
||||
@ -85,6 +86,7 @@ INSTALLED_APPS = [
|
||||
"django.contrib.admin",
|
||||
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
"django_filters",
|
||||
|
||||
"django_q",
|
||||
@ -94,7 +96,8 @@ INSTALLED_APPS = [
|
||||
REST_FRAMEWORK = {
|
||||
'DEFAULT_AUTHENTICATION_CLASSES': [
|
||||
'rest_framework.authentication.BasicAuthentication',
|
||||
'rest_framework.authentication.SessionAuthentication'
|
||||
'rest_framework.authentication.SessionAuthentication',
|
||||
'rest_framework.authentication.TokenAuthentication'
|
||||
]
|
||||
}
|
||||
|
||||
@ -255,26 +258,43 @@ DISABLE_DBHANDLER = __get_boolean("PAPERLESS_DISABLE_DBHANDLER")
|
||||
LOGGING = {
|
||||
"version": 1,
|
||||
"disable_existing_loggers": False,
|
||||
'formatters': {
|
||||
'verbose': {
|
||||
'format': '{levelname} {asctime} {module} {message}',
|
||||
'style': '{',
|
||||
},
|
||||
'simple': {
|
||||
'format': '{levelname} {message}',
|
||||
'style': '{',
|
||||
},
|
||||
},
|
||||
"handlers": {
|
||||
"dbhandler": {
|
||||
"db": {
|
||||
"level": "DEBUG",
|
||||
"class": "documents.loggers.PaperlessHandler",
|
||||
},
|
||||
"streamhandler": {
|
||||
"class": "logging.StreamHandler"
|
||||
"console": {
|
||||
"level": "INFO",
|
||||
"class": "logging.StreamHandler",
|
||||
"formatter": "verbose",
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"handlers": ["console"],
|
||||
"level": "DEBUG",
|
||||
},
|
||||
"loggers": {
|
||||
"documents": {
|
||||
"handlers": ["dbhandler", "streamhandler"],
|
||||
"level": "DEBUG"
|
||||
"handlers": ["db"],
|
||||
"propagate": True,
|
||||
},
|
||||
"paperless_mail": {
|
||||
"handlers": ["dbhandler", "streamhandler"],
|
||||
"level": "DEBUG"
|
||||
"handlers": ["db"],
|
||||
"propagate": True,
|
||||
},
|
||||
"paperless_tesseract": {
|
||||
"handlers": ["dbhandler", "streamhandler"],
|
||||
"level": "DEBUG"
|
||||
"handlers": ["db"],
|
||||
"propagate": True,
|
||||
},
|
||||
},
|
||||
}
|
||||
@ -331,6 +351,10 @@ CONSUMER_POLLING = int(os.getenv("PAPERLESS_CONSUMER_POLLING", 0))
|
||||
|
||||
CONSUMER_DELETE_DUPLICATES = __get_boolean("PAPERLESS_CONSUMER_DELETE_DUPLICATES")
|
||||
|
||||
CONSUMER_RECURSIVE = __get_boolean("PAPERLESS_CONSUMER_RECURSIVE")
|
||||
|
||||
CONSUMER_SUBDIRS_AS_TAGS = __get_boolean("PAPERLESS_CONSUMER_SUBDIRS_AS_TAGS")
|
||||
|
||||
OPTIMIZE_THUMBNAILS = __get_boolean("PAPERLESS_OPTIMIZE_THUMBNAILS", "true")
|
||||
|
||||
OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
|
||||
@ -339,9 +363,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
|
||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
|
||||
|
||||
# OCRmyPDF --output-type options are available.
|
||||
# TODO: validate this setting.
|
||||
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
|
||||
|
||||
# OCR all documents?
|
||||
OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
|
||||
# skip. redo, force
|
||||
# TODO: validate this.
|
||||
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||
|
||||
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
|
||||
|
||||
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
|
||||
|
||||
# GNUPG needs a home directory for some reason
|
||||
GNUPG_HOME = os.getenv("HOME", "/tmp")
|
||||
@ -350,11 +382,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
|
||||
CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
|
||||
CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
|
||||
CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
|
||||
CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))
|
||||
|
||||
GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
|
||||
|
||||
OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
|
||||
UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")
|
||||
|
||||
|
||||
# Pre-2.x versions of Paperless stored your documents locally with GPG
|
||||
|
@ -4,6 +4,7 @@ from django.contrib.auth.decorators import login_required
|
||||
from django.urls import path, re_path
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.views.generic import RedirectView
|
||||
from rest_framework.authtoken import views
|
||||
from rest_framework.routers import DefaultRouter
|
||||
|
||||
from documents.views import (
|
||||
@ -15,7 +16,8 @@ from documents.views import (
|
||||
SearchView,
|
||||
IndexView,
|
||||
SearchAutoCompleteView,
|
||||
StatisticsView
|
||||
StatisticsView,
|
||||
PostDocumentView
|
||||
)
|
||||
from paperless.views import FaviconView
|
||||
|
||||
@ -45,6 +47,11 @@ urlpatterns = [
|
||||
StatisticsView.as_view(),
|
||||
name="statistics"),
|
||||
|
||||
re_path(r"^documents/post_document/", PostDocumentView.as_view(),
|
||||
name="post_document"),
|
||||
|
||||
path('token/', views.obtain_auth_token)
|
||||
|
||||
] + api_router.urls)),
|
||||
|
||||
re_path(r"^favicon.ico$", FaviconView.as_view(), name="favicon"),
|
||||
|
@ -1 +1 @@
|
||||
__version__ = (0, 9, 3)
|
||||
__version__ = (0, 9, 5)
|
||||
|
@ -4,6 +4,7 @@ from datetime import timedelta, date
|
||||
|
||||
import magic
|
||||
from django.conf import settings
|
||||
from django.db import DatabaseError
|
||||
from django.utils.text import slugify
|
||||
from django_q.tasks import async_task
|
||||
from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \
|
||||
@ -86,46 +87,6 @@ def make_criterias(rule):
|
||||
return {**criterias, **get_rule_action(rule).get_criteria()}
|
||||
|
||||
|
||||
def get_title(message, att, rule):
|
||||
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
|
||||
title = message.subject
|
||||
elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
|
||||
title = os.path.splitext(os.path.basename(att.filename))[0]
|
||||
else:
|
||||
raise ValueError("Unknown title selector.")
|
||||
|
||||
return title
|
||||
|
||||
|
||||
def get_correspondent(message, rule):
|
||||
if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING:
|
||||
correspondent = None
|
||||
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL:
|
||||
correspondent_name = message.from_
|
||||
correspondent = Correspondent.objects.get_or_create(
|
||||
name=correspondent_name, defaults={
|
||||
"slug": slugify(correspondent_name)
|
||||
})[0]
|
||||
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME:
|
||||
if message.from_values and \
|
||||
'name' in message.from_values \
|
||||
and message.from_values['name']:
|
||||
correspondent_name = message.from_values['name']
|
||||
else:
|
||||
correspondent_name = message.from_
|
||||
|
||||
correspondent = Correspondent.objects.get_or_create(
|
||||
name=correspondent_name, defaults={
|
||||
"slug": slugify(correspondent_name)
|
||||
})[0]
|
||||
elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
|
||||
correspondent = rule.assign_correspondent
|
||||
else:
|
||||
raise ValueError("Unknwown correspondent selector")
|
||||
|
||||
return correspondent
|
||||
|
||||
|
||||
def get_mailbox(server, port, security):
|
||||
if security == MailAccount.IMAP_SECURITY_NONE:
|
||||
mailbox = MailBoxUnencrypted(server, port)
|
||||
@ -140,6 +101,51 @@ def get_mailbox(server, port, security):
|
||||
|
||||
class MailAccountHandler(LoggingMixin):
|
||||
|
||||
def _correspondent_from_name(self, name):
|
||||
try:
|
||||
return Correspondent.objects.get_or_create(
|
||||
name=name, defaults={
|
||||
"slug": slugify(name)
|
||||
})[0]
|
||||
except DatabaseError as e:
|
||||
self.log(
|
||||
"error",
|
||||
f"Error while retrieving correspondent {name}: {e}"
|
||||
)
|
||||
return None
|
||||
|
||||
def get_title(self, message, att, rule):
|
||||
if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT:
|
||||
return message.subject
|
||||
|
||||
elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME:
|
||||
return os.path.splitext(os.path.basename(att.filename))[0]
|
||||
|
||||
else:
|
||||
raise ValueError("Unknown title selector.")
|
||||
|
||||
def get_correspondent(self, message, rule):
|
||||
c_from = rule.assign_correspondent_from
|
||||
|
||||
if c_from == MailRule.CORRESPONDENT_FROM_NOTHING:
|
||||
return None
|
||||
|
||||
elif c_from == MailRule.CORRESPONDENT_FROM_EMAIL:
|
||||
return self._correspondent_from_name(message.from_)
|
||||
|
||||
elif c_from == MailRule.CORRESPONDENT_FROM_NAME:
|
||||
if message.from_values and 'name' in message.from_values and message.from_values['name']: # NOQA: E501
|
||||
return self._correspondent_from_name(
|
||||
message.from_values['name'])
|
||||
else:
|
||||
return self._correspondent_from_name(message.from_)
|
||||
|
||||
elif c_from == MailRule.CORRESPONDENT_FROM_CUSTOM:
|
||||
return rule.assign_correspondent
|
||||
|
||||
else:
|
||||
raise ValueError("Unknwown correspondent selector")
|
||||
|
||||
def handle_mail_account(self, account):
|
||||
|
||||
self.renew_logging_group()
|
||||
@ -156,79 +162,89 @@ class MailAccountHandler(LoggingMixin):
|
||||
M.login(account.username, account.password)
|
||||
except Exception:
|
||||
raise MailError(
|
||||
f"Error while authenticating account {account.name}")
|
||||
f"Error while authenticating account {account}")
|
||||
|
||||
self.log('debug', f"Account {account}: Processing "
|
||||
f"{account.rules.count()} rule(s)")
|
||||
|
||||
for rule in account.rules.order_by('order'):
|
||||
self.log(
|
||||
'debug',
|
||||
f"Account {account}: Processing rule {rule.name}")
|
||||
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {account}.{rule}: Selecting folder {rule.folder}")
|
||||
|
||||
try:
|
||||
M.folder.set(rule.folder)
|
||||
except MailboxFolderSelectError:
|
||||
raise MailError(
|
||||
f"Rule {rule.name}: Folder {rule.folder} "
|
||||
f"does not exist in account {account.name}")
|
||||
total_processed_files += self.handle_mail_rule(M, rule)
|
||||
except Exception as e:
|
||||
self.log(
|
||||
"error",
|
||||
f"Rule {rule}: Error while processing rule: {e}",
|
||||
exc_info=True
|
||||
)
|
||||
|
||||
criterias = make_criterias(rule)
|
||||
return total_processed_files
|
||||
|
||||
def handle_mail_rule(self, M, rule):
|
||||
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {rule}: Selecting folder {rule.folder}")
|
||||
|
||||
try:
|
||||
M.folder.set(rule.folder)
|
||||
except MailboxFolderSelectError:
|
||||
raise MailError(
|
||||
f"Rule {rule}: Folder {rule.folder} "
|
||||
f"does not exist in account {rule.account}")
|
||||
|
||||
criterias = make_criterias(rule)
|
||||
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {rule}: Searching folder with criteria "
|
||||
f"{str(AND(**criterias))}")
|
||||
|
||||
try:
|
||||
messages = M.fetch(criteria=AND(**criterias),
|
||||
mark_seen=False)
|
||||
except Exception:
|
||||
raise MailError(
|
||||
f"Rule {rule}: Error while fetching folder {rule.folder}")
|
||||
|
||||
post_consume_messages = []
|
||||
|
||||
mails_processed = 0
|
||||
total_processed_files = 0
|
||||
|
||||
for message in messages:
|
||||
try:
|
||||
processed_files = self.handle_message(message, rule)
|
||||
if processed_files > 0:
|
||||
post_consume_messages.append(message.uid)
|
||||
|
||||
total_processed_files += processed_files
|
||||
mails_processed += 1
|
||||
except Exception as e:
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {account}.{rule}: Searching folder with criteria "
|
||||
f"{str(AND(**criterias))}")
|
||||
"error",
|
||||
f"Rule {rule}: Error while processing mail "
|
||||
f"{message.uid}: {e}",
|
||||
exc_info=True)
|
||||
|
||||
try:
|
||||
messages = M.fetch(criteria=AND(**criterias),
|
||||
mark_seen=False)
|
||||
except Exception:
|
||||
raise MailError(
|
||||
f"Rule {rule.name}: Error while fetching folder "
|
||||
f"{rule.folder} of account {account.name}")
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {rule}: Processed {mails_processed} matching mail(s)")
|
||||
|
||||
post_consume_messages = []
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {rule}: Running mail actions on "
|
||||
f"{len(post_consume_messages)} mails")
|
||||
|
||||
mails_processed = 0
|
||||
try:
|
||||
get_rule_action(rule).post_consume(
|
||||
M,
|
||||
post_consume_messages,
|
||||
rule.action_parameter)
|
||||
|
||||
for message in messages:
|
||||
try:
|
||||
processed_files = self.handle_message(message, rule)
|
||||
except Exception:
|
||||
raise MailError(
|
||||
f"Rule {rule.name}: Error while processing mail "
|
||||
f"{message.uid} of account {account.name}")
|
||||
if processed_files > 0:
|
||||
post_consume_messages.append(message.uid)
|
||||
|
||||
total_processed_files += processed_files
|
||||
mails_processed += 1
|
||||
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {account}.{rule}: Processed {mails_processed} "
|
||||
f"matching mail(s)")
|
||||
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {account}.{rule}: Running mail actions on "
|
||||
f"{len(post_consume_messages)} mails")
|
||||
|
||||
try:
|
||||
get_rule_action(rule).post_consume(
|
||||
M,
|
||||
post_consume_messages,
|
||||
rule.action_parameter)
|
||||
|
||||
except Exception:
|
||||
raise MailError(
|
||||
f"Rule {rule.name}: Error while processing "
|
||||
f"post-consume actions for account {account.name}")
|
||||
except Exception as e:
|
||||
raise MailError(
|
||||
f"Rule {rule}: Error while processing post-consume actions: "
|
||||
f"{e}")
|
||||
|
||||
return total_processed_files
|
||||
|
||||
@ -238,11 +254,11 @@ class MailAccountHandler(LoggingMixin):
|
||||
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {rule.account}.{rule}: "
|
||||
f"Rule {rule}: "
|
||||
f"Processing mail {message.subject} from {message.from_} with "
|
||||
f"{len(message.attachments)} attachment(s)")
|
||||
|
||||
correspondent = get_correspondent(message, rule)
|
||||
correspondent = self.get_correspondent(message, rule)
|
||||
tag = rule.assign_tag
|
||||
doc_type = rule.assign_document_type
|
||||
|
||||
@ -253,12 +269,12 @@ class MailAccountHandler(LoggingMixin):
|
||||
if not att.content_disposition == "attachment":
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {rule.account}.{rule}: "
|
||||
f"Rule {rule}: "
|
||||
f"Skipping attachment {att.filename} "
|
||||
f"with content disposition inline")
|
||||
f"with content disposition {att.content_disposition}")
|
||||
continue
|
||||
|
||||
title = get_title(message, att, rule)
|
||||
title = self.get_title(message, att, rule)
|
||||
|
||||
# don't trust the content type of the attachment. Could be
|
||||
# generic application/octet-stream.
|
||||
@ -274,7 +290,7 @@ class MailAccountHandler(LoggingMixin):
|
||||
|
||||
self.log(
|
||||
'info',
|
||||
f"Rule {rule.account}.{rule}: "
|
||||
f"Rule {rule}: "
|
||||
f"Consuming attachment {att.filename} from mail "
|
||||
f"{message.subject} from {message.from_}")
|
||||
|
||||
@ -293,7 +309,7 @@ class MailAccountHandler(LoggingMixin):
|
||||
else:
|
||||
self.log(
|
||||
'debug',
|
||||
f"Rule {rule.account}.{rule}: "
|
||||
f"Rule {rule}: "
|
||||
f"Skipping attachment {att.filename} "
|
||||
f"since guessed mime type {mime_type} is not supported "
|
||||
f"by paperless")
|
||||
|
@ -139,4 +139,4 @@ class MailRule(models.Model):
|
||||
)
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
return f"{self.account.name}.{self.name}"
|
||||
|
@ -1,14 +1,20 @@
|
||||
import logging
|
||||
|
||||
from paperless_mail.mail import MailAccountHandler
|
||||
from paperless_mail.mail import MailAccountHandler, MailError
|
||||
from paperless_mail.models import MailAccount
|
||||
|
||||
|
||||
def process_mail_accounts():
|
||||
total_new_documents = 0
|
||||
for account in MailAccount.objects.all():
|
||||
total_new_documents += MailAccountHandler().handle_mail_account(
|
||||
account)
|
||||
try:
|
||||
total_new_documents += MailAccountHandler().handle_mail_account(
|
||||
account)
|
||||
except MailError as e:
|
||||
logging.getLogger(__name__).error(
|
||||
f"Error while processing mail account {account}: {e}",
|
||||
exc_info=True
|
||||
)
|
||||
|
||||
if total_new_documents > 0:
|
||||
return f"Added {total_new_documents} document(s)."
|
||||
@ -17,8 +23,8 @@ def process_mail_accounts():
|
||||
|
||||
|
||||
def process_mail_account(name):
|
||||
account = MailAccount.objects.find(name=name)
|
||||
if account:
|
||||
try:
|
||||
account = MailAccount.objects.get(name=name)
|
||||
MailAccountHandler().handle_mail_account(account)
|
||||
else:
|
||||
logging.error("Unknown mail acccount: {}".format(name))
|
||||
except MailAccount.DoesNotExist:
|
||||
logging.getLogger(__name__).error(f"Unknown mail acccount: {name}")
|
||||
|
@ -3,11 +3,14 @@ from collections import namedtuple
|
||||
from typing import ContextManager
|
||||
from unittest import mock
|
||||
|
||||
from django.core.management import call_command
|
||||
from django.db import DatabaseError
|
||||
from django.test import TestCase
|
||||
from imap_tools import MailMessageFlags, MailboxFolderSelectError
|
||||
|
||||
from documents.models import Correspondent
|
||||
from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title
|
||||
from paperless_mail import tasks
|
||||
from paperless_mail.mail import MailError, MailAccountHandler
|
||||
from paperless_mail.models import MailRule, MailAccount
|
||||
|
||||
|
||||
@ -163,28 +166,30 @@ class TestMail(TestCase):
|
||||
me_localhost = Correspondent.objects.create(name=message2.from_)
|
||||
someone_else = Correspondent.objects.create(name="someone else")
|
||||
|
||||
handler = MailAccountHandler()
|
||||
|
||||
rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING)
|
||||
self.assertIsNone(get_correspondent(message, rule))
|
||||
self.assertIsNone(handler.get_correspondent(message, rule))
|
||||
|
||||
rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
|
||||
c = get_correspondent(message, rule)
|
||||
c = handler.get_correspondent(message, rule)
|
||||
self.assertIsNotNone(c)
|
||||
self.assertEqual(c.name, "someone@somewhere.com")
|
||||
c = get_correspondent(message2, rule)
|
||||
c = handler.get_correspondent(message2, rule)
|
||||
self.assertIsNotNone(c)
|
||||
self.assertEqual(c.name, "me@localhost.com")
|
||||
self.assertEqual(c.id, me_localhost.id)
|
||||
|
||||
rule = MailRule(name="c", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME)
|
||||
c = get_correspondent(message, rule)
|
||||
c = handler.get_correspondent(message, rule)
|
||||
self.assertIsNotNone(c)
|
||||
self.assertEqual(c.name, "Someone!")
|
||||
c = get_correspondent(message2, rule)
|
||||
c = handler.get_correspondent(message2, rule)
|
||||
self.assertIsNotNone(c)
|
||||
self.assertEqual(c.id, me_localhost.id)
|
||||
|
||||
rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else)
|
||||
c = get_correspondent(message, rule)
|
||||
c = handler.get_correspondent(message, rule)
|
||||
self.assertEqual(c, someone_else)
|
||||
|
||||
def test_get_title(self):
|
||||
@ -192,10 +197,13 @@ class TestMail(TestCase):
|
||||
message.subject = "the message title"
|
||||
att = namedtuple('Attachment', [])
|
||||
att.filename = "this_is_the_file.pdf"
|
||||
|
||||
handler = MailAccountHandler()
|
||||
|
||||
rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME)
|
||||
self.assertEqual(get_title(message, att, rule), "this_is_the_file")
|
||||
self.assertEqual(handler.get_title(message, att, rule), "this_is_the_file")
|
||||
rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT)
|
||||
self.assertEqual(get_title(message, att, rule), "the message title")
|
||||
self.assertEqual(handler.get_title(message, att, rule), "the message title")
|
||||
|
||||
def test_handle_message(self):
|
||||
message = create_message(subject="the message title", from_="Myself", num_attachments=2)
|
||||
@ -317,7 +325,7 @@ class TestMail(TestCase):
|
||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
|
||||
|
||||
def test_errors(self):
|
||||
def test_error_login(self):
|
||||
account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong")
|
||||
|
||||
try:
|
||||
@ -327,26 +335,84 @@ class TestMail(TestCase):
|
||||
else:
|
||||
self.fail("Should raise exception")
|
||||
|
||||
def test_error_skip_account(self):
|
||||
account_faulty = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wroasdng")
|
||||
|
||||
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
|
||||
rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh")
|
||||
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE,
|
||||
action_parameter="spam", filter_subject="Claim")
|
||||
|
||||
tasks.process_mail_accounts()
|
||||
self.assertEqual(self.async_task.call_count, 1)
|
||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
|
||||
|
||||
def test_error_skip_rule(self):
|
||||
|
||||
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
|
||||
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE,
|
||||
action_parameter="spam", filter_subject="Claim", order=1, folder="uuuhhhh")
|
||||
rule2 = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE,
|
||||
action_parameter="spam", filter_subject="Claim", order=2)
|
||||
|
||||
self.mail_account_handler.handle_mail_account(account)
|
||||
self.assertEqual(self.async_task.call_count, 1)
|
||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||
self.assertEqual(len(self.bogus_mailbox.messages_spam), 1)
|
||||
|
||||
|
||||
@mock.patch("paperless_mail.mail.MailAccountHandler.get_correspondent")
|
||||
def test_error_skip_mail(self, m):
|
||||
|
||||
def get_correspondent_fake(message, rule):
|
||||
if message.from_ == 'amazon@amazon.de':
|
||||
raise ValueError("Does not compute.")
|
||||
else:
|
||||
return None
|
||||
|
||||
m.side_effect = get_correspondent_fake
|
||||
|
||||
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
|
||||
rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam")
|
||||
|
||||
self.mail_account_handler.handle_mail_account(account)
|
||||
|
||||
# test that we still consume mail even if some mails throw errors.
|
||||
self.assertEqual(self.async_task.call_count, 2)
|
||||
|
||||
# faulty mail still in inbox, untouched
|
||||
self.assertEqual(len(self.bogus_mailbox.messages), 1)
|
||||
self.assertEqual(self.bogus_mailbox.messages[0].from_, 'amazon@amazon.de')
|
||||
|
||||
def test_error_create_correspondent(self):
|
||||
|
||||
account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret")
|
||||
rule = MailRule.objects.create(
|
||||
name="testrule", filter_from="amazon@amazon.de",
|
||||
account=account, action=MailRule.ACTION_MOVE, action_parameter="spam",
|
||||
assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL)
|
||||
|
||||
self.mail_account_handler.handle_mail_account(account)
|
||||
|
||||
self.async_task.assert_called_once()
|
||||
args, kwargs = self.async_task.call_args
|
||||
|
||||
c = Correspondent.objects.get(name="amazon@amazon.de")
|
||||
# should work
|
||||
self.assertEquals(kwargs['override_correspondent_id'], c.id)
|
||||
|
||||
self.async_task.reset_mock()
|
||||
self.reset_bogus_mailbox()
|
||||
|
||||
with mock.patch("paperless_mail.mail.Correspondent.objects.get_or_create") as m:
|
||||
m.side_effect = DatabaseError()
|
||||
|
||||
try:
|
||||
self.mail_account_handler.handle_mail_account(account)
|
||||
except MailError as e:
|
||||
self.assertTrue("uuuh does not exist" in str(e))
|
||||
else:
|
||||
self.fail("Should raise exception")
|
||||
|
||||
account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret")
|
||||
args, kwargs = self.async_task.call_args
|
||||
self.async_task.assert_called_once()
|
||||
self.assertEquals(kwargs['override_correspondent_id'], None)
|
||||
|
||||
rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim")
|
||||
|
||||
try:
|
||||
self.mail_account_handler.handle_mail_account(account)
|
||||
except MailError as e:
|
||||
self.assertTrue("Error while processing post-consume actions" in str(e))
|
||||
else:
|
||||
self.fail("Should raise exception")
|
||||
|
||||
def test_filters(self):
|
||||
|
||||
@ -390,3 +456,43 @@ class TestMail(TestCase):
|
||||
self.mail_account_handler.handle_mail_account(account)
|
||||
self.assertEqual(len(self.bogus_mailbox.messages), 2)
|
||||
self.assertEqual(self.async_task.call_count, 5)
|
||||
|
||||
class TestManagementCommand(TestCase):
|
||||
|
||||
@mock.patch("paperless_mail.management.commands.mail_fetcher.tasks.process_mail_accounts")
|
||||
def test_mail_fetcher(self, m):
|
||||
|
||||
call_command("mail_fetcher")
|
||||
|
||||
m.assert_called_once()
|
||||
|
||||
class TestTasks(TestCase):
|
||||
|
||||
@mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
|
||||
def test_all_accounts(self, m):
|
||||
m.side_effect = lambda account: 6
|
||||
|
||||
MailAccount.objects.create(name="A", imap_server="A", username="A", password="A")
|
||||
MailAccount.objects.create(name="B", imap_server="A", username="A", password="A")
|
||||
|
||||
result = tasks.process_mail_accounts()
|
||||
|
||||
self.assertEqual(m.call_count, 2)
|
||||
self.assertIn("Added 12", result)
|
||||
|
||||
m.side_effect = lambda account: 0
|
||||
result = tasks.process_mail_accounts()
|
||||
self.assertIn("No new", result)
|
||||
|
||||
@mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
|
||||
def test_single_accounts(self, m):
|
||||
|
||||
MailAccount.objects.create(name="A", imap_server="A", username="A", password="A")
|
||||
|
||||
tasks.process_mail_account("A")
|
||||
|
||||
m.assert_called_once()
|
||||
m.reset_mock()
|
||||
|
||||
tasks.process_mail_account("B")
|
||||
m.assert_not_called()
|
||||
|
@ -14,12 +14,21 @@ def get_tesseract_langs():
|
||||
|
||||
@register()
|
||||
def check_default_language_available(app_configs, **kwargs):
|
||||
langs = get_tesseract_langs()
|
||||
installed_langs = get_tesseract_langs()
|
||||
|
||||
if settings.OCR_LANGUAGE not in langs:
|
||||
return [Error(
|
||||
f"The default ocr language {settings.OCR_LANGUAGE} is "
|
||||
f"not installed. Paperless cannot OCR your documents "
|
||||
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
|
||||
else:
|
||||
return []
|
||||
if not settings.OCR_LANGUAGE:
|
||||
return [Warning(
|
||||
"No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
|
||||
"This means that tesseract will fallback to english."
|
||||
)]
|
||||
|
||||
specified_langs = settings.OCR_LANGUAGE.split("+")
|
||||
|
||||
for lang in specified_langs:
|
||||
if lang not in installed_langs:
|
||||
return [Error(
|
||||
f"The selected ocr language {lang} is "
|
||||
f"not installed. Paperless cannot OCR your documents "
|
||||
f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
|
||||
|
||||
return []
|
||||
|
@ -1,23 +1,15 @@
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from multiprocessing.pool import ThreadPool
|
||||
|
||||
import langdetect
|
||||
import ocrmypdf
|
||||
import pdftotext
|
||||
import pyocr
|
||||
from PIL import Image
|
||||
from django.conf import settings
|
||||
from pyocr import PyocrException
|
||||
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||
|
||||
from documents.parsers import DocumentParser, ParseError, run_unpaper, \
|
||||
run_convert
|
||||
from .languages import ISO639
|
||||
|
||||
|
||||
class OCRError(Exception):
|
||||
pass
|
||||
from documents.parsers import DocumentParser, ParseError, run_convert
|
||||
|
||||
|
||||
class RasterisedDocumentParser(DocumentParser):
|
||||
@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
|
||||
"""
|
||||
|
||||
def __init__(self, path, logging_group):
|
||||
super().__init__(path, logging_group)
|
||||
self._text = None
|
||||
|
||||
def get_thumbnail(self):
|
||||
def get_thumbnail(self, document_path, mime_type):
|
||||
"""
|
||||
The thumbnail of a PDF is just a 500px wide image of the first page.
|
||||
"""
|
||||
@ -43,8 +31,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
scale="500x5000>",
|
||||
alpha="remove",
|
||||
strip=True,
|
||||
trim=True,
|
||||
input_file="{}[0]".format(self.document_path),
|
||||
trim=False,
|
||||
input_file="{}[0]".format(document_path),
|
||||
output_file=out_path,
|
||||
logging_group=self.logging_group)
|
||||
except ParseError:
|
||||
@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"-q",
|
||||
"-sDEVICE=pngalpha",
|
||||
"-o", gs_out_path,
|
||||
self.document_path]
|
||||
document_path]
|
||||
if not subprocess.Popen(cmd).wait() == 0:
|
||||
raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
|
||||
# then run convert on the output from gs
|
||||
@ -67,176 +55,160 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
scale="500x5000>",
|
||||
alpha="remove",
|
||||
strip=True,
|
||||
trim=True,
|
||||
trim=False,
|
||||
input_file=gs_out_path,
|
||||
output_file=out_path,
|
||||
logging_group=self.logging_group)
|
||||
|
||||
return out_path
|
||||
|
||||
def _is_ocred(self):
|
||||
|
||||
# Extract text from PDF using pdftotext
|
||||
text = get_text_from_pdf(self.document_path)
|
||||
|
||||
# We assume, that a PDF with at least 50 characters contains text
|
||||
# (so no OCR required)
|
||||
return len(text) > 50
|
||||
|
||||
def get_text(self):
|
||||
|
||||
if self._text is not None:
|
||||
return self._text
|
||||
|
||||
if not settings.OCR_ALWAYS and self._is_ocred():
|
||||
self.log("debug", "Skipping OCR, using Text from PDF")
|
||||
self._text = get_text_from_pdf(self.document_path)
|
||||
return self._text
|
||||
|
||||
images = self._get_greyscale()
|
||||
|
||||
if not images:
|
||||
raise ParseError("Empty document, nothing to do.")
|
||||
def is_image(self, mime_type):
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/bmp",
|
||||
"image/gif",
|
||||
]
|
||||
|
||||
def get_dpi(self, image):
|
||||
try:
|
||||
|
||||
sample_page_index = int(len(images) / 2)
|
||||
self.log(
|
||||
"debug",
|
||||
f"Attempting language detection on page "
|
||||
f"{sample_page_index + 1} of {len(images)}...")
|
||||
|
||||
sample_page_text = self._ocr([images[sample_page_index]],
|
||||
settings.OCR_LANGUAGE)[0]
|
||||
guessed_language = self._guess_language(sample_page_text)
|
||||
|
||||
if not guessed_language or guessed_language not in ISO639:
|
||||
self.log("warning", "Language detection failed.")
|
||||
ocr_pages = self._complete_ocr_default_language(
|
||||
images, sample_page_index, sample_page_text)
|
||||
|
||||
elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
|
||||
self.log(
|
||||
"debug",
|
||||
f"Detected language: {guessed_language} "
|
||||
f"(default language)")
|
||||
ocr_pages = self._complete_ocr_default_language(
|
||||
images, sample_page_index, sample_page_text)
|
||||
|
||||
elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501
|
||||
self.log(
|
||||
"warning",
|
||||
f"Detected language {guessed_language} is not available "
|
||||
f"on this system.")
|
||||
ocr_pages = self._complete_ocr_default_language(
|
||||
images, sample_page_index, sample_page_text)
|
||||
|
||||
else:
|
||||
self.log("debug", f"Detected language: {guessed_language}")
|
||||
ocr_pages = self._ocr(images, ISO639[guessed_language])
|
||||
|
||||
self.log("debug", "OCR completed.")
|
||||
self._text = strip_excess_whitespace(" ".join(ocr_pages))
|
||||
return self._text
|
||||
|
||||
except OCRError as e:
|
||||
raise ParseError(e)
|
||||
|
||||
def _get_greyscale(self):
|
||||
"""
|
||||
Greyscale images are easier for Tesseract to OCR
|
||||
"""
|
||||
|
||||
# Convert PDF to multiple PNMs
|
||||
input_file = self.document_path
|
||||
|
||||
if settings.OCR_PAGES == 1:
|
||||
input_file += "[0]"
|
||||
elif settings.OCR_PAGES > 1:
|
||||
input_file += f"[0-{settings.OCR_PAGES - 1}]"
|
||||
|
||||
self.log(
|
||||
"debug",
|
||||
f"Converting document {input_file} into greyscale images")
|
||||
|
||||
output_files = os.path.join(self.tempdir, "convert-%04d.pnm")
|
||||
|
||||
run_convert(density=settings.CONVERT_DENSITY,
|
||||
depth="8",
|
||||
type="grayscale",
|
||||
input_file=input_file,
|
||||
output_file=output_files,
|
||||
logging_group=self.logging_group)
|
||||
|
||||
# Get a list of converted images
|
||||
pnms = []
|
||||
for f in os.listdir(self.tempdir):
|
||||
if f.endswith(".pnm"):
|
||||
pnms.append(os.path.join(self.tempdir, f))
|
||||
|
||||
self.log("debug", f"Running unpaper on {len(pnms)} pages...")
|
||||
|
||||
# Run unpaper in parallel on converted images
|
||||
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||
pnms = pool.map(run_unpaper, pnms)
|
||||
|
||||
return sorted(filter(lambda __: os.path.isfile(__), pnms))
|
||||
|
||||
def _guess_language(self, text):
|
||||
try:
|
||||
guess = langdetect.detect(text)
|
||||
return guess
|
||||
with Image.open(image) as im:
|
||||
x, y = im.info['dpi']
|
||||
return x
|
||||
except Exception as e:
|
||||
self.log('warning', f"Language detection failed with: {e}")
|
||||
self.log(
|
||||
'warning',
|
||||
f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def _ocr(self, imgs, lang):
|
||||
self.log(
|
||||
"debug",
|
||||
f"Performing OCR on {len(imgs)} page(s) with language {lang}")
|
||||
with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
|
||||
r = pool.map(image_to_string, itertools.product(imgs, [lang]))
|
||||
return r
|
||||
def parse(self, document_path, mime_type):
|
||||
mode = settings.OCR_MODE
|
||||
|
||||
def _complete_ocr_default_language(self,
|
||||
images,
|
||||
sample_page_index,
|
||||
sample_page):
|
||||
images_copy = list(images)
|
||||
del images_copy[sample_page_index]
|
||||
if images_copy:
|
||||
self.log('debug', "Continuing ocr with default language.")
|
||||
ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
|
||||
ocr_pages.insert(sample_page_index, sample_page)
|
||||
return ocr_pages
|
||||
text_original = get_text_from_pdf(document_path)
|
||||
has_text = text_original and len(text_original) > 50
|
||||
|
||||
if mode == "skip_noarchive" and has_text:
|
||||
self.log("debug",
|
||||
"Document has text, skipping OCRmyPDF entirely.")
|
||||
self.text = text_original
|
||||
return
|
||||
|
||||
if mode in ['skip', 'skip_noarchive'] and not has_text:
|
||||
# upgrade to redo, since there appears to be no text in the
|
||||
# document. This happens to some weird encrypted documents or
|
||||
# documents with failed OCR attempts for which OCRmyPDF will
|
||||
# still report that there actually is text in them.
|
||||
self.log("debug",
|
||||
"No text was found in the document and skip is "
|
||||
"specified. Upgrading OCR mode to redo.")
|
||||
mode = "redo"
|
||||
|
||||
archive_path = os.path.join(self.tempdir, "archive.pdf")
|
||||
|
||||
ocr_args = {
|
||||
'input_file': document_path,
|
||||
'output_file': archive_path,
|
||||
'use_threads': True,
|
||||
'jobs': settings.THREADS_PER_WORKER,
|
||||
'language': settings.OCR_LANGUAGE,
|
||||
'output_type': settings.OCR_OUTPUT_TYPE,
|
||||
'progress_bar': False,
|
||||
'clean': True
|
||||
}
|
||||
|
||||
if settings.OCR_PAGES > 0:
|
||||
ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
|
||||
|
||||
# Mode selection.
|
||||
|
||||
if mode in ['skip', 'skip_noarchive']:
|
||||
ocr_args['skip_text'] = True
|
||||
elif mode == 'redo':
|
||||
ocr_args['redo_ocr'] = True
|
||||
elif mode == 'force':
|
||||
ocr_args['force_ocr'] = True
|
||||
else:
|
||||
return [sample_page]
|
||||
raise ParseError(
|
||||
f"Invalid ocr mode: {mode}")
|
||||
|
||||
if self.is_image(mime_type):
|
||||
dpi = self.get_dpi(document_path)
|
||||
if dpi:
|
||||
self.log(
|
||||
"debug",
|
||||
f"Detected DPI for image {document_path}: {dpi}"
|
||||
)
|
||||
ocr_args['image_dpi'] = dpi
|
||||
elif settings.OCR_IMAGE_DPI:
|
||||
ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
|
||||
else:
|
||||
raise ParseError(
|
||||
f"Cannot produce archive PDF for image {document_path}, "
|
||||
f"no DPI information is present in this image and "
|
||||
f"OCR_IMAGE_DPI is not set.")
|
||||
|
||||
if settings.OCR_USER_ARGS:
|
||||
try:
|
||||
user_args = json.loads(settings.OCR_USER_ARGS)
|
||||
ocr_args = {**ocr_args, **user_args}
|
||||
except Exception as e:
|
||||
self.log(
|
||||
"warning",
|
||||
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
|
||||
f"they will not be used: {e}")
|
||||
|
||||
# This forces tesseract to use one core per page.
|
||||
os.environ['OMP_THREAD_LIMIT'] = "1"
|
||||
|
||||
try:
|
||||
self.log("debug",
|
||||
f"Calling OCRmyPDF with {str(ocr_args)}")
|
||||
ocrmypdf.ocr(**ocr_args)
|
||||
# success! announce results
|
||||
self.archive_path = archive_path
|
||||
self.text = get_text_from_pdf(archive_path)
|
||||
|
||||
except (InputFileError, EncryptedPdfError) as e:
|
||||
|
||||
self.log("debug",
|
||||
f"Encountered an error: {e}. Trying to use text from "
|
||||
f"original.")
|
||||
# This happens with some PDFs when used with the redo_ocr option.
|
||||
# This is not the end of the world, we'll just use what we already
|
||||
# have in the document.
|
||||
self.text = text_original
|
||||
# Also, no archived file.
|
||||
if not self.text:
|
||||
# However, if we don't have anything, fail:
|
||||
raise ParseError(e)
|
||||
|
||||
except Exception as e:
|
||||
# Anything else is probably serious.
|
||||
raise ParseError(e)
|
||||
|
||||
if not self.text:
|
||||
# This may happen for files that don't have any text.
|
||||
self.log(
|
||||
'warning',
|
||||
f"Document {document_path} does not have any text."
|
||||
f"This is probably an error or you tried to add an image "
|
||||
f"without text, or something is wrong with this document.")
|
||||
self.text = ""
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
|
||||
if not text:
|
||||
return None
|
||||
|
||||
collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
|
||||
no_leading_whitespace = re.sub(
|
||||
r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
|
||||
no_trailing_whitespace = re.sub(
|
||||
r"([^\S\n\r]+)$", '', no_leading_whitespace)
|
||||
return no_trailing_whitespace
|
||||
|
||||
|
||||
def image_to_string(args):
|
||||
img, lang = args
|
||||
ocr = pyocr.get_available_tools()[0]
|
||||
with Image.open(img) as f:
|
||||
if ocr.can_detect_orientation():
|
||||
try:
|
||||
orientation = ocr.detect_orientation(f, lang=lang)
|
||||
f = f.rotate(orientation["angle"], expand=1)
|
||||
except Exception:
|
||||
# Rotation not possible, ignore
|
||||
pass
|
||||
try:
|
||||
return ocr.image_to_string(f, lang=lang)
|
||||
except PyocrException as e:
|
||||
raise OCRError(e)
|
||||
# TODO: this needs a rework
|
||||
return no_trailing_whitespace.strip()
|
||||
|
||||
|
||||
def get_text_from_pdf(pdf_file):
|
||||
@ -245,6 +217,9 @@ def get_text_from_pdf(pdf_file):
|
||||
try:
|
||||
pdf = pdftotext.PDF(f)
|
||||
except pdftotext.Error:
|
||||
return ""
|
||||
# might not be a PDF file
|
||||
return None
|
||||
|
||||
return "\n".join(pdf)
|
||||
text = "\n".join(pdf)
|
||||
|
||||
return strip_excess_whitespace(text)
|
||||
|
@ -5,9 +5,12 @@ def tesseract_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": RasterisedDocumentParser,
|
||||
"weight": 0,
|
||||
"mime_types": [
|
||||
"application/pdf",
|
||||
"image/jpeg",
|
||||
"image/png"
|
||||
]
|
||||
"mime_types": {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/tiff": ".tif",
|
||||
"image/gif": ".gif",
|
||||
"image/bmp": ".bmp",
|
||||
}
|
||||
}
|
||||
|
BIN
src/paperless_tesseract/tests/samples/multi-page-digital.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/multi-page-digital.pdf
Normal file
Binary file not shown.
BIN
src/paperless_tesseract/tests/samples/multi-page-images.pdf
Normal file
BIN
src/paperless_tesseract/tests/samples/multi-page-images.pdf
Normal file
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user