Merge branch 'feature-ocrmypdf' into dev
							
								
								
									
										13
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						| @@ -76,16 +76,11 @@ scripts/nuke | |||||||
| /static/ | /static/ | ||||||
|  |  | ||||||
| # Stored PDFs | # Stored PDFs | ||||||
| /media/documents/originals/* | /media/ | ||||||
| /media/documents/thumbnails/* | /data/ | ||||||
|  |  | ||||||
| /data/classification_model.pickle |  | ||||||
| /data/db.sqlite3 |  | ||||||
| /data/index |  | ||||||
|  |  | ||||||
| /paperless.conf | /paperless.conf | ||||||
| /consume | /consume/ | ||||||
| /export | /export/ | ||||||
| /src-ui/.vscode | /src-ui/.vscode | ||||||
|  |  | ||||||
| # this is where the compiled frontend is moved to. | # this is where the compiled frontend is moved to. | ||||||
|   | |||||||
| @@ -1,5 +1,8 @@ | |||||||
| language: python | language: python | ||||||
|  |  | ||||||
|  | dist: focal | ||||||
|  | os: linux | ||||||
|  |  | ||||||
| jobs: | jobs: | ||||||
|   include: |   include: | ||||||
|     - name: "Paperless on Python 3.6" |     - name: "Paperless on Python 3.6" | ||||||
| @@ -33,7 +36,7 @@ jobs: | |||||||
|  |  | ||||||
| before_install: | before_install: | ||||||
|   - sudo apt-get update -qq |   - sudo apt-get update -qq | ||||||
|   - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript |   - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript optipng | ||||||
|  |  | ||||||
| install: | install: | ||||||
|   - pip install --upgrade pipenv |   - pip install --upgrade pipenv | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								Pipfile
									
									
									
									
									
								
							
							
						
						| @@ -26,7 +26,6 @@ langdetect = "*" | |||||||
| pdftotext = "*" | pdftotext = "*" | ||||||
| pathvalidate = "*" | pathvalidate = "*" | ||||||
| pillow = "*" | pillow = "*" | ||||||
| pyocr = "~=0.7.2" |  | ||||||
| python-gnupg = "*" | python-gnupg = "*" | ||||||
| python-dotenv = "*" | python-dotenv = "*" | ||||||
| python-dateutil = "*" | python-dateutil = "*" | ||||||
| @@ -39,6 +38,7 @@ whitenoise = "~=5.2.0" | |||||||
| watchdog = "*" | watchdog = "*" | ||||||
| whoosh="~=2.7.4" | whoosh="~=2.7.4" | ||||||
| inotifyrecursive = ">=0.3.4" | inotifyrecursive = ">=0.3.4" | ||||||
|  | ocrmypdf = "*" | ||||||
|  |  | ||||||
| [dev-packages] | [dev-packages] | ||||||
| coveralls = "*" | coveralls = "*" | ||||||
|   | |||||||
							
								
								
									
										298
									
								
								Pipfile.lock
									
									
									
										generated
									
									
									
								
							
							
						
						| @@ -1,7 +1,7 @@ | |||||||
| { | { | ||||||
|     "_meta": { |     "_meta": { | ||||||
|         "hash": { |         "hash": { | ||||||
|             "sha256": "d266e1f67e3090ec68aa8ecba1e8373351daf89ad5a5ab46524d123bcaf29f62" |             "sha256": "55c9136777e78d6cd362628cd1fc0c5ff36b437699b92089ce504d598004371d" | ||||||
|         }, |         }, | ||||||
|         "pipfile-spec": 6, |         "pipfile-spec": 6, | ||||||
|         "requires": { |         "requires": { | ||||||
| @@ -44,6 +44,94 @@ | |||||||
|             ], |             ], | ||||||
|             "version": "==1.17.12" |             "version": "==1.17.12" | ||||||
|         }, |         }, | ||||||
|  |         "cffi": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e", | ||||||
|  |                 "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d", | ||||||
|  |                 "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a", | ||||||
|  |                 "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec", | ||||||
|  |                 "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362", | ||||||
|  |                 "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668", | ||||||
|  |                 "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c", | ||||||
|  |                 "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b", | ||||||
|  |                 "sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654", | ||||||
|  |                 "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06", | ||||||
|  |                 "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698", | ||||||
|  |                 "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2", | ||||||
|  |                 "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c", | ||||||
|  |                 "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7", | ||||||
|  |                 "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009", | ||||||
|  |                 "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03", | ||||||
|  |                 "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b", | ||||||
|  |                 "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909", | ||||||
|  |                 "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53", | ||||||
|  |                 "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35", | ||||||
|  |                 "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26", | ||||||
|  |                 "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b", | ||||||
|  |                 "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb", | ||||||
|  |                 "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293", | ||||||
|  |                 "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd", | ||||||
|  |                 "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d", | ||||||
|  |                 "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3", | ||||||
|  |                 "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5", | ||||||
|  |                 "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d", | ||||||
|  |                 "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca", | ||||||
|  |                 "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d", | ||||||
|  |                 "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775", | ||||||
|  |                 "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375", | ||||||
|  |                 "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b", | ||||||
|  |                 "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b", | ||||||
|  |                 "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f" | ||||||
|  |             ], | ||||||
|  |             "version": "==1.14.4" | ||||||
|  |         }, | ||||||
|  |         "chardet": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", | ||||||
|  |                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '3.1'", | ||||||
|  |             "version": "==3.0.4" | ||||||
|  |         }, | ||||||
|  |         "coloredlogs": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a", | ||||||
|  |                 "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505", | ||||||
|  |                 "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==14.0" | ||||||
|  |         }, | ||||||
|  |         "cryptography": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538", | ||||||
|  |                 "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f", | ||||||
|  |                 "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d", | ||||||
|  |                 "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77", | ||||||
|  |                 "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b", | ||||||
|  |                 "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33", | ||||||
|  |                 "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e", | ||||||
|  |                 "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb", | ||||||
|  |                 "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e", | ||||||
|  |                 "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b", | ||||||
|  |                 "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7", | ||||||
|  |                 "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297", | ||||||
|  |                 "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d", | ||||||
|  |                 "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7", | ||||||
|  |                 "sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b", | ||||||
|  |                 "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7", | ||||||
|  |                 "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4", | ||||||
|  |                 "sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8", | ||||||
|  |                 "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b", | ||||||
|  |                 "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851", | ||||||
|  |                 "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13", | ||||||
|  |                 "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b", | ||||||
|  |                 "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3", | ||||||
|  |                 "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==3.2.1" | ||||||
|  |         }, | ||||||
|         "dateparser": { |         "dateparser": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", |                 "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", | ||||||
| @@ -123,6 +211,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==20.0.4" |             "version": "==20.0.4" | ||||||
|         }, |         }, | ||||||
|  |         "humanfriendly": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12", | ||||||
|  |                 "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==8.2" | ||||||
|  |         }, | ||||||
|         "imap-tools": { |         "imap-tools": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", |                 "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", | ||||||
| @@ -131,6 +227,13 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==0.32.0" |             "version": "==0.32.0" | ||||||
|         }, |         }, | ||||||
|  |         "img2pdf": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5", | ||||||
|  |                 "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9" | ||||||
|  |             ], | ||||||
|  |             "version": "==0.4.0" | ||||||
|  |         }, | ||||||
|         "inotify-simple": { |         "inotify-simple": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128", |                 "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128", | ||||||
| @@ -164,6 +267,51 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==1.0.8" |             "version": "==1.0.8" | ||||||
|         }, |         }, | ||||||
|  |         "lxml": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d", | ||||||
|  |                 "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37", | ||||||
|  |                 "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01", | ||||||
|  |                 "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2", | ||||||
|  |                 "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644", | ||||||
|  |                 "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75", | ||||||
|  |                 "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80", | ||||||
|  |                 "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2", | ||||||
|  |                 "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780", | ||||||
|  |                 "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98", | ||||||
|  |                 "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308", | ||||||
|  |                 "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf", | ||||||
|  |                 "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388", | ||||||
|  |                 "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d", | ||||||
|  |                 "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3", | ||||||
|  |                 "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8", | ||||||
|  |                 "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af", | ||||||
|  |                 "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2", | ||||||
|  |                 "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e", | ||||||
|  |                 "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939", | ||||||
|  |                 "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03", | ||||||
|  |                 "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d", | ||||||
|  |                 "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a", | ||||||
|  |                 "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5", | ||||||
|  |                 "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a", | ||||||
|  |                 "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711", | ||||||
|  |                 "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf", | ||||||
|  |                 "sha256:91d6dace31b07ab47eeadd3f4384ded2f77b94b30446410cb2c3e660e047f7a7", | ||||||
|  |                 "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089", | ||||||
|  |                 "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505", | ||||||
|  |                 "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b", | ||||||
|  |                 "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f", | ||||||
|  |                 "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc", | ||||||
|  |                 "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e", | ||||||
|  |                 "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931", | ||||||
|  |                 "sha256:e1dbb88a937126ab14d219a000728224702e0ec0fc7ceb7131c53606b7a76772", | ||||||
|  |                 "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc", | ||||||
|  |                 "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe", | ||||||
|  |                 "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", | ||||||
|  |             "version": "==4.6.2" | ||||||
|  |         }, | ||||||
|         "numpy": { |         "numpy": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", |                 "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", | ||||||
| @@ -205,6 +353,14 @@ | |||||||
|             "markers": "python_version >= '3.6'", |             "markers": "python_version >= '3.6'", | ||||||
|             "version": "==1.19.4" |             "version": "==1.19.4" | ||||||
|         }, |         }, | ||||||
|  |         "ocrmypdf": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3", | ||||||
|  |                 "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928" | ||||||
|  |             ], | ||||||
|  |             "index": "pypi", | ||||||
|  |             "version": "==11.3.3" | ||||||
|  |         }, | ||||||
|         "pathtools": { |         "pathtools": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", |                 "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", | ||||||
| @@ -220,6 +376,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.3.0" |             "version": "==2.3.0" | ||||||
|         }, |         }, | ||||||
|  |         "pdfminer.six": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a", | ||||||
|  |                 "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '3.4'", | ||||||
|  |             "version": "==20201018" | ||||||
|  |         }, | ||||||
|         "pdftotext": { |         "pdftotext": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" |                 "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" | ||||||
| @@ -227,6 +391,33 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.1.5" |             "version": "==2.1.5" | ||||||
|         }, |         }, | ||||||
|  |         "pikepdf": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:0829bd5dacd73bb4a37e7575bae523f49603479755563c92ddb55c206700cab1", | ||||||
|  |                 "sha256:0d2b631077cd6af6e4d1b396208020705842610a6f13fab489d5f9c47916baa2", | ||||||
|  |                 "sha256:21c98af08fae4ac9fbcad02b613b6768a4ca300fda4cba867f4a4b6f73c2d04b", | ||||||
|  |                 "sha256:2240372fed30124ddc35b0c15a613f2b687a426ea2f150091e0a0c58cca7a495", | ||||||
|  |                 "sha256:2a97f5f1403e058d217d7f6861cf51fca200c5687bce0d052f5f2fa89b5bfa22", | ||||||
|  |                 "sha256:3faaefca0ae80d19891acec8b0dd5e6235f59f2206d82375eb80d090285e9557", | ||||||
|  |                 "sha256:48ef45b64882901c0d69af3b85d16a19bd0f3e95b43e614fefb53521d8caf36c", | ||||||
|  |                 "sha256:5212fe41f2323fc7356ba67caa39737fe13080562cff37bcbb74a8094076c8d0", | ||||||
|  |                 "sha256:56859c32170663c57bd0658189ce44e180533eebe813853446cd6413810be9eb", | ||||||
|  |                 "sha256:5f8fd1cb3478c5534222018aca24fbbd2bc74460c899bda988ec76722c13caa9", | ||||||
|  |                 "sha256:74300a32c41b3d578772f6933f23a88b19f74484185e71e5225ce2f7ea5aea78", | ||||||
|  |                 "sha256:8cbc946bdd217148f4a9c029fcea62f4ae0f67d5346de4c865f4718cd0ddc37f", | ||||||
|  |                 "sha256:9ceefd30076f732530cf84a1be2ecb2fa9931af932706ded760a6d37c73b96ad", | ||||||
|  |                 "sha256:ad69c170fda41b07a4c6b668a3128e7a759f50d9aebcfcde0ccff1358abe0423", | ||||||
|  |                 "sha256:b715fe182189fb6870fab5b0383bb2fb278c88c46eade346b0f4c1ed8818c09d", | ||||||
|  |                 "sha256:bb01ecf95083ffcb9ad542dc5342ccc1059e46f1395fd966629d36d9cc766b4a", | ||||||
|  |                 "sha256:bd6328547219cf48cefb4e0a1bc54442910594de1c5a5feae847d9ff3c629031", | ||||||
|  |                 "sha256:edb128379bb1dea76b5bdbdacf5657a6e4754bacc2049640762725590d8ed905", | ||||||
|  |                 "sha256:f8e687900557fcd4c51b4e72b9e337fdae9e2c81049d1d80b624bb2e88b5769d", | ||||||
|  |                 "sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52", | ||||||
|  |                 "sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version < '3.9'", | ||||||
|  |             "version": "==2.2.0" | ||||||
|  |         }, | ||||||
|         "pillow": { |         "pillow": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", |                 "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", | ||||||
| @@ -262,6 +453,14 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==8.0.1" |             "version": "==8.0.1" | ||||||
|         }, |         }, | ||||||
|  |         "pluggy": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", | ||||||
|  |                 "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|  |             "version": "==0.13.1" | ||||||
|  |         }, | ||||||
|         "psycopg2-binary": { |         "psycopg2-binary": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", |                 "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", | ||||||
| @@ -305,13 +504,13 @@ | |||||||
|             "index": "pypi", |             "index": "pypi", | ||||||
|             "version": "==2.8.6" |             "version": "==2.8.6" | ||||||
|         }, |         }, | ||||||
|         "pyocr": { |         "pycparser": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179", |                 "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", | ||||||
|                 "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301" |                 "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" | ||||||
|             ], |             ], | ||||||
|             "index": "pypi", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==0.7.2" |             "version": "==2.20" | ||||||
|         }, |         }, | ||||||
|         "python-dateutil": { |         "python-dateutil": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
| @@ -419,6 +618,53 @@ | |||||||
|             ], |             ], | ||||||
|             "version": "==2020.11.13" |             "version": "==2020.11.13" | ||||||
|         }, |         }, | ||||||
|  |         "reportlab": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540", | ||||||
|  |                 "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c", | ||||||
|  |                 "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030", | ||||||
|  |                 "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f", | ||||||
|  |                 "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5", | ||||||
|  |                 "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e", | ||||||
|  |                 "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89", | ||||||
|  |                 "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2", | ||||||
|  |                 "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f", | ||||||
|  |                 "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969", | ||||||
|  |                 "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53", | ||||||
|  |                 "sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d", | ||||||
|  |                 "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d", | ||||||
|  |                 "sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457", | ||||||
|  |                 "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec", | ||||||
|  |                 "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f", | ||||||
|  |                 "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778", | ||||||
|  |                 "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92", | ||||||
|  |                 "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6", | ||||||
|  |                 "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077", | ||||||
|  |                 "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673", | ||||||
|  |                 "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b", | ||||||
|  |                 "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48", | ||||||
|  |                 "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95", | ||||||
|  |                 "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94", | ||||||
|  |                 "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589", | ||||||
|  |                 "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9", | ||||||
|  |                 "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8", | ||||||
|  |                 "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d", | ||||||
|  |                 "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32", | ||||||
|  |                 "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b", | ||||||
|  |                 "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a", | ||||||
|  |                 "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2", | ||||||
|  |                 "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967", | ||||||
|  |                 "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5", | ||||||
|  |                 "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437", | ||||||
|  |                 "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8", | ||||||
|  |                 "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab", | ||||||
|  |                 "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e", | ||||||
|  |                 "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a", | ||||||
|  |                 "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8", | ||||||
|  |                 "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08" | ||||||
|  |             ], | ||||||
|  |             "version": "==3.5.55" | ||||||
|  |         }, | ||||||
|         "scikit-learn": { |         "scikit-learn": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", |                 "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", | ||||||
| @@ -482,6 +728,13 @@ | |||||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==1.15.0" |             "version": "==1.15.0" | ||||||
|         }, |         }, | ||||||
|  |         "sortedcontainers": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f", | ||||||
|  |                 "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1" | ||||||
|  |             ], | ||||||
|  |             "version": "==2.3.0" | ||||||
|  |         }, | ||||||
|         "sqlparse": { |         "sqlparse": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", |                 "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", | ||||||
| @@ -498,6 +751,14 @@ | |||||||
|             "markers": "python_version >= '3.5'", |             "markers": "python_version >= '3.5'", | ||||||
|             "version": "==2.1.0" |             "version": "==2.1.0" | ||||||
|         }, |         }, | ||||||
|  |         "tqdm": { | ||||||
|  |             "hashes": [ | ||||||
|  |                 "sha256:5c0d04e06ccc0da1bd3fa5ae4550effcce42fcad947b4a6cafa77bdc9b09ff22", | ||||||
|  |                 "sha256:9e7b8ab0ecbdbf0595adadd5f0ebbb9e69010e0bd48bbb0c15e550bf2a5292df" | ||||||
|  |             ], | ||||||
|  |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|  |             "version": "==4.54.0" | ||||||
|  |         }, | ||||||
|         "tzlocal": { |         "tzlocal": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", |                 "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", | ||||||
| @@ -589,6 +850,7 @@ | |||||||
|                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", |                 "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", | ||||||
|                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" |                 "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" | ||||||
|             ], |             ], | ||||||
|  |             "markers": "python_version >= '3.1'", | ||||||
|             "version": "==3.0.4" |             "version": "==3.0.4" | ||||||
|         }, |         }, | ||||||
|         "coverage": { |         "coverage": { | ||||||
| @@ -711,22 +973,6 @@ | |||||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==1.2.0" |             "version": "==1.2.0" | ||||||
|         }, |         }, | ||||||
|         "importlib-metadata": { |  | ||||||
|             "hashes": [ |  | ||||||
|                 "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132", |  | ||||||
|                 "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c" |  | ||||||
|             ], |  | ||||||
|             "markers": "python_version < '3.8'", |  | ||||||
|             "version": "==2.1.0" |  | ||||||
|         }, |  | ||||||
|         "importlib-resources": { |  | ||||||
|             "hashes": [ |  | ||||||
|                 "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592", |  | ||||||
|                 "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5" |  | ||||||
|             ], |  | ||||||
|             "markers": "python_version < '3.7'", |  | ||||||
|             "version": "==3.3.0" |  | ||||||
|         }, |  | ||||||
|         "iniconfig": { |         "iniconfig": { | ||||||
|             "hashes": [ |             "hashes": [ | ||||||
|                 "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", |                 "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3", | ||||||
| @@ -1038,14 +1284,6 @@ | |||||||
|             ], |             ], | ||||||
|             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", |             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", | ||||||
|             "version": "==20.2.1" |             "version": "==20.2.1" | ||||||
|         }, |  | ||||||
|         "zipp": { |  | ||||||
|             "hashes": [ |  | ||||||
|                 "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108", |  | ||||||
|                 "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb" |  | ||||||
|             ], |  | ||||||
|             "markers": "python_version < '3.8'", |  | ||||||
|             "version": "==3.4.0" |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -152,6 +152,117 @@ PAPERLESS_AUTO_LOGIN_USERNAME=<username> | |||||||
|  |  | ||||||
|     Defaults to none, which disables this feature. |     Defaults to none, which disables this feature. | ||||||
|  |  | ||||||
|  | OCR settings | ||||||
|  | ############ | ||||||
|  |  | ||||||
|  | Paperless uses `OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/>`_ for | ||||||
|  | performing OCR on documents and images. Paperless uses sensible defaults for | ||||||
|  | most settings, but all of them can be configured to your needs. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_LANGUAGE=<lang> | ||||||
|  |     Customize the language that paperless will attempt to use when | ||||||
|  |     parsing documents. | ||||||
|  |  | ||||||
|  |     It should be a 3-letter language code consistent with ISO | ||||||
|  |     639: https://www.loc.gov/standards/iso639-2/php/code_list.php | ||||||
|  |  | ||||||
|  |     Set this to the language most of your documents are written in. | ||||||
|  |  | ||||||
|  |     This can be a combination of multiple languages such as ``deu+eng``, | ||||||
|  |     in which case tesseract will use whatever language matches best. | ||||||
|  |     Keep in mind that tesseract uses much more cpu time with multiple | ||||||
|  |     languages enabled. | ||||||
|  |  | ||||||
|  |     Defaults to "eng". | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_MODE=<mode> | ||||||
|  |     Tell paperless when and how to perform OCR on your documents. Four modes | ||||||
|  |     are available: | ||||||
|  |  | ||||||
|  |     *   ``skip``: Paperless skips all pages and will perform OCR only on pages | ||||||
|  |         where no text is present. This is the safest and fastest option. | ||||||
|  |     *   ``skip_noarchive``: In addition to skip, paperless won't create an | ||||||
|  |         archived version of your documents when it finds any text in them. | ||||||
|  |     *   ``redo``: Paperless will OCR all pages of your documents and attempt to | ||||||
|  |         replace any existing text layers with new text. This will be useful for | ||||||
|  |         documents from scanners that already performed OCR with insufficient | ||||||
|  |         results. It will also perform OCR on purely digital documents. | ||||||
|  |  | ||||||
|  |         This option may fail on some documents that have features that cannot | ||||||
|  |         be removed, such as forms. In this case, the text from the document is | ||||||
|  |         used instead. | ||||||
|  |     *   ``force``: Paperless rasterizes your documents, converting any text | ||||||
|  |         into images and puts the OCRed text on top. This works for all documents, | ||||||
|  |         however, the resulting document may be significantly larger and text | ||||||
|  |         won't appear as sharp when zoomed in. | ||||||
|  |      | ||||||
|  |     The default is ``skip``, which only performs OCR when necessary. | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_OUTPUT_TYPE=<type> | ||||||
|  |     Specify the type of PDF documents that paperless should produce. | ||||||
|  |      | ||||||
|  |     *   ``pdf``: Modify the PDF document as little as possible. | ||||||
|  |     *   ``pdfa``: Convert PDF documents into PDF/A-2b documents, which is a | ||||||
|  |         subset of the entire PDF specification and meant for storing | ||||||
|  |         documents long term. | ||||||
|  |     *   ``pdfa-1``, ``pdfa-2``, ``pdfa-3`` to specify the exact version of | ||||||
|  |         PDF/A you wish to use. | ||||||
|  |      | ||||||
|  |     If not specified, ``pdfa`` is used. Remember that paperless also keeps | ||||||
|  |     the original input file as well as the archived version. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_PAGES=<num> | ||||||
|  |     Tells paperless to use only the specified number of pages for OCR. Documents | ||||||
|  |     with fewer than the specified number of pages get OCR'ed completely. | ||||||
|  |  | ||||||
|  |     Specifying 1 here will only use the first page. | ||||||
|  |  | ||||||
|  |     When combined with ``PAPERLESS_OCR_MODE=redo`` or ``PAPERLESS_OCR_MODE=force``, | ||||||
|  |     paperless will not modify any text it finds on excluded pages and will | ||||||
|  |     copy it verbatim. | ||||||
|  |  | ||||||
|  |     Defaults to 0, which disables this feature and always uses all pages. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_IMAGE_DPI=<num> | ||||||
|  |     Paperless will OCR any images you put into the system and convert them | ||||||
|  |     into PDF documents. This is useful if your scanner produces images. | ||||||
|  |     In order to do so, paperless needs to know the DPI of the image. | ||||||
|  |     Most images from scanners will have this information embedded and | ||||||
|  |     paperless will detect and use that information. In case this fails, it | ||||||
|  |     uses this value as a fallback. | ||||||
|  |  | ||||||
|  |     Set this to the DPI your scanner produces images at. | ||||||
|  |  | ||||||
|  |     Default is none, which causes paperless to fail if no DPI information is | ||||||
|  |     present in an image. | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PAPERLESS_OCR_USER_ARG=<json> | ||||||
|  |     OCRmyPDF offers many more options. Use this parameter to specify any | ||||||
|  |     additional arguments you wish to pass to OCRmyPDF. Since Paperless uses | ||||||
|  |     the API of OCRmyPDF, you have to specify these in a format that can be | ||||||
|  |     passed to the API. See https://ocrmypdf.readthedocs.io/en/latest/api.html#reference | ||||||
|  |     for valid parameters. All command line options are supported, but they | ||||||
|  |     use underscores instead of dashes. | ||||||
|  |  | ||||||
|  |     .. caution:: | ||||||
|  |  | ||||||
|  |         Paperless has been tested to work with the OCR options provided | ||||||
|  |         above. There are many options that are incompatible with each other, | ||||||
|  |         so specifying invalid options may prevent paperless from consuming | ||||||
|  |         any documents. | ||||||
|  |  | ||||||
|  |     Specify arguments as a JSON dictionary. Keep note of lower case booleans | ||||||
|  |     and double quoted parameter names and strings. Examples: | ||||||
|  |  | ||||||
|  |     .. code:: json | ||||||
|  |  | ||||||
|  |         {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}     | ||||||
|  |      | ||||||
|  |      | ||||||
| Software tweaks | Software tweaks | ||||||
| ############### | ############### | ||||||
|  |  | ||||||
| @@ -193,37 +304,6 @@ PAPERLESS_TIME_ZONE=<timezone> | |||||||
|     Defaults to UTC. |     Defaults to UTC. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| PAPERLESS_OCR_PAGES=<num> |  | ||||||
|     Tells paperless to use only the specified amount of pages for OCR. Documents |  | ||||||
|     with less than the specified amount of pages get OCR'ed completely. |  | ||||||
|  |  | ||||||
|     Specifying 1 here will only use the first page. |  | ||||||
|  |  | ||||||
|     Defaults to 0, which disables this feature and always uses all pages. |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| PAPERLESS_OCR_LANGUAGE=<lang> |  | ||||||
|     Customize the default language that tesseract will attempt to use when |  | ||||||
|     parsing documents. The default language is used whenever |  | ||||||
|  |  | ||||||
|     * No language could be detected on a document |  | ||||||
|     * No tesseract data files are available for the detected language |  | ||||||
|  |  | ||||||
|     It should be a 3-letter language code consistent with ISO |  | ||||||
|     639: https://www.loc.gov/standards/iso639-2/php/code_list.php |  | ||||||
|  |  | ||||||
|     Set this to the language most of your documents are written in. |  | ||||||
|  |  | ||||||
|     Defaults to "eng". |  | ||||||
|  |  | ||||||
| PAPERLESS_OCR_ALWAYS=<bool> |  | ||||||
|     By default Paperless does not OCR a document if the text can be retrieved from |  | ||||||
|     the document directly. Set to true to always OCR documents. |  | ||||||
|  |  | ||||||
|     Defaults to false. |  | ||||||
|  |  | ||||||
| PAPERLESS_CONSUMER_POLLING=<num> | PAPERLESS_CONSUMER_POLLING=<num> | ||||||
|     If paperless won't find documents added to your consume folder, it might |     If paperless won't find documents added to your consume folder, it might | ||||||
|     not be able to automatically detect filesystem changes. In that case, |     not be able to automatically detect filesystem changes. In that case, | ||||||
| @@ -261,18 +341,6 @@ PAPERLESS_CONVERT_TMPDIR=<path> | |||||||
|  |  | ||||||
|     Default is none, which disables the temporary directory. |     Default is none, which disables the temporary directory. | ||||||
|  |  | ||||||
| PAPERLESS_CONVERT_DENSITY=<num> |  | ||||||
|     This setting has a high impact on the physical size of tmp page files, |  | ||||||
|     the speed of document conversion, and can affect the accuracy of OCR |  | ||||||
|     results. Individual results can vary and this setting should be tested |  | ||||||
|     thoroughly against the documents you are importing to see if it has any |  | ||||||
|     impacts either negative or positive. |  | ||||||
|     Testing on limited document sets has shown a setting of 200 can cut the |  | ||||||
|     size of tmp files by 1/3, and speed up conversion by up to 4x |  | ||||||
|     with little impact to OCR accuracy. |  | ||||||
|  |  | ||||||
|     Default is 300. |  | ||||||
|  |  | ||||||
| PAPERLESS_OPTIMIZE_THUMBNAILS=<bool> | PAPERLESS_OPTIMIZE_THUMBNAILS=<bool> | ||||||
|     Use optipng to optimize thumbnails. This usually reduces the size of |     Use optipng to optimize thumbnails. This usually reduces the size of | ||||||
|     thumbnails by about 20%, but uses considerable compute time during |     thumbnails by about 20%, but uses considerable compute time during | ||||||
| @@ -319,8 +387,5 @@ PAPERLESS_CONVERT_BINARY=<path> | |||||||
| PAPERLESS_GS_BINARY=<path> | PAPERLESS_GS_BINARY=<path> | ||||||
|     Defaults to "/usr/bin/gs". |     Defaults to "/usr/bin/gs". | ||||||
|  |  | ||||||
| PAPERLESS_UNPAPER_BINARY=<path> |  | ||||||
|     Defaults to "/usr/bin/unpaper". |  | ||||||
|  |  | ||||||
| PAPERLESS_OPTIPNG_BINARY=<path> | PAPERLESS_OPTIPNG_BINARY=<path> | ||||||
|     Defaults to "/usr/bin/optipng". |     Defaults to "/usr/bin/optipng". | ||||||
|   | |||||||
| @@ -31,19 +31,24 @@ | |||||||
| #PAPERLESS_STATIC_URL=/static/ | #PAPERLESS_STATIC_URL=/static/ | ||||||
| #PAPERLESS_AUTO_LOGIN_USERNAME= | #PAPERLESS_AUTO_LOGIN_USERNAME= | ||||||
|  |  | ||||||
|  | # OCR settings | ||||||
|  |  | ||||||
|  | #PAPERLESS_OCR_LANGUAGE=eng | ||||||
|  | #PAPERLESS_OCR_MODE=skip | ||||||
|  | #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | ||||||
|  | #PAPERLESS_OCR_PAGES=1 | ||||||
|  | #PAPERLESS_OCR_IMAGE_DPI=300 | ||||||
|  | #PAPERLESS_OCR_USER_ARG={} | ||||||
|  | #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||||
|  | #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | ||||||
|  |  | ||||||
| # Software tweaks | # Software tweaks | ||||||
|  |  | ||||||
| #PAPERLESS_TASK_WORKERS=1 | #PAPERLESS_TASK_WORKERS=1 | ||||||
| #PAPERLESS_THREADS_PER_WORKER=1 | #PAPERLESS_THREADS_PER_WORKER=1 | ||||||
| #PAPERLESS_TIME_ZONE=UTC | #PAPERLESS_TIME_ZONE=UTC | ||||||
| #PAPERLESS_OCR_PAGES=1 |  | ||||||
| #PAPERLESS_OCR_LANGUAGE=eng |  | ||||||
| #PAPERLESS_OCR_ALWAYS=false |  | ||||||
| #PAPERLESS_CONSUMER_POLLING=10 | #PAPERLESS_CONSUMER_POLLING=10 | ||||||
| #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false | ||||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 |  | ||||||
| #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless |  | ||||||
| #PAPERLESS_CONVERT_DENSITY=300 |  | ||||||
| #PAPERLESS_OPTIMIZE_THUMBNAILS=true | #PAPERLESS_OPTIMIZE_THUMBNAILS=true | ||||||
| #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh | ||||||
| #PAPERLESS_FILENAME_DATE_ORDER=YMD | #PAPERLESS_FILENAME_DATE_ORDER=YMD | ||||||
| @@ -53,5 +58,4 @@ | |||||||
|  |  | ||||||
| #PAPERLESS_CONVERT_BINARY=/usr/bin/convert | #PAPERLESS_CONVERT_BINARY=/usr/bin/convert | ||||||
| #PAPERLESS_GS_BINARY=/usr/bin/gs | #PAPERLESS_GS_BINARY=/usr/bin/gs | ||||||
| #PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper |  | ||||||
| #PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng | #PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng | ||||||
|   | |||||||
| @@ -5,12 +5,26 @@ | |||||||
|         </svg> |         </svg> | ||||||
|         <span class="d-none d-lg-inline"> Delete</span> |         <span class="d-none d-lg-inline"> Delete</span> | ||||||
|     </button> |     </button> | ||||||
|     <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary mr-2"> |  | ||||||
|  |     <div class="btn-group mr-2"> | ||||||
|  |  | ||||||
|  |         <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary"> | ||||||
|             <svg class="buttonicon" fill="currentColor"> |             <svg class="buttonicon" fill="currentColor"> | ||||||
|                 <use xlink:href="assets/bootstrap-icons.svg#download" /> |                 <use xlink:href="assets/bootstrap-icons.svg#download" /> | ||||||
|             </svg> |             </svg> | ||||||
|             <span class="d-none d-lg-inline"> Download</span> |             <span class="d-none d-lg-inline"> Download</span> | ||||||
|         </a> |         </a> | ||||||
|  |      | ||||||
|  |         <div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version"> | ||||||
|  |           <button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button> | ||||||
|  |           <div class="dropdown-menu" ngbDropdownMenu> | ||||||
|  |             <a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a> | ||||||
|  |           </div> | ||||||
|  |         </div> | ||||||
|  |      | ||||||
|  |       </div> | ||||||
|  |  | ||||||
|  |  | ||||||
|     <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()"> |     <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()"> | ||||||
|         <svg class="buttonicon" fill="currentColor"> |         <svg class="buttonicon" fill="currentColor"> | ||||||
|             <use xlink:href="assets/bootstrap-icons.svg#x" /> |             <use xlink:href="assets/bootstrap-icons.svg#x" /> | ||||||
|   | |||||||
| @@ -4,6 +4,7 @@ import { ActivatedRoute, Router } from '@angular/router'; | |||||||
| import { NgbModal } from '@ng-bootstrap/ng-bootstrap'; | import { NgbModal } from '@ng-bootstrap/ng-bootstrap'; | ||||||
| import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent'; | import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent'; | ||||||
| import { PaperlessDocument } from 'src/app/data/paperless-document'; | import { PaperlessDocument } from 'src/app/data/paperless-document'; | ||||||
|  | import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata'; | ||||||
| import { PaperlessDocumentType } from 'src/app/data/paperless-document-type'; | import { PaperlessDocumentType } from 'src/app/data/paperless-document-type'; | ||||||
| import { DocumentListViewService } from 'src/app/services/document-list-view.service'; | import { DocumentListViewService } from 'src/app/services/document-list-view.service'; | ||||||
| import { OpenDocumentsService } from 'src/app/services/open-documents.service'; | import { OpenDocumentsService } from 'src/app/services/open-documents.service'; | ||||||
| @@ -23,9 +24,11 @@ export class DocumentDetailComponent implements OnInit { | |||||||
|  |  | ||||||
|   documentId: number |   documentId: number | ||||||
|   document: PaperlessDocument |   document: PaperlessDocument | ||||||
|  |   metadata: PaperlessDocumentMetadata | ||||||
|   title: string |   title: string | ||||||
|   previewUrl: string |   previewUrl: string | ||||||
|   downloadUrl: string |   downloadUrl: string | ||||||
|  |   downloadOriginalUrl: string | ||||||
|  |  | ||||||
|   correspondents: PaperlessCorrespondent[] |   correspondents: PaperlessCorrespondent[] | ||||||
|   documentTypes: PaperlessDocumentType[] |   documentTypes: PaperlessDocumentType[] | ||||||
| @@ -62,6 +65,7 @@ export class DocumentDetailComponent implements OnInit { | |||||||
|       this.documentId = +paramMap.get('id') |       this.documentId = +paramMap.get('id') | ||||||
|       this.previewUrl = this.documentsService.getPreviewUrl(this.documentId) |       this.previewUrl = this.documentsService.getPreviewUrl(this.documentId) | ||||||
|       this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId) |       this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId) | ||||||
|  |       this.downloadOriginalUrl = this.documentsService.getDownloadUrl(this.documentId, true) | ||||||
|       if (this.openDocumentService.getOpenDocument(this.documentId)) { |       if (this.openDocumentService.getOpenDocument(this.documentId)) { | ||||||
|         this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId)) |         this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId)) | ||||||
|       } else { |       } else { | ||||||
| @@ -76,6 +80,9 @@ export class DocumentDetailComponent implements OnInit { | |||||||
|  |  | ||||||
|   updateComponent(doc: PaperlessDocument) { |   updateComponent(doc: PaperlessDocument) { | ||||||
|     this.document = doc |     this.document = doc | ||||||
|  |     this.documentsService.getMetadata(doc.id).subscribe(result => { | ||||||
|  |       this.metadata = result | ||||||
|  |     }) | ||||||
|     this.title = doc.title |     this.title = doc.title | ||||||
|     this.documentForm.patchValue(doc) |     this.documentForm.patchValue(doc) | ||||||
|   } |   } | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								src-ui/src/app/data/paperless-document-metadata.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,11 @@ | |||||||
|  | export interface PaperlessDocumentMetadata { | ||||||
|  |      | ||||||
|  |   paperless__checksum?: string | ||||||
|  |  | ||||||
|  |   paperless__mime_type?: string | ||||||
|  |  | ||||||
|  |   paperless__filename?: string | ||||||
|  |  | ||||||
|  |   paperless__has_archive_version?: boolean | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -1,5 +1,6 @@ | |||||||
| import { Injectable } from '@angular/core'; | import { Injectable } from '@angular/core'; | ||||||
| import { PaperlessDocument } from 'src/app/data/paperless-document'; | import { PaperlessDocument } from 'src/app/data/paperless-document'; | ||||||
|  | import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata'; | ||||||
| import { AbstractPaperlessService } from './abstract-paperless-service'; | import { AbstractPaperlessService } from './abstract-paperless-service'; | ||||||
| import { HttpClient } from '@angular/common/http'; | import { HttpClient } from '@angular/common/http'; | ||||||
| import { Observable } from 'rxjs'; | import { Observable } from 'rxjs'; | ||||||
| @@ -50,20 +51,32 @@ export class DocumentService extends AbstractPaperlessService<PaperlessDocument> | |||||||
|     return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)) |     return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   getPreviewUrl(id: number): string { |   getPreviewUrl(id: number, original: boolean = false): string { | ||||||
|     return this.getResourceUrl(id, 'preview') |     let url = this.getResourceUrl(id, 'preview') | ||||||
|  |     if (original) { | ||||||
|  |       url += "?original=true" | ||||||
|  |     } | ||||||
|  |     return url | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   getThumbUrl(id: number): string { |   getThumbUrl(id: number): string { | ||||||
|     return this.getResourceUrl(id, 'thumb') |     return this.getResourceUrl(id, 'thumb') | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   getDownloadUrl(id: number): string { |   getDownloadUrl(id: number, original: boolean = false): string { | ||||||
|     return this.getResourceUrl(id, 'download') |     let url = this.getResourceUrl(id, 'download') | ||||||
|  |     if (original) { | ||||||
|  |       url += "?original=true" | ||||||
|  |     } | ||||||
|  |     return url | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   uploadDocument(formData) { |   uploadDocument(formData) { | ||||||
|     return this.http.post(this.getResourceUrl(null, 'post_document'), formData) |     return this.http.post(this.getResourceUrl(null, 'post_document'), formData) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   getMetadata(id: number): Observable<PaperlessDocumentMetadata> { | ||||||
|  |     return this.http.get<PaperlessDocumentMetadata>(this.getResourceUrl(id, 'metadata')) | ||||||
|  |   } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ import os | |||||||
| import magic | import magic | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.db import transaction | from django.db import transaction | ||||||
|  | from django.db.models import Q | ||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
|  |  | ||||||
| from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | from .classifier import DocumentClassifier, IncompatibleClassifierVersionError | ||||||
| @@ -13,7 +14,7 @@ from .file_handling import create_source_path_directory | |||||||
| from .loggers import LoggingMixin | from .loggers import LoggingMixin | ||||||
| from .models import Document, FileInfo, Correspondent, DocumentType, Tag | from .models import Document, FileInfo, Correspondent, DocumentType, Tag | ||||||
| from .parsers import ParseError, get_parser_class_for_mime_type, \ | from .parsers import ParseError, get_parser_class_for_mime_type, \ | ||||||
|     get_supported_file_extensions |     get_supported_file_extensions, parse_date | ||||||
| from .signals import ( | from .signals import ( | ||||||
|     document_consumption_finished, |     document_consumption_finished, | ||||||
|     document_consumption_started |     document_consumption_started | ||||||
| @@ -58,7 +59,7 @@ class Consumer(LoggingMixin): | |||||||
|     def pre_check_duplicate(self): |     def pre_check_duplicate(self): | ||||||
|         with open(self.path, "rb") as f: |         with open(self.path, "rb") as f: | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |             checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|         if Document.objects.filter(checksum=checksum).exists(): |         if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501 | ||||||
|             if settings.CONSUMER_DELETE_DUPLICATES: |             if settings.CONSUMER_DELETE_DUPLICATES: | ||||||
|                 os.unlink(self.path) |                 os.unlink(self.path) | ||||||
|             raise ConsumerError( |             raise ConsumerError( | ||||||
| @@ -69,6 +70,7 @@ class Consumer(LoggingMixin): | |||||||
|         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) |         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||||
|         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) |         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||||
|         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) |         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||||
|  |         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||||
|  |  | ||||||
|     def try_consume_file(self, |     def try_consume_file(self, | ||||||
|                          path, |                          path, | ||||||
| @@ -124,7 +126,7 @@ class Consumer(LoggingMixin): | |||||||
|  |  | ||||||
|         # This doesn't parse the document yet, but gives us a parser. |         # This doesn't parse the document yet, but gives us a parser. | ||||||
|  |  | ||||||
|         document_parser = parser_class(self.path, self.logging_group) |         document_parser = parser_class(self.logging_group) | ||||||
|  |  | ||||||
|         # However, this already created working directories which we have to |         # However, this already created working directories which we have to | ||||||
|         # clean up. |         # clean up. | ||||||
| @@ -132,13 +134,24 @@ class Consumer(LoggingMixin): | |||||||
|         # Parse the document. This may take some time. |         # Parse the document. This may take some time. | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             self.log("debug", f"Generating thumbnail for {self.filename}...") |  | ||||||
|             thumbnail = document_parser.get_optimised_thumbnail() |  | ||||||
|             self.log("debug", "Parsing {}...".format(self.filename)) |             self.log("debug", "Parsing {}...".format(self.filename)) | ||||||
|  |             document_parser.parse(self.path, mime_type) | ||||||
|  |  | ||||||
|  |             self.log("debug", f"Generating thumbnail for {self.filename}...") | ||||||
|  |             thumbnail = document_parser.get_optimised_thumbnail( | ||||||
|  |                 self.path, mime_type) | ||||||
|  |  | ||||||
|             text = document_parser.get_text() |             text = document_parser.get_text() | ||||||
|             date = document_parser.get_date() |             date = document_parser.get_date() | ||||||
|  |             if not date: | ||||||
|  |                 date = parse_date(self.filename, text) | ||||||
|  |             archive_path = document_parser.get_archive_path() | ||||||
|  |  | ||||||
|         except ParseError as e: |         except ParseError as e: | ||||||
|             document_parser.cleanup() |             document_parser.cleanup() | ||||||
|  |             self.log( | ||||||
|  |                 "error", | ||||||
|  |                 f"Error while consuming document {self.filename}: {e}") | ||||||
|             raise ConsumerError(e) |             raise ConsumerError(e) | ||||||
|  |  | ||||||
|         # Prepare the document classifier. |         # Prepare the document classifier. | ||||||
| @@ -180,9 +193,24 @@ class Consumer(LoggingMixin): | |||||||
|                 # After everything is in the database, copy the files into |                 # After everything is in the database, copy the files into | ||||||
|                 # place. If this fails, we'll also rollback the transaction. |                 # place. If this fails, we'll also rollback the transaction. | ||||||
|  |  | ||||||
|  |                 # TODO: not required, since this is done by the file handling | ||||||
|  |                 #  logic | ||||||
|                 create_source_path_directory(document.source_path) |                 create_source_path_directory(document.source_path) | ||||||
|                 self._write(document, self.path, document.source_path) |  | ||||||
|                 self._write(document, thumbnail, document.thumbnail_path) |                 self._write(document.storage_type, | ||||||
|  |                             self.path, document.source_path) | ||||||
|  |  | ||||||
|  |                 self._write(document.storage_type, | ||||||
|  |                             thumbnail, document.thumbnail_path) | ||||||
|  |  | ||||||
|  |                 if archive_path and os.path.isfile(archive_path): | ||||||
|  |                     self._write(document.storage_type, | ||||||
|  |                                 archive_path, document.archive_path) | ||||||
|  |  | ||||||
|  |                     with open(archive_path, 'rb') as f: | ||||||
|  |                         document.archive_checksum = hashlib.md5( | ||||||
|  |                             f.read()).hexdigest() | ||||||
|  |                         document.save() | ||||||
|  |  | ||||||
|                 # Afte performing all database operations and moving files |                 # Afte performing all database operations and moving files | ||||||
|                 # into place, tell paperless where the file is. |                 # into place, tell paperless where the file is. | ||||||
| @@ -195,6 +223,11 @@ class Consumer(LoggingMixin): | |||||||
|                 self.log("debug", "Deleting file {}".format(self.path)) |                 self.log("debug", "Deleting file {}".format(self.path)) | ||||||
|                 os.unlink(self.path) |                 os.unlink(self.path) | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|  |             self.log( | ||||||
|  |                 "error", | ||||||
|  |                 f"The following error occured while consuming " | ||||||
|  |                 f"{self.filename}: {e}" | ||||||
|  |             ) | ||||||
|             raise ConsumerError(e) |             raise ConsumerError(e) | ||||||
|         finally: |         finally: | ||||||
|             document_parser.cleanup() |             document_parser.cleanup() | ||||||
| @@ -259,7 +292,7 @@ class Consumer(LoggingMixin): | |||||||
|             for tag_id in self.override_tag_ids: |             for tag_id in self.override_tag_ids: | ||||||
|                 document.tags.add(Tag.objects.get(pk=tag_id)) |                 document.tags.add(Tag.objects.get(pk=tag_id)) | ||||||
|  |  | ||||||
|     def _write(self, document, source, target): |     def _write(self, storage_type, source, target): | ||||||
|         with open(source, "rb") as read_file: |         with open(source, "rb") as read_file: | ||||||
|             with open(target, "wb") as write_file: |             with open(target, "wb") as write_file: | ||||||
|                 write_file.write(read_file.read()) |                 write_file.write(read_file.read()) | ||||||
|   | |||||||
| @@ -10,10 +10,13 @@ def create_source_path_directory(source_path): | |||||||
|     os.makedirs(os.path.dirname(source_path), exist_ok=True) |     os.makedirs(os.path.dirname(source_path), exist_ok=True) | ||||||
|  |  | ||||||
|  |  | ||||||
| def delete_empty_directories(directory): | def delete_empty_directories(directory, root): | ||||||
|  |     if not os.path.isdir(directory): | ||||||
|  |         return | ||||||
|  |  | ||||||
|     # Go up in the directory hierarchy and try to delete all directories |     # Go up in the directory hierarchy and try to delete all directories | ||||||
|     directory = os.path.normpath(directory) |     directory = os.path.normpath(directory) | ||||||
|     root = os.path.normpath(settings.ORIGINALS_DIR) |     root = os.path.normpath(root) | ||||||
|  |  | ||||||
|     if not directory.startswith(root + os.path.sep): |     if not directory.startswith(root + os.path.sep): | ||||||
|         # don't do anything outside our originals folder. |         # don't do anything outside our originals folder. | ||||||
| @@ -101,3 +104,8 @@ def generate_filename(doc): | |||||||
|         filename += ".gpg" |         filename += ".gpg" | ||||||
|  |  | ||||||
|     return filename |     return filename | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def archive_name_from_filename(filename): | ||||||
|  |  | ||||||
|  |     return os.path.splitext(filename)[0] + ".pdf" | ||||||
|   | |||||||
							
								
								
									
										89
									
								
								src/documents/management/commands/document_archiver.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,89 @@ | |||||||
|  | import hashlib | ||||||
|  | import multiprocessing | ||||||
|  |  | ||||||
|  | import logging | ||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  | import uuid | ||||||
|  |  | ||||||
|  | from django.conf import settings | ||||||
|  | from django.core.management.base import BaseCommand | ||||||
|  | from whoosh.writing import AsyncWriter | ||||||
|  |  | ||||||
|  | from documents.models import Document | ||||||
|  | from ... import index | ||||||
|  | from ...mixins import Renderable | ||||||
|  | from ...parsers import get_parser_class_for_mime_type | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def handle_document(document): | ||||||
|  |     mime_type = document.mime_type | ||||||
|  |  | ||||||
|  |     parser_class = get_parser_class_for_mime_type(mime_type) | ||||||
|  |  | ||||||
|  |     parser = parser_class(logging_group=uuid.uuid4()) | ||||||
|  |     parser.parse(document.source_path, mime_type) | ||||||
|  |     if parser.get_archive_path(): | ||||||
|  |         shutil.copy(parser.get_archive_path(), document.archive_path) | ||||||
|  |         with document.archive_file as f: | ||||||
|  |             document.archive_checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|  |     else: | ||||||
|  |         logging.getLogger(__name__).warning( | ||||||
|  |             f"Parser {parser} did not produce an archived document " | ||||||
|  |             f"for {document.file_name}" | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     if parser.get_text(): | ||||||
|  |         document.content = parser.get_text() | ||||||
|  |     document.save() | ||||||
|  |  | ||||||
|  |     parser.cleanup() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Command(Renderable, BaseCommand): | ||||||
|  |  | ||||||
|  |     help = """ | ||||||
|  |         Using the current classification model, assigns correspondents, tags | ||||||
|  |         and document types to all documents, effectively allowing you to | ||||||
|  |         back-tag all previously indexed documents with metadata created (or | ||||||
|  |         modified) after their initial import. | ||||||
|  |     """.replace("    ", "") | ||||||
|  |  | ||||||
|  |     def __init__(self, *args, **kwargs): | ||||||
|  |         self.verbosity = 0 | ||||||
|  |         BaseCommand.__init__(self, *args, **kwargs) | ||||||
|  |  | ||||||
|  |     def add_arguments(self, parser): | ||||||
|  |         parser.add_argument( | ||||||
|  |             "-f", "--overwrite", | ||||||
|  |             default=False, | ||||||
|  |             action="store_true", | ||||||
|  |             help="Recreates the archived document for documents that already " | ||||||
|  |                  "have an archived version." | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def handle(self, *args, **options): | ||||||
|  |  | ||||||
|  |         os.makedirs(settings.SCRATCH_DIR, exist_ok=True) | ||||||
|  |  | ||||||
|  |         overwrite = options["overwrite"] | ||||||
|  |  | ||||||
|  |         documents = Document.objects.all() | ||||||
|  |  | ||||||
|  |         documents_to_process = filter( | ||||||
|  |             lambda d: overwrite or not os.path.exists(d.archive_path), | ||||||
|  |             documents | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |         with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: | ||||||
|  |             list( | ||||||
|  |                 pool.imap( | ||||||
|  |                     handle_document, | ||||||
|  |                     list(documents_to_process) | ||||||
|  |                 ) | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |         ix = index.open_index() | ||||||
|  |         with AsyncWriter(ix) as writer: | ||||||
|  |             for d in documents_to_process: | ||||||
|  |                 index.update_document(writer, d) | ||||||
| @@ -7,7 +7,8 @@ from django.core import serializers | |||||||
| from django.core.management.base import BaseCommand, CommandError | from django.core.management.base import BaseCommand, CommandError | ||||||
|  |  | ||||||
| from documents.models import Document, Correspondent, Tag, DocumentType | from documents.models import Document, Correspondent, Tag, DocumentType | ||||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||||
|  |     EXPORTER_ARCHIVE_NAME | ||||||
| from paperless.db import GnuPG | from paperless.db import GnuPG | ||||||
| from ...mixins import Renderable | from ...mixins import Renderable | ||||||
|  |  | ||||||
| @@ -54,7 +55,6 @@ class Command(Renderable, BaseCommand): | |||||||
|             document = document_map[document_dict["pk"]] |             document = document_map[document_dict["pk"]] | ||||||
|  |  | ||||||
|             unique_filename = f"{document.pk:07}_{document.file_name}" |             unique_filename = f"{document.pk:07}_{document.file_name}" | ||||||
|  |  | ||||||
|             file_target = os.path.join(self.target, unique_filename) |             file_target = os.path.join(self.target, unique_filename) | ||||||
|  |  | ||||||
|             thumbnail_name = unique_filename + "-thumbnail.png" |             thumbnail_name = unique_filename + "-thumbnail.png" | ||||||
| @@ -63,6 +63,14 @@ class Command(Renderable, BaseCommand): | |||||||
|             document_dict[EXPORTER_FILE_NAME] = unique_filename |             document_dict[EXPORTER_FILE_NAME] = unique_filename | ||||||
|             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name |             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name | ||||||
|  |  | ||||||
|  |             if os.path.exists(document.archive_path): | ||||||
|  |                 archive_name = \ | ||||||
|  |                     f"{document.pk:07}_archive_{document.archive_file_name}" | ||||||
|  |                 archive_target = os.path.join(self.target, archive_name) | ||||||
|  |                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name | ||||||
|  |             else: | ||||||
|  |                 archive_target = None | ||||||
|  |  | ||||||
|             print(f"Exporting: {file_target}") |             print(f"Exporting: {file_target}") | ||||||
|  |  | ||||||
|             t = int(time.mktime(document.created.timetuple())) |             t = int(time.mktime(document.created.timetuple())) | ||||||
| @@ -76,11 +84,18 @@ class Command(Renderable, BaseCommand): | |||||||
|                     f.write(GnuPG.decrypted(document.thumbnail_file)) |                     f.write(GnuPG.decrypted(document.thumbnail_file)) | ||||||
|                     os.utime(thumbnail_target, times=(t, t)) |                     os.utime(thumbnail_target, times=(t, t)) | ||||||
|  |  | ||||||
|  |                 if archive_target: | ||||||
|  |                     with open(archive_target, "wb") as f: | ||||||
|  |                         f.write(GnuPG.decrypted(document.archive_path)) | ||||||
|  |                         os.utime(archive_target, times=(t, t)) | ||||||
|             else: |             else: | ||||||
|  |  | ||||||
|                 shutil.copy(document.source_path, file_target) |                 shutil.copy(document.source_path, file_target) | ||||||
|                 shutil.copy(document.thumbnail_path, thumbnail_target) |                 shutil.copy(document.thumbnail_path, thumbnail_target) | ||||||
|  |  | ||||||
|  |                 if archive_target: | ||||||
|  |                     shutil.copy(document.archive_path, archive_target) | ||||||
|  |  | ||||||
|         manifest += json.loads( |         manifest += json.loads( | ||||||
|             serializers.serialize("json", Correspondent.objects.all())) |             serializers.serialize("json", Correspondent.objects.all())) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -7,8 +7,8 @@ from django.core.management import call_command | |||||||
| from django.core.management.base import BaseCommand, CommandError | from django.core.management.base import BaseCommand, CommandError | ||||||
|  |  | ||||||
| from documents.models import Document | from documents.models import Document | ||||||
| from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME | from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ | ||||||
| from paperless.db import GnuPG |     EXPORTER_ARCHIVE_NAME | ||||||
| from ...file_handling import generate_filename, create_source_path_directory | from ...file_handling import generate_filename, create_source_path_directory | ||||||
| from ...mixins import Renderable | from ...mixins import Renderable | ||||||
|  |  | ||||||
| @@ -79,23 +79,41 @@ class Command(Renderable, BaseCommand): | |||||||
|                     'appear to be in the source directory.'.format(doc_file) |                     'appear to be in the source directory.'.format(doc_file) | ||||||
|                 ) |                 ) | ||||||
|  |  | ||||||
|  |             if EXPORTER_ARCHIVE_NAME in record: | ||||||
|  |                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||||
|  |                 if not os.path.exists(os.path.join(self.source, archive_file)): | ||||||
|  |                     raise CommandError( | ||||||
|  |                         f"The manifest file refers to {archive_file} which " | ||||||
|  |                         f"does not appear to be in the source directory." | ||||||
|  |                     ) | ||||||
|  |  | ||||||
|     def _import_files_from_manifest(self): |     def _import_files_from_manifest(self): | ||||||
|  |  | ||||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED |         os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) | ||||||
|  |         os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) | ||||||
|  |         os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) | ||||||
|  |  | ||||||
|         for record in self.manifest: |         for record in self.manifest: | ||||||
|  |  | ||||||
|             if not record["model"] == "documents.document": |             if not record["model"] == "documents.document": | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             doc_file = record[EXPORTER_FILE_NAME] |  | ||||||
|             thumb_file = record[EXPORTER_THUMBNAIL_NAME] |  | ||||||
|             document = Document.objects.get(pk=record["pk"]) |             document = Document.objects.get(pk=record["pk"]) | ||||||
|  |  | ||||||
|  |             doc_file = record[EXPORTER_FILE_NAME] | ||||||
|             document_path = os.path.join(self.source, doc_file) |             document_path = os.path.join(self.source, doc_file) | ||||||
|  |  | ||||||
|  |             thumb_file = record[EXPORTER_THUMBNAIL_NAME] | ||||||
|             thumbnail_path = os.path.join(self.source, thumb_file) |             thumbnail_path = os.path.join(self.source, thumb_file) | ||||||
|  |  | ||||||
|             document.storage_type = storage_type |             if EXPORTER_ARCHIVE_NAME in record: | ||||||
|  |                 archive_file = record[EXPORTER_ARCHIVE_NAME] | ||||||
|  |                 archive_path = os.path.join(self.source, archive_file) | ||||||
|  |             else: | ||||||
|  |                 archive_path = None | ||||||
|  |  | ||||||
|  |             document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||||
|  |  | ||||||
|             document.filename = generate_filename(document) |             document.filename = generate_filename(document) | ||||||
|  |  | ||||||
|             if os.path.isfile(document.source_path): |             if os.path.isfile(document.source_path): | ||||||
| @@ -106,5 +124,7 @@ class Command(Renderable, BaseCommand): | |||||||
|             print(f"Moving {document_path} to {document.source_path}") |             print(f"Moving {document_path} to {document.source_path}") | ||||||
|             shutil.copy(document_path, document.source_path) |             shutil.copy(document_path, document.source_path) | ||||||
|             shutil.copy(thumbnail_path, document.thumbnail_path) |             shutil.copy(thumbnail_path, document.thumbnail_path) | ||||||
|  |             if archive_path: | ||||||
|  |                 shutil.copy(archive_path, document.archive_path) | ||||||
|  |  | ||||||
|             document.save() |             document.save() | ||||||
|   | |||||||
							
								
								
									
										23
									
								
								src/documents/migrations/1005_checksums.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,23 @@ | |||||||
|  | # Generated by Django 3.1.3 on 2020-11-29 00:48 | ||||||
|  |  | ||||||
|  | from django.db import migrations, models | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class Migration(migrations.Migration): | ||||||
|  |  | ||||||
|  |     dependencies = [ | ||||||
|  |         ('documents', '1004_sanity_check_schedule'), | ||||||
|  |     ] | ||||||
|  |  | ||||||
|  |     operations = [ | ||||||
|  |         migrations.AddField( | ||||||
|  |             model_name='document', | ||||||
|  |             name='archive_checksum', | ||||||
|  |             field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True), | ||||||
|  |         ), | ||||||
|  |         migrations.AlterField( | ||||||
|  |             model_name='document', | ||||||
|  |             name='checksum', | ||||||
|  |             field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True), | ||||||
|  |         ), | ||||||
|  |     ] | ||||||
| @@ -11,6 +11,7 @@ from django.db import models | |||||||
| from django.utils import timezone | from django.utils import timezone | ||||||
| from django.utils.text import slugify | from django.utils.text import slugify | ||||||
|  |  | ||||||
|  | from documents.file_handling import archive_name_from_filename | ||||||
| from documents.parsers import get_default_file_extension | from documents.parsers import get_default_file_extension | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -158,9 +159,15 @@ class Document(models.Model): | |||||||
|         max_length=32, |         max_length=32, | ||||||
|         editable=False, |         editable=False, | ||||||
|         unique=True, |         unique=True, | ||||||
|         help_text="The checksum of the original document (before it was " |         help_text="The checksum of the original document." | ||||||
|                   "encrypted).  We use this to prevent duplicate document " |     ) | ||||||
|                   "imports." |  | ||||||
|  |     archive_checksum = models.CharField( | ||||||
|  |         max_length=32, | ||||||
|  |         editable=False, | ||||||
|  |         blank=True, | ||||||
|  |         null=True, | ||||||
|  |         help_text="The checksum of the archived document." | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|     created = models.DateTimeField( |     created = models.DateTimeField( | ||||||
| @@ -225,10 +232,30 @@ class Document(models.Model): | |||||||
|     def source_file(self): |     def source_file(self): | ||||||
|         return open(self.source_path, "rb") |         return open(self.source_path, "rb") | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def archive_path(self): | ||||||
|  |         if self.filename: | ||||||
|  |             fname = archive_name_from_filename(self.filename) | ||||||
|  |         else: | ||||||
|  |             fname = "{:07}.pdf".format(self.pk) | ||||||
|  |  | ||||||
|  |         return os.path.join( | ||||||
|  |             settings.ARCHIVE_DIR, | ||||||
|  |             fname | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def archive_file(self): | ||||||
|  |         return open(self.archive_path, "rb") | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def file_name(self): |     def file_name(self): | ||||||
|         return slugify(str(self)) + self.file_type |         return slugify(str(self)) + self.file_type | ||||||
|  |  | ||||||
|  |     @property | ||||||
|  |     def archive_file_name(self): | ||||||
|  |         return slugify(str(self)) + ".pdf" | ||||||
|  |  | ||||||
|     @property |     @property | ||||||
|     def file_type(self): |     def file_type(self): | ||||||
|         return get_default_file_extension(self.mime_type) |         return get_default_file_extension(self.mime_type) | ||||||
|   | |||||||
| @@ -131,73 +131,7 @@ def run_convert(input_file, | |||||||
|         raise ParseError("Convert failed at {}".format(args)) |         raise ParseError("Convert failed at {}".format(args)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def run_unpaper(pnm, logging_group=None): | def parse_date(filename, text): | ||||||
|     pnm_out = pnm.replace(".pnm", ".unpaper.pnm") |  | ||||||
|  |  | ||||||
|     command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, |  | ||||||
|                     pnm_out) |  | ||||||
|  |  | ||||||
|     logger.debug(f"Execute: {' '.join(command_args)}", |  | ||||||
|                  extra={'group': logging_group}) |  | ||||||
|  |  | ||||||
|     if not subprocess.Popen(command_args, |  | ||||||
|                             stdout=subprocess.DEVNULL, |  | ||||||
|                             stderr=subprocess.DEVNULL).wait() == 0: |  | ||||||
|         raise ParseError(f"Unpaper failed at {command_args}") |  | ||||||
|  |  | ||||||
|     return pnm_out |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class ParseError(Exception): |  | ||||||
|     pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class DocumentParser(LoggingMixin): |  | ||||||
|     """ |  | ||||||
|     Subclass this to make your own parser.  Have a look at |  | ||||||
|     `paperless_tesseract.parsers` for inspiration. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |  | ||||||
|         super().__init__() |  | ||||||
|         self.logging_group = logging_group |  | ||||||
|         self.document_path = path |  | ||||||
|         self.tempdir = tempfile.mkdtemp( |  | ||||||
|             prefix="paperless-", dir=settings.SCRATCH_DIR) |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |  | ||||||
|         Returns the path to a file we can use as a thumbnail for this document. |  | ||||||
|         """ |  | ||||||
|         raise NotImplementedError() |  | ||||||
|  |  | ||||||
|     def optimise_thumbnail(self, in_path): |  | ||||||
|  |  | ||||||
|         if settings.OPTIMIZE_THUMBNAILS: |  | ||||||
|             out_path = os.path.join(self.tempdir, "optipng.png") |  | ||||||
|  |  | ||||||
|             args = (settings.OPTIPNG_BINARY, |  | ||||||
|                     "-silent", "-o5", in_path, "-out", out_path) |  | ||||||
|  |  | ||||||
|             self.log('debug', f"Execute: {' '.join(args)}") |  | ||||||
|  |  | ||||||
|             if not subprocess.Popen(args).wait() == 0: |  | ||||||
|                 raise ParseError("Optipng failed at {}".format(args)) |  | ||||||
|  |  | ||||||
|             return out_path |  | ||||||
|         else: |  | ||||||
|             return in_path |  | ||||||
|  |  | ||||||
|     def get_optimised_thumbnail(self): |  | ||||||
|         return self.optimise_thumbnail(self.get_thumbnail()) |  | ||||||
|  |  | ||||||
|     def get_text(self): |  | ||||||
|         """ |  | ||||||
|         Returns the text from the document and only the text. |  | ||||||
|         """ |  | ||||||
|         raise NotImplementedError() |  | ||||||
|  |  | ||||||
|     def get_date(self): |  | ||||||
|     """ |     """ | ||||||
|     Returns the date of the document. |     Returns the date of the document. | ||||||
|     """ |     """ | ||||||
| @@ -217,15 +151,12 @@ class DocumentParser(LoggingMixin): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     date = None |     date = None | ||||||
|         date_string = None |  | ||||||
|  |  | ||||||
|     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit |     next_year = timezone.now().year + 5  # Arbitrary 5 year future limit | ||||||
|         title = os.path.basename(self.document_path) |  | ||||||
|  |  | ||||||
|     # if filename date parsing is enabled, search there first: |     # if filename date parsing is enabled, search there first: | ||||||
|     if settings.FILENAME_DATE_ORDER: |     if settings.FILENAME_DATE_ORDER: | ||||||
|             self.log("info", "Checking document title for date") |         for m in re.finditer(DATE_REGEX, filename): | ||||||
|             for m in re.finditer(DATE_REGEX, title): |  | ||||||
|             date_string = m.group(0) |             date_string = m.group(0) | ||||||
|  |  | ||||||
|             try: |             try: | ||||||
| @@ -235,21 +166,8 @@ class DocumentParser(LoggingMixin): | |||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|             if date is not None and next_year > date.year > 1900: |             if date is not None and next_year > date.year > 1900: | ||||||
|                     self.log( |  | ||||||
|                         "info", |  | ||||||
|                         "Detected document date {} based on string {} " |  | ||||||
|                         "from document title" |  | ||||||
|                         "".format(date.isoformat(), date_string) |  | ||||||
|                     ) |  | ||||||
|                 return date |                 return date | ||||||
|  |  | ||||||
|         try: |  | ||||||
|             # getting text after checking filename will save time if only |  | ||||||
|             # looking at the filename instead of the whole text |  | ||||||
|             text = self.get_text() |  | ||||||
|         except ParseError: |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|     # Iterate through all regex matches in text and try to parse the date |     # Iterate through all regex matches in text and try to parse the date | ||||||
|     for m in re.finditer(DATE_REGEX, text): |     for m in re.finditer(DATE_REGEX, text): | ||||||
|         date_string = m.group(0) |         date_string = m.group(0) | ||||||
| @@ -265,19 +183,64 @@ class DocumentParser(LoggingMixin): | |||||||
|         else: |         else: | ||||||
|             date = None |             date = None | ||||||
|  |  | ||||||
|         if date is not None: |  | ||||||
|             self.log( |  | ||||||
|                 "info", |  | ||||||
|                 "Detected document date {} based on string {}".format( |  | ||||||
|                     date.isoformat(), |  | ||||||
|                     date_string |  | ||||||
|                 ) |  | ||||||
|             ) |  | ||||||
|         else: |  | ||||||
|             self.log("info", "Unable to detect date for document") |  | ||||||
|  |  | ||||||
|     return date |     return date | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ParseError(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class DocumentParser(LoggingMixin): | ||||||
|  |     """ | ||||||
|  |     Subclass this to make your own parser.  Have a look at | ||||||
|  |     `paperless_tesseract.parsers` for inspiration. | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def __init__(self, logging_group): | ||||||
|  |         super().__init__() | ||||||
|  |         self.logging_group = logging_group | ||||||
|  |         self.tempdir = tempfile.mkdtemp( | ||||||
|  |             prefix="paperless-", dir=settings.SCRATCH_DIR) | ||||||
|  |  | ||||||
|  |         self.archive_path = None | ||||||
|  |         self.text = None | ||||||
|  |         self.date = None | ||||||
|  |  | ||||||
|  |     def parse(self, document_path, mime_type): | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def get_archive_path(self): | ||||||
|  |         return self.archive_path | ||||||
|  |  | ||||||
|  |     def get_thumbnail(self, document_path, mime_type): | ||||||
|  |         """ | ||||||
|  |         Returns the path to a file we can use as a thumbnail for this document. | ||||||
|  |         """ | ||||||
|  |         raise NotImplementedError() | ||||||
|  |  | ||||||
|  |     def get_optimised_thumbnail(self, document_path, mime_type): | ||||||
|  |         thumbnail = self.get_thumbnail(document_path, mime_type) | ||||||
|  |         if settings.OPTIMIZE_THUMBNAILS: | ||||||
|  |             out_path = os.path.join(self.tempdir, "thumb_optipng.png") | ||||||
|  |  | ||||||
|  |             args = (settings.OPTIPNG_BINARY, | ||||||
|  |                     "-silent", "-o5", thumbnail, "-out", out_path) | ||||||
|  |  | ||||||
|  |             self.log('debug', f"Execute: {' '.join(args)}") | ||||||
|  |  | ||||||
|  |             if not subprocess.Popen(args).wait() == 0: | ||||||
|  |                 raise ParseError("Optipng failed at {}".format(args)) | ||||||
|  |  | ||||||
|  |             return out_path | ||||||
|  |         else: | ||||||
|  |             return thumbnail | ||||||
|  |  | ||||||
|  |     def get_text(self): | ||||||
|  |         return self.text | ||||||
|  |  | ||||||
|  |     def get_date(self): | ||||||
|  |         return self.date | ||||||
|  |  | ||||||
|     def cleanup(self): |     def cleanup(self): | ||||||
|         self.log("debug", "Deleting directory {}".format(self.tempdir)) |         self.log("debug", "Deleting directory {}".format(self.tempdir)) | ||||||
|         shutil.rmtree(self.tempdir) |         shutil.rmtree(self.tempdir) | ||||||
|   | |||||||
| @@ -67,20 +67,35 @@ def check_sanity(): | |||||||
|                 f"Original of document {doc.pk} does not exist.")) |                 f"Original of document {doc.pk} does not exist.")) | ||||||
|         else: |         else: | ||||||
|             present_files.remove(os.path.normpath(doc.source_path)) |             present_files.remove(os.path.normpath(doc.source_path)) | ||||||
|             checksum = None |  | ||||||
|             try: |             try: | ||||||
|                 with doc.source_file as f: |                 with doc.source_file as f: | ||||||
|                     checksum = hashlib.md5(f.read()).hexdigest() |                     checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|             except OSError as e: |             except OSError as e: | ||||||
|                 messages.append(SanityError( |                 messages.append(SanityError( | ||||||
|                     f"Cannot read original file of document {doc.pk}: {e}")) |                     f"Cannot read original file of document {doc.pk}: {e}")) | ||||||
|  |             else: | ||||||
|             if checksum and not checksum == doc.checksum: |                 if not checksum == doc.checksum: | ||||||
|                     messages.append(SanityError( |                     messages.append(SanityError( | ||||||
|                         f"Checksum mismatch of document {doc.pk}. " |                         f"Checksum mismatch of document {doc.pk}. " | ||||||
|                         f"Stored: {doc.checksum}, actual: {checksum}." |                         f"Stored: {doc.checksum}, actual: {checksum}." | ||||||
|                     )) |                     )) | ||||||
|  |  | ||||||
|  |         if os.path.isfile(doc.archive_path): | ||||||
|  |             present_files.remove(os.path.normpath(doc.archive_path)) | ||||||
|  |             try: | ||||||
|  |                 with doc.archive_file as f: | ||||||
|  |                     checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|  |             except OSError as e: | ||||||
|  |                 messages.append(SanityError( | ||||||
|  |                     f"Cannot read archive file of document {doc.pk}: {e}" | ||||||
|  |                 )) | ||||||
|  |             else: | ||||||
|  |                 if not checksum == doc.archive_checksum: | ||||||
|  |                     messages.append(SanityError( | ||||||
|  |                         f"Checksum mismatch of archive {doc.pk}. " | ||||||
|  |                         f"Stored: {doc.checksum}, actual: {checksum}." | ||||||
|  |                     )) | ||||||
|  |  | ||||||
|         if not doc.content: |         if not doc.content: | ||||||
|             messages.append(SanityWarning( |             messages.append(SanityWarning( | ||||||
|                 f"Document {doc.pk} has no content." |                 f"Document {doc.pk} has no content." | ||||||
|   | |||||||
| @@ -2,3 +2,4 @@ | |||||||
| # for exporting/importing commands | # for exporting/importing commands | ||||||
| EXPORTER_FILE_NAME = "__exported_file_name__" | EXPORTER_FILE_NAME = "__exported_file_name__" | ||||||
| EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__" | EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__" | ||||||
|  | EXPORTER_ARCHIVE_NAME = "__exported_archive_name__" | ||||||
|   | |||||||
| @@ -13,7 +13,7 @@ from rest_framework.reverse import reverse | |||||||
|  |  | ||||||
| from .. import index, matching | from .. import index, matching | ||||||
| from ..file_handling import delete_empty_directories, generate_filename, \ | from ..file_handling import delete_empty_directories, generate_filename, \ | ||||||
|     create_source_path_directory |     create_source_path_directory, archive_name_from_filename | ||||||
| from ..models import Document, Tag | from ..models import Document, Tag | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -169,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs): | |||||||
|  |  | ||||||
| @receiver(models.signals.post_delete, sender=Document) | @receiver(models.signals.post_delete, sender=Document) | ||||||
| def cleanup_document_deletion(sender, instance, using, **kwargs): | def cleanup_document_deletion(sender, instance, using, **kwargs): | ||||||
|     for f in (instance.source_path, instance.thumbnail_path): |     for f in (instance.source_path, | ||||||
|  |               instance.archive_path, | ||||||
|  |               instance.thumbnail_path): | ||||||
|  |         if os.path.isfile(f): | ||||||
|             try: |             try: | ||||||
|                 os.unlink(f) |                 os.unlink(f) | ||||||
|         except FileNotFoundError: |                 logging.getLogger(__name__).debug( | ||||||
|             pass  # The file's already gone, so we're cool with it. |                     f"Deleted file {f}.") | ||||||
|  |             except OSError as e: | ||||||
|  |                 logging.getLogger(__name__).warning( | ||||||
|  |                     f"While deleting document {instance.file_name}, the file " | ||||||
|  |                     f"{f} could not be deleted: {e}" | ||||||
|  |                 ) | ||||||
|  |  | ||||||
|     delete_empty_directories(os.path.dirname(instance.source_path)) |     delete_empty_directories( | ||||||
|  |         os.path.dirname(instance.source_path), | ||||||
|  |         root=settings.ORIGINALS_DIR | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     delete_empty_directories( | ||||||
|  |         os.path.dirname(instance.archive_path), | ||||||
|  |         root=settings.ARCHIVE_DIR | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def validate_move(instance, old_path, new_path): | ||||||
|  |     if not os.path.isfile(old_path): | ||||||
|  |         # Can't do anything if the old file does not exist anymore. | ||||||
|  |         logging.getLogger(__name__).fatal( | ||||||
|  |             f"Document {str(instance)}: File {old_path} has gone.") | ||||||
|  |         return False | ||||||
|  |  | ||||||
|  |     if os.path.isfile(new_path): | ||||||
|  |         # Can't do anything if the new file already exists. Skip updating file. | ||||||
|  |         logging.getLogger(__name__).warning( | ||||||
|  |             f"Document {str(instance)}: Cannot rename file " | ||||||
|  |             f"since target path {new_path} already exists.") | ||||||
|  |         return False | ||||||
|  |  | ||||||
|  |     return True | ||||||
|  |  | ||||||
|  |  | ||||||
| @receiver(models.signals.m2m_changed, sender=Document.tags.through) | @receiver(models.signals.m2m_changed, sender=Document.tags.through) | ||||||
| @@ -183,55 +216,90 @@ def cleanup_document_deletion(sender, instance, using, **kwargs): | |||||||
| def update_filename_and_move_files(sender, instance, **kwargs): | def update_filename_and_move_files(sender, instance, **kwargs): | ||||||
|  |  | ||||||
|     if not instance.filename: |     if not instance.filename: | ||||||
|         # Can't update the filename if there is not filename to begin with |         # Can't update the filename if there is no filename to begin with | ||||||
|         # This happens after the consumer creates a new document. |         # This happens when the consumer creates a new document. | ||||||
|         # The PK needs to be set first by saving the document once. When this |         # The document is modified and saved multiple times, and only after | ||||||
|         # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be |         # everything is done (i.e., the generated filename is final), | ||||||
|         # renamed anyway. In all other cases, instance.filename will be set. |         # filename will be set to the location where the consumer has put | ||||||
|  |         # the file. | ||||||
|  |         # | ||||||
|  |         # This will in turn cause this logic to move the file where it belongs. | ||||||
|         return |         return | ||||||
|  |  | ||||||
|     old_filename = instance.filename |     old_filename = instance.filename | ||||||
|     old_path = instance.source_path |  | ||||||
|     new_filename = generate_filename(instance) |     new_filename = generate_filename(instance) | ||||||
|  |  | ||||||
|     if new_filename == instance.filename: |     if new_filename == instance.filename: | ||||||
|         # Don't do anything if its the same. |         # Don't do anything if its the same. | ||||||
|         return |         return | ||||||
|  |  | ||||||
|     new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) |     old_source_path = instance.source_path | ||||||
|  |     new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) | ||||||
|  |  | ||||||
|     if not os.path.isfile(old_path): |     if not validate_move(instance, old_source_path, new_source_path): | ||||||
|         # Can't do anything if the old file does not exist anymore. |  | ||||||
|         logging.getLogger(__name__).fatal( |  | ||||||
|             f"Document {str(instance)}: File {old_path} has gone.") |  | ||||||
|         return |         return | ||||||
|  |  | ||||||
|     if os.path.isfile(new_path): |     # archive files are optional, archive checksum tells us if we have one, | ||||||
|         # Can't do anything if the new file already exists. Skip updating file. |     # since this is None for documents without archived files. | ||||||
|         logging.getLogger(__name__).warning( |     if instance.archive_checksum: | ||||||
|             f"Document {str(instance)}: Cannot rename file " |         new_archive_filename = archive_name_from_filename(new_filename) | ||||||
|             f"since target path {new_path} already exists.") |         old_archive_path = instance.archive_path | ||||||
|  |         new_archive_path = os.path.join(settings.ARCHIVE_DIR, | ||||||
|  |                                         new_archive_filename) | ||||||
|  |  | ||||||
|  |         if not validate_move(instance, old_archive_path, new_archive_path): | ||||||
|             return |             return | ||||||
|  |  | ||||||
|     create_source_path_directory(new_path) |         create_source_path_directory(new_archive_path) | ||||||
|  |     else: | ||||||
|  |         old_archive_path = None | ||||||
|  |         new_archive_path = None | ||||||
|  |  | ||||||
|  |     create_source_path_directory(new_source_path) | ||||||
|  |  | ||||||
|     try: |     try: | ||||||
|         os.rename(old_path, new_path) |         os.rename(old_source_path, new_source_path) | ||||||
|  |         if instance.archive_checksum: | ||||||
|  |             os.rename(old_archive_path, new_archive_path) | ||||||
|         instance.filename = new_filename |         instance.filename = new_filename | ||||||
|         # Don't save here to prevent infinite recursion. |         # Don't save here to prevent infinite recursion. | ||||||
|         Document.objects.filter(pk=instance.pk).update(filename=new_filename) |         Document.objects.filter(pk=instance.pk).update(filename=new_filename) | ||||||
|  |  | ||||||
|         logging.getLogger(__name__).debug( |         logging.getLogger(__name__).debug( | ||||||
|             f"Moved file {old_path} to {new_path}.") |             f"Moved file {old_source_path} to {new_source_path}.") | ||||||
|  |  | ||||||
|  |         logging.getLogger(__name__).debug( | ||||||
|  |             f"Moved file {old_archive_path} to {new_archive_path}.") | ||||||
|  |  | ||||||
|     except OSError as e: |     except OSError as e: | ||||||
|         instance.filename = old_filename |         instance.filename = old_filename | ||||||
|  |         # this happens when we can't move a file. If that's the case for the | ||||||
|  |         # archive file, we try our best to revert the changes. | ||||||
|  |         try: | ||||||
|  |             os.rename(new_source_path, old_source_path) | ||||||
|  |             os.rename(new_archive_path, old_archive_path) | ||||||
|  |         except: | ||||||
|  |             # This is fine, since: | ||||||
|  |             # A: if we managed to move source from A to B, we will also manage | ||||||
|  |             #  to move it from B to A. If not, we have a serious issue | ||||||
|  |             #  that's going to get caught by the sanity checker. | ||||||
|  |             #  all files remain in place and will never be overwritten, | ||||||
|  |             #  so this is not the end of the world. | ||||||
|  |             # B: if moving the original file failed, nothing has changed anyway. | ||||||
|  |             pass | ||||||
|     except DatabaseError as e: |     except DatabaseError as e: | ||||||
|         os.rename(new_path, old_path) |         os.rename(new_source_path, old_source_path) | ||||||
|  |         if instance.archive_checksum: | ||||||
|  |             os.rename(new_archive_path, old_archive_path) | ||||||
|         instance.filename = old_filename |         instance.filename = old_filename | ||||||
|  |  | ||||||
|     if not os.path.isfile(old_path): |     if not os.path.isfile(old_source_path): | ||||||
|         delete_empty_directories(os.path.dirname(old_path)) |         delete_empty_directories(os.path.dirname(old_source_path), | ||||||
|  |                                  root=settings.ORIGINALS_DIR) | ||||||
|  |  | ||||||
|  |     if old_archive_path and not os.path.isfile(old_archive_path): | ||||||
|  |         delete_empty_directories(os.path.dirname(old_archive_path), | ||||||
|  |                                  root=settings.ARCHIVE_DIR) | ||||||
|  |  | ||||||
|  |  | ||||||
| def set_log_entry(sender, document=None, logging_group=None, **kwargs): | def set_log_entry(sender, document=None, logging_group=None, **kwargs): | ||||||
|   | |||||||
| @@ -12,7 +12,9 @@ from documents.sanity_checker import SanityFailedError | |||||||
|  |  | ||||||
|  |  | ||||||
| def index_optimize(): | def index_optimize(): | ||||||
|     index.open_index().optimize() |     ix = index.open_index() | ||||||
|  |     writer = AsyncWriter(ix) | ||||||
|  |     writer.commit(optimize=True) | ||||||
|  |  | ||||||
|  |  | ||||||
| def index_reindex(): | def index_reindex(): | ||||||
|   | |||||||
| Before Width: | Height: | Size: 32 KiB After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/archive/0000001.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -100,6 +100,44 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): | |||||||
|         self.assertEqual(response.status_code, 200) |         self.assertEqual(response.status_code, 200) | ||||||
|         self.assertEqual(response.content, content_thumbnail) |         self.assertEqual(response.content, content_thumbnail) | ||||||
|  |  | ||||||
|  |     def test_download_with_archive(self): | ||||||
|  |  | ||||||
|  |         _, filename = tempfile.mkstemp(dir=self.dirs.originals_dir) | ||||||
|  |  | ||||||
|  |         content = b"This is a test" | ||||||
|  |         content_archive = b"This is the same test but archived" | ||||||
|  |  | ||||||
|  |         with open(filename, "wb") as f: | ||||||
|  |             f.write(content) | ||||||
|  |  | ||||||
|  |         filename = os.path.basename(filename) | ||||||
|  |  | ||||||
|  |         doc = Document.objects.create(title="none", filename=filename, | ||||||
|  |                                       mime_type="application/pdf") | ||||||
|  |  | ||||||
|  |         with open(doc.archive_path, "wb") as f: | ||||||
|  |             f.write(content_archive) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content_archive) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/download/?original=true'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/preview/'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content_archive) | ||||||
|  |  | ||||||
|  |         response = self.client.get('/api/documents/{}/preview/?original=true'.format(doc.pk)) | ||||||
|  |  | ||||||
|  |         self.assertEqual(response.status_code, 200) | ||||||
|  |         self.assertEqual(response.content, content) | ||||||
|  |  | ||||||
|     def test_document_actions_not_existing_file(self): |     def test_document_actions_not_existing_file(self): | ||||||
|  |  | ||||||
|         doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") |         doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf") | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| import os | import os | ||||||
| import re | import re | ||||||
|  | import shutil | ||||||
| import tempfile | import tempfile | ||||||
| from unittest import mock | from unittest import mock | ||||||
| from unittest.mock import MagicMock | from unittest.mock import MagicMock | ||||||
| @@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase): | |||||||
|  |  | ||||||
| class DummyParser(DocumentParser): | class DummyParser(DocumentParser): | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         # not important during tests |         # not important during tests | ||||||
|         raise NotImplementedError() |         raise NotImplementedError() | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group, scratch_dir): |     def __init__(self, logging_group, scratch_dir, archive_path): | ||||||
|         super(DummyParser, self).__init__(path, logging_group) |         super(DummyParser, self).__init__(logging_group) | ||||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) |         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||||
|  |         self.archive_path = archive_path | ||||||
|  |  | ||||||
|     def get_optimised_thumbnail(self): |     def get_optimised_thumbnail(self, document_path, mime_type): | ||||||
|         return self.fake_thumb |         return self.fake_thumb | ||||||
|  |  | ||||||
|     def get_text(self): |     def parse(self, document_path, mime_type): | ||||||
|         return "The Text" |         self.text = "The Text" | ||||||
|  |  | ||||||
|  |  | ||||||
| class FaultyParser(DocumentParser): | class FaultyParser(DocumentParser): | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         # not important during tests |         # not important during tests | ||||||
|         raise NotImplementedError() |         raise NotImplementedError() | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group, scratch_dir): |     def __init__(self, logging_group, scratch_dir): | ||||||
|         super(FaultyParser, self).__init__(path, logging_group) |         super(FaultyParser, self).__init__(logging_group) | ||||||
|         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) |         _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir) | ||||||
|  |  | ||||||
|     def get_optimised_thumbnail(self): |     def get_optimised_thumbnail(self, document_path, mime_type): | ||||||
|         return self.fake_thumb |         return self.fake_thumb | ||||||
|  |  | ||||||
|     def get_text(self): |     def parse(self, document_path, mime_type): | ||||||
|         raise ParseError("Does not compute.") |         raise ParseError("Does not compute.") | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -410,11 +412,11 @@ def fake_magic_from_file(file, mime=False): | |||||||
| @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | ||||||
| class TestConsumer(DirectoriesMixin, TestCase): | class TestConsumer(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|     def make_dummy_parser(self, path, logging_group): |     def make_dummy_parser(self, logging_group): | ||||||
|         return DummyParser(path, logging_group, self.dirs.scratch_dir) |         return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file()) | ||||||
|  |  | ||||||
|     def make_faulty_parser(self, path, logging_group): |     def make_faulty_parser(self, logging_group): | ||||||
|         return FaultyParser(path, logging_group, self.dirs.scratch_dir) |         return FaultyParser(logging_group, self.dirs.scratch_dir) | ||||||
|  |  | ||||||
|     def setUp(self): |     def setUp(self): | ||||||
|         super(TestConsumer, self).setUp() |         super(TestConsumer, self).setUp() | ||||||
| @@ -432,8 +434,16 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|         self.consumer = Consumer() |         self.consumer = Consumer() | ||||||
|  |  | ||||||
|     def get_test_file(self): |     def get_test_file(self): | ||||||
|         fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir) |         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf") | ||||||
|         return f |         dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |         return dst | ||||||
|  |  | ||||||
|  |     def get_test_archive_file(self): | ||||||
|  |         src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf") | ||||||
|  |         dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |         return dst | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT=None) |     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||||
|     def testNormalOperation(self): |     def testNormalOperation(self): | ||||||
| @@ -455,6 +465,13 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|             document.thumbnail_path |             document.thumbnail_path | ||||||
|         )) |         )) | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile( | ||||||
|  |             document.archive_path | ||||||
|  |         )) | ||||||
|  |  | ||||||
|  |         self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1") | ||||||
|  |         self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b") | ||||||
|  |  | ||||||
|         self.assertFalse(os.path.isfile(filename)) |         self.assertFalse(os.path.isfile(filename)) | ||||||
|  |  | ||||||
|     def testOverrideFilename(self): |     def testOverrideFilename(self): | ||||||
| @@ -502,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         self.fail("Should throw exception") |         self.fail("Should throw exception") | ||||||
|  |  | ||||||
|     def testDuplicates(self): |     def testDuplicates1(self): | ||||||
|         self.consumer.try_consume_file(self.get_test_file()) |         self.consumer.try_consume_file(self.get_test_file()) | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
| @@ -513,6 +530,21 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         self.fail("Should throw exception") |         self.fail("Should throw exception") | ||||||
|  |  | ||||||
|  |     def testDuplicates2(self): | ||||||
|  |         self.consumer.try_consume_file(self.get_test_file()) | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             self.consumer.try_consume_file(self.get_test_archive_file()) | ||||||
|  |         except ConsumerError as e: | ||||||
|  |             self.assertTrue(str(e).endswith("It is a duplicate.")) | ||||||
|  |             return | ||||||
|  |  | ||||||
|  |         self.fail("Should throw exception") | ||||||
|  |  | ||||||
|  |     def testDuplicates3(self): | ||||||
|  |         self.consumer.try_consume_file(self.get_test_archive_file()) | ||||||
|  |         self.consumer.try_consume_file(self.get_test_file()) | ||||||
|  |  | ||||||
|     @mock.patch("documents.parsers.document_consumer_declaration.send") |     @mock.patch("documents.parsers.document_consumer_declaration.send") | ||||||
|     def testNoParsers(self, m): |     def testNoParsers(self, m): | ||||||
|         m.return_value = [] |         m.return_value = [] | ||||||
|   | |||||||
							
								
								
									
										140
									
								
								src/documents/tests/test_date_parsing.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,140 @@ | |||||||
|  | import datetime | ||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  | from unittest import mock | ||||||
|  | from uuid import uuid4 | ||||||
|  |  | ||||||
|  | from dateutil import tz | ||||||
|  | from django.conf import settings | ||||||
|  | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
|  | from documents.parsers import parse_date | ||||||
|  | from paperless_tesseract.parsers import RasterisedDocumentParser | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestDate(TestCase): | ||||||
|  |  | ||||||
|  |     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples") | ||||||
|  |     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) | ||||||
|  |  | ||||||
|  |     def setUp(self): | ||||||
|  |         os.makedirs(self.SCRATCH, exist_ok=True) | ||||||
|  |  | ||||||
|  |     def tearDown(self): | ||||||
|  |         shutil.rmtree(self.SCRATCH) | ||||||
|  |  | ||||||
|  |     def test_date_format_1(self): | ||||||
|  |         text = "lorem ipsum 130218 lorem ipsum" | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_2(self): | ||||||
|  |         text = "lorem ipsum 2018 lorem ipsum" | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_3(self): | ||||||
|  |         text = "lorem ipsum 20180213 lorem ipsum" | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_4(self): | ||||||
|  |         text = "lorem ipsum 13.02.2018 lorem ipsum" | ||||||
|  |         date = parse_date("", text) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2018, 2, 13, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_date_format_5(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " | ||||||
|  |             "ipsum" | ||||||
|  |         ) | ||||||
|  |         date = parse_date("", text) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2018, 2, 13, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_date_format_6(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "Wohnort\n" | ||||||
|  |             "3100\n" | ||||||
|  |             "IBAN\n" | ||||||
|  |             "AT87 4534\n" | ||||||
|  |             "1234\n" | ||||||
|  |             "1234 5678\n" | ||||||
|  |             "BIC\n" | ||||||
|  |             "lorem ipsum" | ||||||
|  |         ) | ||||||
|  |         self.assertEqual(parse_date("", text), None) | ||||||
|  |  | ||||||
|  |     def test_date_format_7(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "März 2019\n" | ||||||
|  |             "lorem ipsum" | ||||||
|  |         ) | ||||||
|  |         date = parse_date("", text) | ||||||
|  |         self.assertEqual( | ||||||
|  |             date, | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2019, 3, 1, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_date_format_8(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "Wohnort\n" | ||||||
|  |             "3100\n" | ||||||
|  |             "IBAN\n" | ||||||
|  |             "AT87 4534\n" | ||||||
|  |             "1234\n" | ||||||
|  |             "1234 5678\n" | ||||||
|  |             "BIC\n" | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "März 2020" | ||||||
|  |         ) | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("", text), | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2020, 3, 1, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(SCRATCH_DIR=SCRATCH) | ||||||
|  |     def test_date_format_9(self): | ||||||
|  |         text = ( | ||||||
|  |             "lorem ipsum\n" | ||||||
|  |             "27. Nullmonth 2020\n" | ||||||
|  |             "März 2020\n" | ||||||
|  |             "lorem ipsum" | ||||||
|  |         ) | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("", text), | ||||||
|  |             datetime.datetime( | ||||||
|  |                 2020, 3, 1, 0, 0, | ||||||
|  |                 tzinfo=tz.gettz(settings.TIME_ZONE) | ||||||
|  |             ) | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_crazy_date_past(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("", "01-07-0590 00:00:00")) | ||||||
|  |  | ||||||
|  |     def test_crazy_date_future(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("", "01-07-2350 00:00:00")) | ||||||
|  |  | ||||||
|  |     def test_crazy_date_with_spaces(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("", "20 408000l 2475")) | ||||||
|  |  | ||||||
|  |     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||||
|  |     def test_filename_date_parse_invalid(self, *args): | ||||||
|  |         self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here")) | ||||||
| @@ -1,12 +1,29 @@ | |||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  | import tempfile | ||||||
|  | from pathlib import Path | ||||||
| from unittest import mock | from unittest import mock | ||||||
|  |  | ||||||
| from django.test import TestCase | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
| from ..models import Document, Correspondent | from ..models import Document, Correspondent | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestDocument(TestCase): | class TestDocument(TestCase): | ||||||
|  |  | ||||||
|  |     def setUp(self) -> None: | ||||||
|  |         self.originals_dir = tempfile.mkdtemp() | ||||||
|  |         self.thumb_dir = tempfile.mkdtemp() | ||||||
|  |  | ||||||
|  |         override_settings( | ||||||
|  |             ORIGINALS_DIR=self.originals_dir, | ||||||
|  |             THUMBNAIL_DIR=self.thumb_dir, | ||||||
|  |         ).enable() | ||||||
|  |  | ||||||
|  |     def tearDown(self) -> None: | ||||||
|  |         shutil.rmtree(self.originals_dir) | ||||||
|  |         shutil.rmtree(self.thumb_dir) | ||||||
|  |  | ||||||
|     def test_file_deletion(self): |     def test_file_deletion(self): | ||||||
|         document = Document.objects.create( |         document = Document.objects.create( | ||||||
|             correspondent=Correspondent.objects.create(name="Test0"), |             correspondent=Correspondent.objects.create(name="Test0"), | ||||||
| @@ -19,6 +36,9 @@ class TestDocument(TestCase): | |||||||
|         file_path = document.source_path |         file_path = document.source_path | ||||||
|         thumb_path = document.thumbnail_path |         thumb_path = document.thumbnail_path | ||||||
|  |  | ||||||
|  |         Path(file_path).touch() | ||||||
|  |         Path(thumb_path).touch() | ||||||
|  |  | ||||||
|         with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: |         with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: | ||||||
|             document.delete() |             document.delete() | ||||||
|             mock_unlink.assert_any_call(file_path) |             mock_unlink.assert_any_call(file_path) | ||||||
|   | |||||||
| @@ -2,32 +2,17 @@ import os | |||||||
| import shutil | import shutil | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| from unittest import mock | from unittest import mock | ||||||
| from uuid import uuid4 |  | ||||||
|  |  | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from django.db import DatabaseError | from django.db import DatabaseError | ||||||
| from django.test import TestCase, override_settings | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
|  | from .utils import DirectoriesMixin | ||||||
| from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories | ||||||
| from ..models import Document, Correspondent | from ..models import Document, Correspondent | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestDate(TestCase): | class TestFileHandling(DirectoriesMixin, TestCase): | ||||||
|     deletion_list = [] |  | ||||||
|  |  | ||||||
|     def add_to_deletion_list(self, dirname): |  | ||||||
|         self.deletion_list.append(dirname) |  | ||||||
|  |  | ||||||
|     def setUp(self): |  | ||||||
|         folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) |  | ||||||
|         os.makedirs(folder + "/documents/originals") |  | ||||||
|         override_settings(MEDIA_ROOT=folder).enable() |  | ||||||
|         override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable() |  | ||||||
|         self.add_to_deletion_list(folder) |  | ||||||
|  |  | ||||||
|     def tearDown(self): |  | ||||||
|         for dirname in self.deletion_list: |  | ||||||
|             shutil.rmtree(dirname, ignore_errors=True) |  | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="") |     @override_settings(PAPERLESS_FILENAME_FORMAT="") | ||||||
|     def test_generate_source_filename(self): |     def test_generate_source_filename(self): | ||||||
| @@ -104,7 +89,7 @@ class TestDate(TestCase): | |||||||
|         document.save() |         document.save() | ||||||
|  |  | ||||||
|         # Check proper handling of files |         # Check proper handling of files | ||||||
|         self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) |         self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||||
|         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) |         self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||||
|  |  | ||||||
|         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) |         os.chmod(settings.ORIGINALS_DIR + "/none", 0o777) | ||||||
| @@ -140,7 +125,7 @@ class TestDate(TestCase): | |||||||
|  |  | ||||||
|             # Check proper handling of files |             # Check proper handling of files | ||||||
|             self.assertTrue(os.path.isfile(document.source_path)) |             self.assertTrue(os.path.isfile(document.source_path)) | ||||||
|             self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True) |             self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True) | ||||||
|             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) |             self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk)) | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}") | ||||||
| @@ -196,8 +181,8 @@ class TestDate(TestCase): | |||||||
|         document.save() |         document.save() | ||||||
|  |  | ||||||
|         # Check proper handling of files |         # Check proper handling of files | ||||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True) |         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True) | ||||||
|         self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True) |         self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True) | ||||||
|         self.assertTrue(os.path.isfile(important_file)) |         self.assertTrue(os.path.isfile(important_file)) | ||||||
|  |  | ||||||
|     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") |     @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}") | ||||||
| @@ -315,13 +300,12 @@ class TestDate(TestCase): | |||||||
|         # Create our working directory |         # Create our working directory | ||||||
|         tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") |         tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty") | ||||||
|         os.makedirs(tmp) |         os.makedirs(tmp) | ||||||
|         self.add_to_deletion_list(tmp) |  | ||||||
|  |  | ||||||
|         os.makedirs(os.path.join(tmp, "notempty")) |         os.makedirs(os.path.join(tmp, "notempty")) | ||||||
|         Path(os.path.join(tmp, "notempty", "file")).touch() |         Path(os.path.join(tmp, "notempty", "file")).touch() | ||||||
|         os.makedirs(os.path.join(tmp, "notempty", "empty")) |         os.makedirs(os.path.join(tmp, "notempty", "empty")) | ||||||
|  |  | ||||||
|         delete_empty_directories(os.path.join(tmp, "notempty", "empty")) |         delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR) | ||||||
|         self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) |         self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) | ||||||
|         self.assertEqual(os.path.isfile( |         self.assertEqual(os.path.isfile( | ||||||
|             os.path.join(tmp, "notempty", "file")), True) |             os.path.join(tmp, "notempty", "file")), True) | ||||||
| @@ -345,3 +329,159 @@ class TestDate(TestCase): | |||||||
|         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED |         document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||||
|  |  | ||||||
|         self.assertEqual(generate_filename(document), "0000001.pdf") |         self.assertEqual(generate_filename(document), "0000001.pdf") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestFileHandlingWithArchive(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT=None) | ||||||
|  |     def test_create_no_format(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_create_with_format(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertFalse(os.path.isfile(original)) | ||||||
|  |         self.assertFalse(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |         self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf")) | ||||||
|  |         self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_move_archive_gone(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         #Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertFalse(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_move_archive_exists(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none")) | ||||||
|  |         Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     @mock.patch("documents.signals.handlers.os.rename") | ||||||
|  |     def test_move_archive_error(self, m): | ||||||
|  |  | ||||||
|  |         def fake_rename(src, dst): | ||||||
|  |             if "archive" in src: | ||||||
|  |                 raise OSError() | ||||||
|  |             else: | ||||||
|  |                 os.remove(src) | ||||||
|  |                 Path(dst).touch() | ||||||
|  |  | ||||||
|  |         m.side_effect = fake_rename | ||||||
|  |  | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_move_file_gone(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         #Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertFalse(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     @mock.patch("documents.signals.handlers.os.rename") | ||||||
|  |     def test_move_file_error(self, m): | ||||||
|  |  | ||||||
|  |         def fake_rename(src, dst): | ||||||
|  |             if "original" in src: | ||||||
|  |                 raise OSError() | ||||||
|  |             else: | ||||||
|  |                 os.remove(src) | ||||||
|  |                 Path(dst).touch() | ||||||
|  |  | ||||||
|  |         m.side_effect = fake_rename | ||||||
|  |  | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     def test_archive_deleted(self): | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |         doc.delete() | ||||||
|  |  | ||||||
|  |         self.assertFalse(os.path.isfile(original)) | ||||||
|  |         self.assertFalse(os.path.isfile(archive)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertFalse(os.path.isfile(doc.archive_path)) | ||||||
|  |  | ||||||
|  |     @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}") | ||||||
|  |     def test_database_error(self): | ||||||
|  |  | ||||||
|  |         original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf") | ||||||
|  |         archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf") | ||||||
|  |         Path(original).touch() | ||||||
|  |         Path(archive).touch() | ||||||
|  |         doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B") | ||||||
|  |         with mock.patch("documents.signals.handlers.Document.objects.filter") as m: | ||||||
|  |             m.side_effect = DatabaseError() | ||||||
|  |             doc.save() | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(original)) | ||||||
|  |         self.assertTrue(os.path.isfile(archive)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|   | |||||||
							
								
								
									
										42
									
								
								src/documents/tests/test_management_archiver.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -0,0 +1,42 @@ | |||||||
|  | import filecmp | ||||||
|  | import os | ||||||
|  | import shutil | ||||||
|  |  | ||||||
|  | from django.core.management import call_command | ||||||
|  | from django.test import TestCase | ||||||
|  |  | ||||||
|  | from documents.management.commands.document_archiver import handle_document | ||||||
|  | from documents.models import Document | ||||||
|  | from documents.tests.utils import DirectoriesMixin | ||||||
|  |  | ||||||
|  |  | ||||||
|  | sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestArchiver(DirectoriesMixin, TestCase): | ||||||
|  |  | ||||||
|  |     def make_models(self): | ||||||
|  |         self.d1 = Document.objects.create(checksum="A", title="A", content="first document", pk=1, mime_type="application/pdf") | ||||||
|  |         #self.d2 = Document.objects.create(checksum="B", title="B", content="second document") | ||||||
|  |         #self.d3 = Document.objects.create(checksum="C", title="C", content="unrelated document") | ||||||
|  |  | ||||||
|  |     def test_archiver(self): | ||||||
|  |  | ||||||
|  |         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||||
|  |         self.make_models() | ||||||
|  |  | ||||||
|  |         call_command('document_archiver') | ||||||
|  |  | ||||||
|  |     def test_handle_document(self): | ||||||
|  |  | ||||||
|  |         shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) | ||||||
|  |         self.make_models() | ||||||
|  |  | ||||||
|  |         handle_document(self.d1) | ||||||
|  |  | ||||||
|  |         doc = Document.objects.get(id=self.d1.id) | ||||||
|  |  | ||||||
|  |         self.assertIsNotNone(doc.checksum) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.archive_path)) | ||||||
|  |         self.assertTrue(os.path.isfile(doc.source_path)) | ||||||
|  |         self.assertTrue(filecmp.cmp(sample_file, doc.source_path)) | ||||||
| @@ -23,10 +23,7 @@ class TestExporter(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") |         file = os.path.join(self.dirs.originals_dir, "0000001.pdf") | ||||||
|  |  | ||||||
|         with open(file, "rb") as f: |         Document.objects.create(checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") | ||||||
|             checksum = hashlib.md5(f.read()).hexdigest() |  | ||||||
|  |  | ||||||
|         Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") |  | ||||||
|         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) |         Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) | ||||||
|         Tag.objects.create(name="t") |         Tag.objects.create(name="t") | ||||||
|         DocumentType.objects.create(name="dt") |         DocumentType.objects.create(name="dt") | ||||||
| @@ -51,6 +48,14 @@ class TestExporter(DirectoriesMixin, TestCase): | |||||||
|                     checksum = hashlib.md5(f.read()).hexdigest() |                     checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|                 self.assertEqual(checksum, element['fields']['checksum']) |                 self.assertEqual(checksum, element['fields']['checksum']) | ||||||
|  |  | ||||||
|  |                 if document_exporter.EXPORTER_ARCHIVE_NAME in element: | ||||||
|  |                     fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME]) | ||||||
|  |                     self.assertTrue(os.path.exists(fname)) | ||||||
|  |  | ||||||
|  |                     with open(fname, "rb") as f: | ||||||
|  |                         checksum = hashlib.md5(f.read()).hexdigest() | ||||||
|  |                     self.assertEqual(checksum, element['fields']['archive_checksum']) | ||||||
|  |  | ||||||
|         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") |         Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") | ||||||
|  |  | ||||||
|         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) |         self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) | ||||||
|   | |||||||
| @@ -1,11 +1,13 @@ | |||||||
| import os | import os | ||||||
|  | import shutil | ||||||
|  | import tempfile | ||||||
| from tempfile import TemporaryDirectory | from tempfile import TemporaryDirectory | ||||||
| from unittest import mock | from unittest import mock | ||||||
|  |  | ||||||
| from django.test import TestCase | from django.test import TestCase, override_settings | ||||||
|  |  | ||||||
| from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \ | ||||||
|     get_parser_class_for_mime_type |     get_parser_class_for_mime_type, DocumentParser | ||||||
| from paperless_tesseract.parsers import RasterisedDocumentParser | from paperless_tesseract.parsers import RasterisedDocumentParser | ||||||
| from paperless_text.parsers import TextDocumentParser | from paperless_text.parsers import TextDocumentParser | ||||||
|  |  | ||||||
| @@ -66,6 +68,38 @@ class TestParserDiscovery(TestCase): | |||||||
|             ) |             ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def fake_get_thumbnail(self, path, mimetype): | ||||||
|  |     return os.path.join(os.path.dirname(__file__), "examples", "no-text.png") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestBaseParser(TestCase): | ||||||
|  |  | ||||||
|  |     def setUp(self) -> None: | ||||||
|  |  | ||||||
|  |         self.scratch = tempfile.mkdtemp() | ||||||
|  |         override_settings( | ||||||
|  |             SCRATCH_DIR=self.scratch | ||||||
|  |         ).enable() | ||||||
|  |  | ||||||
|  |     def tearDown(self) -> None: | ||||||
|  |         shutil.rmtree(self.scratch) | ||||||
|  |  | ||||||
|  |     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||||
|  |     @override_settings(OPTIMIZE_THUMBNAILS=True) | ||||||
|  |     def test_get_optimised_thumbnail(self): | ||||||
|  |         parser = DocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.get_optimised_thumbnail("any", "not important") | ||||||
|  |  | ||||||
|  |     @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail) | ||||||
|  |     @override_settings(OPTIMIZE_THUMBNAILS=False) | ||||||
|  |     def test_get_optimised_thumb_disabled(self): | ||||||
|  |         parser = DocumentParser(None) | ||||||
|  |  | ||||||
|  |         path = parser.get_optimised_thumbnail("any", "not important") | ||||||
|  |         self.assertEqual(path, fake_get_thumbnail(None, None, None)) | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestParserAvailability(TestCase): | class TestParserAvailability(TestCase): | ||||||
|  |  | ||||||
|     def test_file_extensions(self): |     def test_file_extensions(self): | ||||||
|   | |||||||
| @@ -17,10 +17,12 @@ def setup_directories(): | |||||||
|     dirs.index_dir = os.path.join(dirs.data_dir, "index") |     dirs.index_dir = os.path.join(dirs.data_dir, "index") | ||||||
|     dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals") |     dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals") | ||||||
|     dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails") |     dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails") | ||||||
|  |     dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive") | ||||||
|  |  | ||||||
|     os.makedirs(dirs.index_dir, exist_ok=True) |     os.makedirs(dirs.index_dir, exist_ok=True) | ||||||
|     os.makedirs(dirs.originals_dir, exist_ok=True) |     os.makedirs(dirs.originals_dir, exist_ok=True) | ||||||
|     os.makedirs(dirs.thumbnail_dir, exist_ok=True) |     os.makedirs(dirs.thumbnail_dir, exist_ok=True) | ||||||
|  |     os.makedirs(dirs.archive_dir, exist_ok=True) | ||||||
|  |  | ||||||
|     override_settings( |     override_settings( | ||||||
|         DATA_DIR=dirs.data_dir, |         DATA_DIR=dirs.data_dir, | ||||||
| @@ -28,6 +30,7 @@ def setup_directories(): | |||||||
|         MEDIA_ROOT=dirs.media_dir, |         MEDIA_ROOT=dirs.media_dir, | ||||||
|         ORIGINALS_DIR=dirs.originals_dir, |         ORIGINALS_DIR=dirs.originals_dir, | ||||||
|         THUMBNAIL_DIR=dirs.thumbnail_dir, |         THUMBNAIL_DIR=dirs.thumbnail_dir, | ||||||
|  |         ARCHIVE_DIR=dirs.archive_dir, | ||||||
|         CONSUMPTION_DIR=dirs.consumption_dir, |         CONSUMPTION_DIR=dirs.consumption_dir, | ||||||
|         INDEX_DIR=dirs.index_dir, |         INDEX_DIR=dirs.index_dir, | ||||||
|         MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") |         MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") | ||||||
|   | |||||||
| @@ -1,3 +1,5 @@ | |||||||
|  | import os | ||||||
|  |  | ||||||
| from django.db.models import Count, Max | from django.db.models import Count, Max | ||||||
| from django.http import HttpResponse, HttpResponseBadRequest, Http404 | from django.http import HttpResponse, HttpResponseBadRequest, Http404 | ||||||
| from django.views.decorators.cache import cache_control | from django.views.decorators.cache import cache_control | ||||||
| @@ -126,17 +128,30 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|         index.remove_document_from_index(self.get_object()) |         index.remove_document_from_index(self.get_object()) | ||||||
|         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) |         return super(DocumentViewSet, self).destroy(request, *args, **kwargs) | ||||||
|  |  | ||||||
|     def file_response(self, pk, disposition): |     @staticmethod | ||||||
|  |     def original_requested(request): | ||||||
|  |         return ( | ||||||
|  |             'original' in request.query_params and | ||||||
|  |             request.query_params['original'] == 'true' | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def file_response(self, pk, request, disposition): | ||||||
|         doc = Document.objects.get(id=pk) |         doc = Document.objects.get(id=pk) | ||||||
|  |         if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501 | ||||||
|         if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: |             file_handle = doc.archive_file | ||||||
|             file_handle = doc.source_file |             filename = doc.archive_file_name | ||||||
|  |             mime_type = 'application/pdf' | ||||||
|         else: |         else: | ||||||
|             file_handle = GnuPG.decrypted(doc.source_file) |             file_handle = doc.source_file | ||||||
|  |             filename = doc.file_name | ||||||
|  |             mime_type = doc.mime_type | ||||||
|  |  | ||||||
|         response = HttpResponse(file_handle, content_type=doc.mime_type) |         if doc.storage_type == Document.STORAGE_TYPE_GPG: | ||||||
|  |             file_handle = GnuPG.decrypted(file_handle) | ||||||
|  |  | ||||||
|  |         response = HttpResponse(file_handle, content_type=mime_type) | ||||||
|         response["Content-Disposition"] = '{}; filename="{}"'.format( |         response["Content-Disposition"] = '{}; filename="{}"'.format( | ||||||
|             disposition, doc.file_name) |             disposition, filename) | ||||||
|         return response |         return response | ||||||
|  |  | ||||||
|     @action(methods=['post'], detail=False) |     @action(methods=['post'], detail=False) | ||||||
| @@ -157,6 +172,8 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|                 "paperless__checksum": doc.checksum, |                 "paperless__checksum": doc.checksum, | ||||||
|                 "paperless__mime_type": doc.mime_type, |                 "paperless__mime_type": doc.mime_type, | ||||||
|                 "paperless__filename": doc.filename, |                 "paperless__filename": doc.filename, | ||||||
|  |                 "paperless__has_archive_version": | ||||||
|  |                     os.path.isfile(doc.archive_path) | ||||||
|             }) |             }) | ||||||
|         except Document.DoesNotExist: |         except Document.DoesNotExist: | ||||||
|             raise Http404() |             raise Http404() | ||||||
| @@ -164,7 +181,8 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|     @action(methods=['get'], detail=True) |     @action(methods=['get'], detail=True) | ||||||
|     def preview(self, request, pk=None): |     def preview(self, request, pk=None): | ||||||
|         try: |         try: | ||||||
|             response = self.file_response(pk, "inline") |             response = self.file_response( | ||||||
|  |                 pk, request, "inline") | ||||||
|             return response |             return response | ||||||
|         except (FileNotFoundError, Document.DoesNotExist): |         except (FileNotFoundError, Document.DoesNotExist): | ||||||
|             raise Http404() |             raise Http404() | ||||||
| @@ -181,7 +199,8 @@ class DocumentViewSet(RetrieveModelMixin, | |||||||
|     @action(methods=['get'], detail=True) |     @action(methods=['get'], detail=True) | ||||||
|     def download(self, request, pk=None): |     def download(self, request, pk=None): | ||||||
|         try: |         try: | ||||||
|             return self.file_response(pk, "attachment") |             return self.file_response( | ||||||
|  |                 pk, request, "attachment") | ||||||
|         except (FileNotFoundError, Document.DoesNotExist): |         except (FileNotFoundError, Document.DoesNotExist): | ||||||
|             raise Http404() |             raise Http404() | ||||||
|  |  | ||||||
|   | |||||||
| @@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs): | |||||||
|     binaries = ( |     binaries = ( | ||||||
|         settings.CONVERT_BINARY, |         settings.CONVERT_BINARY, | ||||||
|         settings.OPTIPNG_BINARY, |         settings.OPTIPNG_BINARY, | ||||||
|         settings.UNPAPER_BINARY, |  | ||||||
|         "tesseract" |         "tesseract" | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta | |||||||
|  |  | ||||||
| MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) | MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) | ||||||
| ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") | ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") | ||||||
|  | ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") | ||||||
| THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") | ||||||
|  |  | ||||||
| DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) | ||||||
| @@ -348,9 +349,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) | |||||||
| # documents.  It should be a 3-letter language code consistent with ISO 639. | # documents.  It should be a 3-letter language code consistent with ISO 639. | ||||||
| OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||||
|  |  | ||||||
|  | # OCRmyPDF --output-type options are available. | ||||||
|  | # TODO: validate this setting. | ||||||
|  | OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") | ||||||
|  |  | ||||||
| # OCR all documents? | # skip. redo, force | ||||||
| OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") | # TODO: validate this. | ||||||
|  | OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | ||||||
|  |  | ||||||
|  | OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||||
|  |  | ||||||
|  | OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||||
|  |  | ||||||
| # GNUPG needs a home directory for some reason | # GNUPG needs a home directory for some reason | ||||||
| GNUPG_HOME = os.getenv("HOME", "/tmp") | GNUPG_HOME = os.getenv("HOME", "/tmp") | ||||||
| @@ -359,11 +368,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp") | |||||||
| CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") | CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert") | ||||||
| CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR") | ||||||
| CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT") | ||||||
| CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300)) |  | ||||||
|  |  | ||||||
| GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") | GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs") | ||||||
|  |  | ||||||
| OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng") | ||||||
| UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Pre-2.x versions of Paperless stored your documents locally with GPG | # Pre-2.x versions of Paperless stored your documents locally with GPG | ||||||
|   | |||||||
| @@ -14,12 +14,21 @@ def get_tesseract_langs(): | |||||||
|  |  | ||||||
| @register() | @register() | ||||||
| def check_default_language_available(app_configs, **kwargs): | def check_default_language_available(app_configs, **kwargs): | ||||||
|     langs = get_tesseract_langs() |     installed_langs = get_tesseract_langs() | ||||||
|  |  | ||||||
|     if settings.OCR_LANGUAGE not in langs: |     if not settings.OCR_LANGUAGE: | ||||||
|  |         return [Warning( | ||||||
|  |             "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " | ||||||
|  |             "This means that tesseract will fallback to english." | ||||||
|  |         )] | ||||||
|  |  | ||||||
|  |     specified_langs = settings.OCR_LANGUAGE.split("+") | ||||||
|  |  | ||||||
|  |     for lang in specified_langs: | ||||||
|  |         if lang not in installed_langs: | ||||||
|             return [Error( |             return [Error( | ||||||
|             f"The default ocr language {settings.OCR_LANGUAGE} is " |                 f"The selected ocr language {lang} is " | ||||||
|                 f"not installed. Paperless cannot OCR your documents " |                 f"not installed. Paperless cannot OCR your documents " | ||||||
|                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] |                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] | ||||||
|     else: |  | ||||||
|     return [] |     return [] | ||||||
|   | |||||||
| @@ -1,23 +1,15 @@ | |||||||
| import itertools | import json | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import subprocess | import subprocess | ||||||
| from multiprocessing.pool import ThreadPool |  | ||||||
|  |  | ||||||
| import langdetect | import ocrmypdf | ||||||
| import pdftotext | import pdftotext | ||||||
| import pyocr |  | ||||||
| from PIL import Image | from PIL import Image | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| from pyocr import PyocrException | from ocrmypdf import InputFileError | ||||||
|  |  | ||||||
| from documents.parsers import DocumentParser, ParseError, run_unpaper, \ | from documents.parsers import DocumentParser, ParseError, run_convert | ||||||
|     run_convert |  | ||||||
| from .languages import ISO639 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class OCRError(Exception): |  | ||||||
|     pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class RasterisedDocumentParser(DocumentParser): | class RasterisedDocumentParser(DocumentParser): | ||||||
| @@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) |     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         super().__init__(path, logging_group) |  | ||||||
|         self._text = None |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |         """ | ||||||
|         The thumbnail of a PDF is just a 500px wide image of the first page. |         The thumbnail of a PDF is just a 500px wide image of the first page. | ||||||
|         """ |         """ | ||||||
| @@ -44,7 +32,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                         alpha="remove", |                         alpha="remove", | ||||||
|                         strip=True, |                         strip=True, | ||||||
|                         trim=True, |                         trim=True, | ||||||
|                         input_file="{}[0]".format(self.document_path), |                         input_file="{}[0]".format(document_path), | ||||||
|                         output_file=out_path, |                         output_file=out_path, | ||||||
|                         logging_group=self.logging_group) |                         logging_group=self.logging_group) | ||||||
|         except ParseError: |         except ParseError: | ||||||
| @@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                    "-q", |                    "-q", | ||||||
|                    "-sDEVICE=pngalpha", |                    "-sDEVICE=pngalpha", | ||||||
|                    "-o", gs_out_path, |                    "-o", gs_out_path, | ||||||
|                    self.document_path] |                    document_path] | ||||||
|             if not subprocess.Popen(cmd).wait() == 0: |             if not subprocess.Popen(cmd).wait() == 0: | ||||||
|                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) |                 raise ParseError("Thumbnail (gs) failed at {}".format(cmd)) | ||||||
|             # then run convert on the output from gs |             # then run convert on the output from gs | ||||||
| @@ -74,169 +62,126 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         return out_path |         return out_path | ||||||
|  |  | ||||||
|     def _is_ocred(self): |     def is_image(self, mime_type): | ||||||
|  |         return mime_type in [ | ||||||
|         # Extract text from PDF using pdftotext |             "image/png", | ||||||
|         text = get_text_from_pdf(self.document_path) |             "image/jpeg" | ||||||
|  |         ] | ||||||
|         # We assume, that a PDF with at least 50 characters contains text |  | ||||||
|         # (so no OCR required) |  | ||||||
|         return len(text) > 50 |  | ||||||
|  |  | ||||||
|     def get_text(self): |  | ||||||
|  |  | ||||||
|         if self._text is not None: |  | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         if not settings.OCR_ALWAYS and self._is_ocred(): |  | ||||||
|             self.log("debug", "Skipping OCR, using Text from PDF") |  | ||||||
|             self._text = get_text_from_pdf(self.document_path) |  | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         images = self._get_greyscale() |  | ||||||
|  |  | ||||||
|         if not images: |  | ||||||
|             raise ParseError("Empty document, nothing to do.") |  | ||||||
|  |  | ||||||
|  |     def get_dpi(self, image): | ||||||
|         try: |         try: | ||||||
|  |             with Image.open(image) as im: | ||||||
|             sample_page_index = int(len(images) / 2) |                 x, y = im.info['dpi'] | ||||||
|             self.log( |                 return x | ||||||
|                 "debug", |  | ||||||
|                 f"Attempting language detection on page " |  | ||||||
|                 f"{sample_page_index + 1} of {len(images)}...") |  | ||||||
|  |  | ||||||
|             sample_page_text = self._ocr([images[sample_page_index]], |  | ||||||
|                                          settings.OCR_LANGUAGE)[0] |  | ||||||
|             guessed_language = self._guess_language(sample_page_text) |  | ||||||
|  |  | ||||||
|             if not guessed_language or guessed_language not in ISO639: |  | ||||||
|                 self.log("warning", "Language detection failed.") |  | ||||||
|                 ocr_pages = self._complete_ocr_default_language( |  | ||||||
|                     images, sample_page_index, sample_page_text) |  | ||||||
|  |  | ||||||
|             elif ISO639[guessed_language] == settings.OCR_LANGUAGE: |  | ||||||
|                 self.log( |  | ||||||
|                     "debug", |  | ||||||
|                     f"Detected language: {guessed_language} " |  | ||||||
|                     f"(default language)") |  | ||||||
|                 ocr_pages = self._complete_ocr_default_language( |  | ||||||
|                     images, sample_page_index, sample_page_text) |  | ||||||
|  |  | ||||||
|             elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501 |  | ||||||
|                 self.log( |  | ||||||
|                     "warning", |  | ||||||
|                     f"Detected language {guessed_language} is not available " |  | ||||||
|                     f"on this system.") |  | ||||||
|                 ocr_pages = self._complete_ocr_default_language( |  | ||||||
|                     images, sample_page_index, sample_page_text) |  | ||||||
|  |  | ||||||
|             else: |  | ||||||
|                 self.log("debug", f"Detected language: {guessed_language}") |  | ||||||
|                 ocr_pages = self._ocr(images, ISO639[guessed_language]) |  | ||||||
|  |  | ||||||
|             self.log("debug", "OCR completed.") |  | ||||||
|             self._text = strip_excess_whitespace(" ".join(ocr_pages)) |  | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         except OCRError as e: |  | ||||||
|             raise ParseError(e) |  | ||||||
|  |  | ||||||
|     def _get_greyscale(self): |  | ||||||
|         """ |  | ||||||
|         Greyscale images are easier for Tesseract to OCR |  | ||||||
|         """ |  | ||||||
|  |  | ||||||
|         # Convert PDF to multiple PNMs |  | ||||||
|         input_file = self.document_path |  | ||||||
|  |  | ||||||
|         if settings.OCR_PAGES == 1: |  | ||||||
|             input_file += "[0]" |  | ||||||
|         elif settings.OCR_PAGES > 1: |  | ||||||
|             input_file += f"[0-{settings.OCR_PAGES - 1}]" |  | ||||||
|  |  | ||||||
|         self.log( |  | ||||||
|             "debug", |  | ||||||
|             f"Converting document {input_file} into greyscale images") |  | ||||||
|  |  | ||||||
|         output_files = os.path.join(self.tempdir, "convert-%04d.pnm") |  | ||||||
|  |  | ||||||
|         run_convert(density=settings.CONVERT_DENSITY, |  | ||||||
|                     depth="8", |  | ||||||
|                     type="grayscale", |  | ||||||
|                     input_file=input_file, |  | ||||||
|                     output_file=output_files, |  | ||||||
|                     logging_group=self.logging_group) |  | ||||||
|  |  | ||||||
|         # Get a list of converted images |  | ||||||
|         pnms = [] |  | ||||||
|         for f in os.listdir(self.tempdir): |  | ||||||
|             if f.endswith(".pnm"): |  | ||||||
|                 pnms.append(os.path.join(self.tempdir, f)) |  | ||||||
|  |  | ||||||
|         self.log("debug", f"Running unpaper on {len(pnms)} pages...") |  | ||||||
|  |  | ||||||
|         # Run unpaper in parallel on converted images |  | ||||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: |  | ||||||
|             pnms = pool.map(run_unpaper, pnms) |  | ||||||
|  |  | ||||||
|         return sorted(filter(lambda __: os.path.isfile(__), pnms)) |  | ||||||
|  |  | ||||||
|     def _guess_language(self, text): |  | ||||||
|         try: |  | ||||||
|             guess = langdetect.detect(text) |  | ||||||
|             return guess |  | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             self.log('warning', f"Language detection failed with: {e}") |             self.log( | ||||||
|  |                 'warning', | ||||||
|  |                 f"Error while getting DPI from image {image}: {e}") | ||||||
|             return None |             return None | ||||||
|  |  | ||||||
|     def _ocr(self, imgs, lang): |     def parse(self, document_path, mime_type): | ||||||
|  |         if settings.OCR_MODE == "skip_noarchive": | ||||||
|  |             text = get_text_from_pdf(document_path) | ||||||
|  |             if text and len(text) > 50: | ||||||
|  |                 self.text = text | ||||||
|  |                 return | ||||||
|  |  | ||||||
|  |         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||||
|  |  | ||||||
|  |         ocr_args = { | ||||||
|  |             'input_file': document_path, | ||||||
|  |             'output_file': archive_path, | ||||||
|  |             'use_threads': True, | ||||||
|  |             'jobs': settings.THREADS_PER_WORKER, | ||||||
|  |             'language': settings.OCR_LANGUAGE, | ||||||
|  |             'output_type': settings.OCR_OUTPUT_TYPE, | ||||||
|  |             'progress_bar': False, | ||||||
|  |             'clean': True | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if settings.OCR_PAGES > 0: | ||||||
|  |             ocr_args['pages'] = f"1-{settings.OCR_PAGES}" | ||||||
|  |  | ||||||
|  |         if settings.OCR_MODE in ['skip', 'skip_noarchive']: | ||||||
|  |             ocr_args['skip_text'] = True | ||||||
|  |         elif settings.OCR_MODE == 'redo': | ||||||
|  |             ocr_args['redo_ocr'] = True | ||||||
|  |         elif settings.OCR_MODE == 'force': | ||||||
|  |             ocr_args['force_ocr'] = True | ||||||
|  |  | ||||||
|  |         if self.is_image(mime_type): | ||||||
|  |             dpi = self.get_dpi(document_path) | ||||||
|  |             if dpi: | ||||||
|                 self.log( |                 self.log( | ||||||
|                     "debug", |                     "debug", | ||||||
|             f"Performing OCR on {len(imgs)} page(s) with language {lang}") |                     f"Detected DPI for image {document_path}: {dpi}" | ||||||
|         with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: |                 ) | ||||||
|             r = pool.map(image_to_string, itertools.product(imgs, [lang])) |                 ocr_args['image_dpi'] = dpi | ||||||
|             return r |             elif settings.OCR_IMAGE_DPI: | ||||||
|  |                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||||
|     def _complete_ocr_default_language(self, |  | ||||||
|                                        images, |  | ||||||
|                                        sample_page_index, |  | ||||||
|                                        sample_page): |  | ||||||
|         images_copy = list(images) |  | ||||||
|         del images_copy[sample_page_index] |  | ||||||
|         if images_copy: |  | ||||||
|             self.log('debug', "Continuing ocr with default language.") |  | ||||||
|             ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) |  | ||||||
|             ocr_pages.insert(sample_page_index, sample_page) |  | ||||||
|             return ocr_pages |  | ||||||
|             else: |             else: | ||||||
|             return [sample_page] |                 raise ParseError( | ||||||
|  |                     f"Cannot produce archive PDF for image {document_path}, " | ||||||
|  |                     f"no DPI information is present in this image and " | ||||||
|  |                     f"OCR_IMAGE_DPI is not set.") | ||||||
|  |  | ||||||
|  |         if settings.OCR_USER_ARGS: | ||||||
|  |             try: | ||||||
|  |                 user_args = json.loads(settings.OCR_USER_ARGS) | ||||||
|  |                 ocr_args = {**ocr_args, **user_args} | ||||||
|  |             except Exception as e: | ||||||
|  |                 self.log( | ||||||
|  |                     "warning", | ||||||
|  |                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " | ||||||
|  |                     f"they will not be used: {e}") | ||||||
|  |  | ||||||
|  |         # This forces tesseract to use one core per page. | ||||||
|  |         os.environ['OMP_THREAD_LIMIT'] = "1" | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             self.log("debug", | ||||||
|  |                      f"Calling OCRmyPDF with {str(ocr_args)}") | ||||||
|  |             ocrmypdf.ocr(**ocr_args) | ||||||
|  |             # success! announce results | ||||||
|  |             self.archive_path = archive_path | ||||||
|  |             self.text = get_text_from_pdf(archive_path) | ||||||
|  |  | ||||||
|  |         except InputFileError as e: | ||||||
|  |             # This happens with some PDFs when used with the redo_ocr option. | ||||||
|  |             # This is not the end of the world, we'll just use what we already | ||||||
|  |             # have in the document. | ||||||
|  |             self.text = get_text_from_pdf(document_path) | ||||||
|  |             # Also, no archived file. | ||||||
|  |             if not self.text: | ||||||
|  |                 # However, if we don't have anything, fail: | ||||||
|  |                 raise ParseError(e) | ||||||
|  |  | ||||||
|  |         except Exception as e: | ||||||
|  |             # Anything else is probably serious. | ||||||
|  |             raise ParseError(e) | ||||||
|  |  | ||||||
|  |         if not self.text: | ||||||
|  |             # This may happen for files that don't have any text. | ||||||
|  |             self.log( | ||||||
|  |                 'warning', | ||||||
|  |                 f"Document {document_path} does not have any text." | ||||||
|  |                 f"This is probably an error or you tried to add an image " | ||||||
|  |                 f"without text.") | ||||||
|  |             self.text = "" | ||||||
|  |  | ||||||
|  |  | ||||||
| def strip_excess_whitespace(text): | def strip_excess_whitespace(text): | ||||||
|  |     if not text: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) |     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||||
|     no_leading_whitespace = re.sub( |     no_leading_whitespace = re.sub( | ||||||
|         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) |         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) | ||||||
|     no_trailing_whitespace = re.sub( |     no_trailing_whitespace = re.sub( | ||||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) |         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||||
|     return no_trailing_whitespace |  | ||||||
|  |  | ||||||
|  |     # TODO: this needs a rework | ||||||
| def image_to_string(args): |     return no_trailing_whitespace.strip() | ||||||
|     img, lang = args |  | ||||||
|     ocr = pyocr.get_available_tools()[0] |  | ||||||
|     with Image.open(img) as f: |  | ||||||
|         if ocr.can_detect_orientation(): |  | ||||||
|             try: |  | ||||||
|                 orientation = ocr.detect_orientation(f, lang=lang) |  | ||||||
|                 f = f.rotate(orientation["angle"], expand=1) |  | ||||||
|             except Exception: |  | ||||||
|                 # Rotation not possible, ignore |  | ||||||
|                 pass |  | ||||||
|         try: |  | ||||||
|             return ocr.image_to_string(f, lang=lang) |  | ||||||
|         except PyocrException as e: |  | ||||||
|             raise OCRError(e) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_text_from_pdf(pdf_file): | def get_text_from_pdf(pdf_file): | ||||||
| @@ -245,6 +190,9 @@ def get_text_from_pdf(pdf_file): | |||||||
|         try: |         try: | ||||||
|             pdf = pdftotext.PDF(f) |             pdf = pdftotext.PDF(f) | ||||||
|         except pdftotext.Error: |         except pdftotext.Error: | ||||||
|             return "" |             # might not be a PDF file | ||||||
|  |             return None | ||||||
|  |  | ||||||
|     return "\n".join(pdf) |     text = "\n".join(pdf) | ||||||
|  |  | ||||||
|  |     return strip_excess_whitespace(text) | ||||||
|   | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/multi-page-digital.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/multi-page-images.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/no-text-alpha.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 32 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple-alpha.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 8.2 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/simple-no-dpi.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						| After Width: | Height: | Size: 6.8 KiB | 
| Before Width: | Height: | Size: 7.7 KiB After Width: | Height: | Size: 7.2 KiB | 
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/with-form.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						| @@ -1,193 +0,0 @@ | |||||||
| import datetime |  | ||||||
| import os |  | ||||||
| import shutil |  | ||||||
| from unittest import mock |  | ||||||
| from uuid import uuid4 |  | ||||||
|  |  | ||||||
| from dateutil import tz |  | ||||||
| from django.conf import settings |  | ||||||
| from django.test import TestCase, override_settings |  | ||||||
|  |  | ||||||
| from ..parsers import RasterisedDocumentParser |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestDate(TestCase): |  | ||||||
|  |  | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |  | ||||||
|     SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8]) |  | ||||||
|  |  | ||||||
|     def setUp(self): |  | ||||||
|         os.makedirs(self.SCRATCH, exist_ok=True) |  | ||||||
|  |  | ||||||
|     def tearDown(self): |  | ||||||
|         shutil.rmtree(self.SCRATCH) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_1(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 130218 lorem ipsum" |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_2(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 2018 lorem ipsum" |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_3(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 20180213 lorem ipsum" |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_4(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = "lorem ipsum 13.02.2018 lorem ipsum" |  | ||||||
|         date = document.get_date() |  | ||||||
|         self.assertEqual( |  | ||||||
|             date, |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2018, 2, 13, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_5(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem " |  | ||||||
|             "ipsum" |  | ||||||
|         ) |  | ||||||
|         date = document.get_date() |  | ||||||
|         self.assertEqual( |  | ||||||
|             date, |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2018, 2, 13, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_6(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "Wohnort\n" |  | ||||||
|             "3100\n" |  | ||||||
|             "IBAN\n" |  | ||||||
|             "AT87 4534\n" |  | ||||||
|             "1234\n" |  | ||||||
|             "1234 5678\n" |  | ||||||
|             "BIC\n" |  | ||||||
|             "lorem ipsum" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual(document.get_date(), None) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_7(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "März 2019\n" |  | ||||||
|             "lorem ipsum" |  | ||||||
|         ) |  | ||||||
|         date = document.get_date() |  | ||||||
|         self.assertEqual( |  | ||||||
|             date, |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2019, 3, 1, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_8(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "Wohnort\n" |  | ||||||
|             "3100\n" |  | ||||||
|             "IBAN\n" |  | ||||||
|             "AT87 4534\n" |  | ||||||
|             "1234\n" |  | ||||||
|             "1234 5678\n" |  | ||||||
|             "BIC\n" |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "März 2020" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual( |  | ||||||
|             document.get_date(), |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2020, 3, 1, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_date_format_9(self): |  | ||||||
|         input_file = os.path.join(self.SAMPLE_FILES, "") |  | ||||||
|         document = RasterisedDocumentParser(input_file, None) |  | ||||||
|         document._text = ( |  | ||||||
|             "lorem ipsum\n" |  | ||||||
|             "27. Nullmonth 2020\n" |  | ||||||
|             "März 2020\n" |  | ||||||
|             "lorem ipsum" |  | ||||||
|         ) |  | ||||||
|         self.assertEqual( |  | ||||||
|             document.get_date(), |  | ||||||
|             datetime.datetime( |  | ||||||
|                 2020, 3, 1, 0, 0, |  | ||||||
|                 tzinfo=tz.gettz(settings.TIME_ZONE) |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="01-07-0590 00:00:00" |  | ||||||
|     ) |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_crazy_date_past(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/dev/null", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="01-07-2350 00:00:00" |  | ||||||
|     ) |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_crazy_date_future(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/dev/null", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="20 408000l 2475" |  | ||||||
|     ) |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_crazy_date_with_spaces(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/dev/null", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
|  |  | ||||||
|     @mock.patch( |  | ||||||
|         "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", |  | ||||||
|         return_value="No date in here" |  | ||||||
|     ) |  | ||||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") |  | ||||||
|     @override_settings(SCRATCH_DIR=SCRATCH) |  | ||||||
|     def test_filename_date_parse_invalid(self, *args): |  | ||||||
|         document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None) |  | ||||||
|         document.get_text() |  | ||||||
|         self.assertIsNone(document.get_date()) |  | ||||||
| @@ -1,76 +0,0 @@ | |||||||
| import os |  | ||||||
| from unittest import mock, skipIf |  | ||||||
|  |  | ||||||
| import pyocr |  | ||||||
| from django.test import TestCase |  | ||||||
| from pyocr.libtesseract.tesseract_raw import \ |  | ||||||
|     TesseractError as OtherTesseractError |  | ||||||
|  |  | ||||||
| from ..parsers import image_to_string, strip_excess_whitespace |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakeTesseract(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def can_detect_orientation(): |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def detect_orientation(file_handle, lang): |  | ||||||
|         raise OtherTesseractError("arbitrary status", "message") |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def image_to_string(file_handle, lang): |  | ||||||
|         return "This is test text" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakePyOcr(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_tools(): |  | ||||||
|         return [FakeTesseract] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestOCR(TestCase): |  | ||||||
|  |  | ||||||
|     text_cases = [ |  | ||||||
|         ("simple     string", "simple string"), |  | ||||||
|         ( |  | ||||||
|             "simple    newline\n   testing string", |  | ||||||
|             "simple newline\ntesting string" |  | ||||||
|         ), |  | ||||||
|         ( |  | ||||||
|             "utf-8   строка с пробелами в конце  ", |  | ||||||
|             "utf-8 строка с пробелами в конце" |  | ||||||
|         ) |  | ||||||
|     ] |  | ||||||
|  |  | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |  | ||||||
|     TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) |  | ||||||
|  |  | ||||||
|     def test_strip_excess_whitespace(self): |  | ||||||
|         for source, result in self.text_cases: |  | ||||||
|             actual_result = strip_excess_whitespace(source) |  | ||||||
|             self.assertEqual( |  | ||||||
|                 result, |  | ||||||
|                 actual_result, |  | ||||||
|                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( |  | ||||||
|                     source, |  | ||||||
|                     result, |  | ||||||
|                     actual_result |  | ||||||
|                 ) |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|     @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) |  | ||||||
|     def test_image_to_string_with_text_free_page(self): |  | ||||||
|         """ |  | ||||||
|         This test is sort of silly, since it's really just reproducing an odd |  | ||||||
|         exception thrown by pyocr when it encounters a page with no text. |  | ||||||
|         Actually running this test against an installation of Tesseract results |  | ||||||
|         in a segmentation fault rooted somewhere deep inside pyocr where I |  | ||||||
|         don't care to dig.  Regardless, if you run the consumer normally, |  | ||||||
|         text-free pages are now handled correctly so long as we work around |  | ||||||
|         this weird exception. |  | ||||||
|         """ |  | ||||||
|         image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) |  | ||||||
| @@ -1,46 +1,17 @@ | |||||||
| import os | import os | ||||||
| import shutil |  | ||||||
| import tempfile |  | ||||||
| import uuid | import uuid | ||||||
| from typing import ContextManager | from typing import ContextManager | ||||||
| from unittest import mock | from unittest import mock | ||||||
|  |  | ||||||
| from django.test import TestCase, override_settings | from django.test import TestCase, override_settings | ||||||
| from pyocr.error import TesseractError |  | ||||||
|  |  | ||||||
| from documents.parsers import ParseError, run_convert | from documents.parsers import ParseError, run_convert | ||||||
| from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError | from documents.tests.utils import DirectoriesMixin | ||||||
|  | from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace | ||||||
|  |  | ||||||
| image_to_string_calls = [] | image_to_string_calls = [] | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakeTesseract(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def can_detect_orientation(): |  | ||||||
|         return True |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def detect_orientation(file_handle, lang): |  | ||||||
|         raise TesseractError("arbitrary status", "message") |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_languages(): |  | ||||||
|         return ['eng', 'deu'] |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def image_to_string(file_handle, lang): |  | ||||||
|         image_to_string_calls.append((file_handle.name, lang)) |  | ||||||
|         return file_handle.read() |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakePyOcr(object): |  | ||||||
|  |  | ||||||
|     @staticmethod |  | ||||||
|     def get_available_tools(): |  | ||||||
|         return [FakeTesseract] |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def fake_convert(input_file, output_file, **kwargs): | def fake_convert(input_file, output_file, **kwargs): | ||||||
|     with open(input_file) as f: |     with open(input_file) as f: | ||||||
|         lines = f.readlines() |         lines = f.readlines() | ||||||
| @@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs): | |||||||
|             f2.write(line.strip()) |             f2.write(line.strip()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def fake_unpaper(pnm): |  | ||||||
|     output = pnm + ".unpaper.pnm" |  | ||||||
|     shutil.copy(pnm, output) |  | ||||||
|     return output |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class FakeImageFile(ContextManager): | class FakeImageFile(ContextManager): | ||||||
|     def __init__(self, fname): |     def __init__(self, fname): | ||||||
|         self.fname = fname |         self.fname = fname | ||||||
| @@ -67,142 +32,50 @@ class FakeImageFile(ContextManager): | |||||||
|         return os.path.basename(self.fname) |         return os.path.basename(self.fname) | ||||||
|  |  | ||||||
|  |  | ||||||
| fake_image = FakeImageFile |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) | class TestParser(DirectoriesMixin, TestCase): | ||||||
| @mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) |  | ||||||
| @mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) |  | ||||||
| @mock.patch("paperless_tesseract.parsers.Image.open", open) |  | ||||||
| class TestRasterisedDocumentParser(TestCase): |  | ||||||
|  |  | ||||||
|     def setUp(self): |     def assertContainsStrings(self, content, strings): | ||||||
|         self.scratch = tempfile.mkdtemp() |         # Asserts that all strings appear in content, in the given order. | ||||||
|  |         indices = [content.index(s) for s in strings] | ||||||
|  |         self.assertListEqual(indices, sorted(indices)) | ||||||
|  |  | ||||||
|         global image_to_string_calls |     text_cases = [ | ||||||
|  |         ("simple     string", "simple string"), | ||||||
|  |         ( | ||||||
|  |             "simple    newline\n   testing string", | ||||||
|  |             "simple newline\ntesting string" | ||||||
|  |         ), | ||||||
|  |         ( | ||||||
|  |             "utf-8   строка с пробелами в конце  ", | ||||||
|  |             "utf-8 строка с пробелами в конце" | ||||||
|  |         ) | ||||||
|  |     ] | ||||||
|  |  | ||||||
|         image_to_string_calls = [] |     def test_strip_excess_whitespace(self): | ||||||
|  |         for source, result in self.text_cases: | ||||||
|         override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() |             actual_result = strip_excess_whitespace(source) | ||||||
|  |             self.assertEqual( | ||||||
|     def tearDown(self): |                 result, | ||||||
|         shutil.rmtree(self.scratch) |                 actual_result, | ||||||
|  |                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( | ||||||
|     def get_input_file(self, pages): |                     source, | ||||||
|         _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) |                     result, | ||||||
|         with open(fname, "w") as f: |                     actual_result | ||||||
|             f.writelines([f"line {p}\n" for p in range(pages)]) |                 ) | ||||||
|         return fname |             ) | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_simple_language_match(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_2_pages(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") |  | ||||||
|     def test_parse_text_3_pages(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) |  | ||||||
|     def test_parse_text_lang_detect_failed(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") |  | ||||||
|     def test_parse_text_lang_not_installed(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2 line 3") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") |  | ||||||
|     def test_parse_text_lang_mismatch(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) |  | ||||||
|         text = parser.get_text() |  | ||||||
|         self.assertEqual(text, "line 0 line 1 line 2") |  | ||||||
|  |  | ||||||
|         self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) |  | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") |  | ||||||
|     def test_parse_empty_doc(self): |  | ||||||
|         parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) |  | ||||||
|         try: |  | ||||||
|             parser.get_text() |  | ||||||
|         except ParseError as e: |  | ||||||
|             self.assertEqual("Empty document, nothing to do.", str(e)) |  | ||||||
|         else: |  | ||||||
|             self.fail("Should raise exception") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestAuxilliaryFunctions(TestCase): |  | ||||||
|  |  | ||||||
|     def setUp(self): |  | ||||||
|         self.scratch = tempfile.mkdtemp() |  | ||||||
|  |  | ||||||
|         override_settings(SCRATCH_DIR=self.scratch).enable() |  | ||||||
|  |  | ||||||
|     def tearDown(self): |  | ||||||
|         shutil.rmtree(self.scratch) |  | ||||||
|  |  | ||||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") |     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||||
|  |  | ||||||
|     def test_get_text_from_pdf(self): |     def test_get_text_from_pdf(self): | ||||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf')) |         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')) | ||||||
|  |  | ||||||
|         self.assertEqual(text.strip(), "This is a test document.") |         self.assertContainsStrings(text.strip(), ["This is a test document."]) | ||||||
|  |  | ||||||
|     def test_get_text_from_pdf_error(self): |  | ||||||
|         text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) |  | ||||||
|  |  | ||||||
|         self.assertEqual(text.strip(), "") |  | ||||||
|  |  | ||||||
|     def test_image_to_string(self): |  | ||||||
|         text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) |  | ||||||
|  |  | ||||||
|         self.assertEqual(text, "This is a test document.") |  | ||||||
|  |  | ||||||
|     def test_image_to_string_language_unavailable(self): |  | ||||||
|         try: |  | ||||||
|             image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) |  | ||||||
|         except OCRError as e: |  | ||||||
|             self.assertTrue("Failed loading language" in str(e)) |  | ||||||
|         else: |  | ||||||
|             self.fail("Should raise exception") |  | ||||||
|  |  | ||||||
|     @override_settings(OCR_ALWAYS=False) |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") |  | ||||||
|     def test_is_ocred(self, m2, m): |  | ||||||
|         parser = RasterisedDocumentParser("", uuid.uuid4()) |  | ||||||
|         m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ |  | ||||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " \ |  | ||||||
|                          "lots of text lots of text lots of text lots of text lots of text lots of text " |  | ||||||
|         parser.get_text() |  | ||||||
|         self.assertEqual(m.call_count, 2) |  | ||||||
|         self.assertEqual(m2.call_count, 0) |  | ||||||
|  |  | ||||||
|     def test_thumbnail(self): |     def test_thumbnail(self): | ||||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) |         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||||
|         parser.get_thumbnail() |         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||||
|         # dont really know how to test it, just call it and assert that it does not raise anything. |         # dont really know how to test it, just call it and assert that it does not raise anything. | ||||||
|  |  | ||||||
|     @mock.patch("paperless_tesseract.parsers.run_convert") |     @mock.patch("paperless_tesseract.parsers.run_convert") | ||||||
| @@ -216,6 +89,161 @@ class TestAuxilliaryFunctions(TestCase): | |||||||
|  |  | ||||||
|         m.side_effect = call_convert |         m.side_effect = call_convert | ||||||
|  |  | ||||||
|         parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) |         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||||
|         parser.get_thumbnail() |         parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||||
|         # dont really know how to test it, just call it and assert that it does not raise anything. |         # dont really know how to test it, just call it and assert that it does not raise anything. | ||||||
|  |  | ||||||
|  |     def test_get_dpi(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) | ||||||
|  |         self.assertEqual(dpi, None) | ||||||
|  |  | ||||||
|  |         dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png")) | ||||||
|  |         self.assertEqual(dpi, 72) | ||||||
|  |  | ||||||
|  |     def test_simple_digital(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||||
|  |  | ||||||
|  |     def test_with_form(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="redo") | ||||||
|  |     def test_with_form_error(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertIsNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="redo") | ||||||
|  |     @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None) | ||||||
|  |     def test_with_form_error_notext(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         def f(): | ||||||
|  |             parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertRaises(ParseError, f) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="force") | ||||||
|  |     def test_with_form_force(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||||
|  |  | ||||||
|  |     def test_image_simple(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text(), ["This is a test document."]) | ||||||
|  |  | ||||||
|  |     def test_image_simple_alpha_fail(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         def f(): | ||||||
|  |             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertRaises(ParseError, f) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     def test_image_no_dpi_fail(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         def f(): | ||||||
|  |             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertRaises(ParseError, f) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_IMAGE_DPI=72) | ||||||
|  |     def test_image_no_dpi_default(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |  | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||||
|  |  | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |  | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."]) | ||||||
|  |  | ||||||
|  |     def test_multi_page(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="skip") | ||||||
|  |     def test_multi_page_pages_skip(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||||
|  |     def test_multi_page_pages_redo(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="force") | ||||||
|  |     def test_multi_page_pages_force(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OOCR_MODE="skip") | ||||||
|  |     def test_multi_page_analog_pages_skip(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||||
|  |     def test_multi_page_analog_pages_redo(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) | ||||||
|  |         self.assertFalse("page 3" in parser.get_text().lower()) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||||
|  |     def test_multi_page_analog_pages_force(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) | ||||||
|  |         self.assertFalse("page 2" in parser.get_text().lower()) | ||||||
|  |         self.assertFalse("page 3" in parser.get_text().lower()) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="skip_noarchive") | ||||||
|  |     def test_skip_noarchive_withtext(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||||
|  |         self.assertIsNone(parser.archive_path) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|  |  | ||||||
|  |     @override_settings(OCR_MODE="skip_noarchive") | ||||||
|  |     def test_skip_noarchive_notext(self): | ||||||
|  |         parser = RasterisedDocumentParser(None) | ||||||
|  |         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||||
|  |         self.assertTrue(os.path.join(parser.archive_path)) | ||||||
|  |         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||||
|   | |||||||
| @@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser): | |||||||
|     This parser directly parses a text document (.txt, .md, or .csv) |     This parser directly parses a text document (.txt, .md, or .csv) | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __init__(self, path, logging_group): |     def get_thumbnail(self, document_path, mime_type): | ||||||
|         super().__init__(path, logging_group) |  | ||||||
|         self._text = None |  | ||||||
|  |  | ||||||
|     def get_thumbnail(self): |  | ||||||
|         """ |         """ | ||||||
|         The thumbnail of a text file is just a 500px wide image of the text |         The thumbnail of a text file is just a 500px wide image of the text | ||||||
|         rendered onto a letter-sized page. |         rendered onto a letter-sized page. | ||||||
| @@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser): | |||||||
|             ) |             ) | ||||||
|  |  | ||||||
|         def read_text(): |         def read_text(): | ||||||
|             with open(self.document_path, 'r') as src: |             with open(document_path, 'r') as src: | ||||||
|                 lines = [line.strip() for line in src.readlines()] |                 lines = [line.strip() for line in src.readlines()] | ||||||
|                 text = "\n".join([line for line in lines[:n_lines]]) |                 text = "\n".join([line for line in lines[:n_lines]]) | ||||||
|                 return text.replace('"', "'") |                 return text.replace('"', "'") | ||||||
| @@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser): | |||||||
|  |  | ||||||
|         return out_path |         return out_path | ||||||
|  |  | ||||||
|     def get_text(self): |     def parse(self, document_path, mime_type): | ||||||
|  |         with open(document_path, 'r') as f: | ||||||
|         if self._text is not None: |             self.text = f.read() | ||||||
|             return self._text |  | ||||||
|  |  | ||||||
|         with open(self.document_path, 'r') as f: |  | ||||||
|             self._text = f.read() |  | ||||||
|  |  | ||||||
|         return self._text |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def run_command(*args): | def run_command(*args): | ||||||
|   | |||||||
 jonaswinkler
					jonaswinkler