Merge branch 'feature-ocrmypdf' into dev

2025-07-22 17:54:40 -05:00 · 2020-12-01 14:32:09 +01:00 · 2020-12-01 14:32:09 +01:00 · a33082235b
commit a33082235b
parent ec6d01f7a5 f677ed8798
50 changed files with 1751 additions and 951 deletions
--- a/.gitignore
+++ b/.gitignore
@ -76,16 +76,11 @@ scripts/nuke
 /static/

 # Stored PDFs
-/media/documents/originals/*
-/media/documents/thumbnails/*
-
-/data/classification_model.pickle
-/data/db.sqlite3
-/data/index
-
+/media/
+/data/
 /paperless.conf
-/consume
-/export
+/consume/
+/export/
 /src-ui/.vscode

 # this is where the compiled frontend is moved to.
--- a/.travis.yml
+++ b/.travis.yml
@ -1,5 +1,8 @@
 language: python

+dist: focal
+os: linux
+
 jobs:
  include:
    - name: "Paperless on Python 3.6"
@ -33,7 +36,7 @@ jobs:

 before_install:
  - sudo apt-get update -qq
-  - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript
+  - sudo apt-get install -qq libpoppler-cpp-dev unpaper tesseract-ocr imagemagick ghostscript optipng

 install:
  - pip install --upgrade pipenv
--- a/Pipfile
+++ b/Pipfile
@ -26,7 +26,6 @@ langdetect = "*"
 pdftotext = "*"
 pathvalidate = "*"
 pillow = "*"
-pyocr = "~=0.7.2"
 python-gnupg = "*"
 python-dotenv = "*"
 python-dateutil = "*"
@ -39,6 +38,7 @@ whitenoise = "~=5.2.0"
 watchdog = "*"
 whoosh="~=2.7.4"
 inotifyrecursive = ">=0.3.4"
+ocrmypdf = "*"

 [dev-packages]
 coveralls = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "d266e1f67e3090ec68aa8ecba1e8373351daf89ad5a5ab46524d123bcaf29f62"
+            "sha256": "55c9136777e78d6cd362628cd1fc0c5ff36b437699b92089ce504d598004371d"
        },
        "pipfile-spec": 6,
        "requires": {
@ -44,6 +44,94 @@
            ],
            "version": "==1.17.12"
        },
+        "cffi": {
+            "hashes": [
+                "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e",
+                "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d",
+                "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a",
+                "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec",
+                "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362",
+                "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668",
+                "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c",
+                "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b",
+                "sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654",
+                "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06",
+                "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698",
+                "sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2",
+                "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c",
+                "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7",
+                "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009",
+                "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03",
+                "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b",
+                "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909",
+                "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53",
+                "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35",
+                "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26",
+                "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b",
+                "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb",
+                "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293",
+                "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd",
+                "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d",
+                "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3",
+                "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5",
+                "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d",
+                "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca",
+                "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d",
+                "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775",
+                "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375",
+                "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b",
+                "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b",
+                "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f"
+            ],
+            "version": "==1.14.4"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "markers": "python_version >= '3.1'",
+            "version": "==3.0.4"
+        },
+        "coloredlogs": {
+            "hashes": [
+                "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a",
+                "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505",
+                "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==14.0"
+        },
+        "cryptography": {
+            "hashes": [
+                "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538",
+                "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f",
+                "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d",
+                "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77",
+                "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b",
+                "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33",
+                "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e",
+                "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb",
+                "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e",
+                "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b",
+                "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7",
+                "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297",
+                "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d",
+                "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7",
+                "sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b",
+                "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7",
+                "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4",
+                "sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8",
+                "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b",
+                "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851",
+                "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13",
+                "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b",
+                "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3",
+                "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==3.2.1"
+        },
        "dateparser": {
            "hashes": [
                "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b",
@ -123,6 +211,14 @@
            "index": "pypi",
            "version": "==20.0.4"
        },
+        "humanfriendly": {
+            "hashes": [
+                "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12",
+                "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==8.2"
+        },
        "imap-tools": {
            "hashes": [
                "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246",
@ -131,6 +227,13 @@
            "index": "pypi",
            "version": "==0.32.0"
        },
+        "img2pdf": {
+            "hashes": [
+                "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5",
+                "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9"
+            ],
+            "version": "==0.4.0"
+        },
        "inotify-simple": {
            "hashes": [
                "sha256:8440ffe49c4ae81a8df57c1ae1eb4b6bfa7acb830099bfb3e305b383005cc128",
@ -164,6 +267,51 @@
            "index": "pypi",
            "version": "==1.0.8"
        },
+        "lxml": {
+            "hashes": [
+                "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d",
+                "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37",
+                "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01",
+                "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2",
+                "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644",
+                "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75",
+                "sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80",
+                "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2",
+                "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780",
+                "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98",
+                "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308",
+                "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf",
+                "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388",
+                "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d",
+                "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3",
+                "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8",
+                "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af",
+                "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2",
+                "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e",
+                "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939",
+                "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03",
+                "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d",
+                "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a",
+                "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5",
+                "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a",
+                "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711",
+                "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf",
+                "sha256:91d6dace31b07ab47eeadd3f4384ded2f77b94b30446410cb2c3e660e047f7a7",
+                "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089",
+                "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505",
+                "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b",
+                "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f",
+                "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc",
+                "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e",
+                "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931",
+                "sha256:e1dbb88a937126ab14d219a000728224702e0ec0fc7ceb7131c53606b7a76772",
+                "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc",
+                "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe",
+                "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
+            "version": "==4.6.2"
+        },
        "numpy": {
            "hashes": [
                "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db",
@ -205,6 +353,14 @@
            "markers": "python_version >= '3.6'",
            "version": "==1.19.4"
        },
+        "ocrmypdf": {
+            "hashes": [
+                "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3",
+                "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928"
+            ],
+            "index": "pypi",
+            "version": "==11.3.3"
+        },
        "pathtools": {
            "hashes": [
                "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0",
@ -220,6 +376,14 @@
            "index": "pypi",
            "version": "==2.3.0"
        },
+        "pdfminer.six": {
+            "hashes": [
+                "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a",
+                "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509"
+            ],
+            "markers": "python_version >= '3.4'",
+            "version": "==20201018"
+        },
        "pdftotext": {
            "hashes": [
                "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6"
@ -227,6 +391,33 @@
            "index": "pypi",
            "version": "==2.1.5"
        },
+        "pikepdf": {
+            "hashes": [
+                "sha256:0829bd5dacd73bb4a37e7575bae523f49603479755563c92ddb55c206700cab1",
+                "sha256:0d2b631077cd6af6e4d1b396208020705842610a6f13fab489d5f9c47916baa2",
+                "sha256:21c98af08fae4ac9fbcad02b613b6768a4ca300fda4cba867f4a4b6f73c2d04b",
+                "sha256:2240372fed30124ddc35b0c15a613f2b687a426ea2f150091e0a0c58cca7a495",
+                "sha256:2a97f5f1403e058d217d7f6861cf51fca200c5687bce0d052f5f2fa89b5bfa22",
+                "sha256:3faaefca0ae80d19891acec8b0dd5e6235f59f2206d82375eb80d090285e9557",
+                "sha256:48ef45b64882901c0d69af3b85d16a19bd0f3e95b43e614fefb53521d8caf36c",
+                "sha256:5212fe41f2323fc7356ba67caa39737fe13080562cff37bcbb74a8094076c8d0",
+                "sha256:56859c32170663c57bd0658189ce44e180533eebe813853446cd6413810be9eb",
+                "sha256:5f8fd1cb3478c5534222018aca24fbbd2bc74460c899bda988ec76722c13caa9",
+                "sha256:74300a32c41b3d578772f6933f23a88b19f74484185e71e5225ce2f7ea5aea78",
+                "sha256:8cbc946bdd217148f4a9c029fcea62f4ae0f67d5346de4c865f4718cd0ddc37f",
+                "sha256:9ceefd30076f732530cf84a1be2ecb2fa9931af932706ded760a6d37c73b96ad",
+                "sha256:ad69c170fda41b07a4c6b668a3128e7a759f50d9aebcfcde0ccff1358abe0423",
+                "sha256:b715fe182189fb6870fab5b0383bb2fb278c88c46eade346b0f4c1ed8818c09d",
+                "sha256:bb01ecf95083ffcb9ad542dc5342ccc1059e46f1395fd966629d36d9cc766b4a",
+                "sha256:bd6328547219cf48cefb4e0a1bc54442910594de1c5a5feae847d9ff3c629031",
+                "sha256:edb128379bb1dea76b5bdbdacf5657a6e4754bacc2049640762725590d8ed905",
+                "sha256:f8e687900557fcd4c51b4e72b9e337fdae9e2c81049d1d80b624bb2e88b5769d",
+                "sha256:fe0ca120e3347c851c34a91041d574f3c588d832023906d8ae18d66d042e8a52",
+                "sha256:fe8e0152672f24d8bfdecc725f97e9013f2de1b41849150959526ca3562bd3ef"
+            ],
+            "markers": "python_version < '3.9'",
+            "version": "==2.2.0"
+        },
        "pillow": {
            "hashes": [
                "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a",
@ -262,6 +453,14 @@
            "index": "pypi",
            "version": "==8.0.1"
        },
+        "pluggy": {
+            "hashes": [
+                "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0",
+                "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==0.13.1"
+        },
        "psycopg2-binary": {
            "hashes": [
                "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c",
@ -305,13 +504,13 @@
            "index": "pypi",
            "version": "==2.8.6"
        },
-        "pyocr": {
+        "pycparser": {
            "hashes": [
-                "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179",
-                "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301"
+                "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
+                "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
            ],
-            "index": "pypi",
-            "version": "==0.7.2"
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==2.20"
        },
        "python-dateutil": {
            "hashes": [
@ -419,6 +618,53 @@
            ],
            "version": "==2020.11.13"
        },
+        "reportlab": {
+            "hashes": [
+                "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540",
+                "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c",
+                "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030",
+                "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f",
+                "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5",
+                "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e",
+                "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89",
+                "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2",
+                "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f",
+                "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969",
+                "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53",
+                "sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d",
+                "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d",
+                "sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457",
+                "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec",
+                "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f",
+                "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778",
+                "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92",
+                "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6",
+                "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077",
+                "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673",
+                "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b",
+                "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48",
+                "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95",
+                "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94",
+                "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589",
+                "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9",
+                "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8",
+                "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d",
+                "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32",
+                "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b",
+                "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a",
+                "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2",
+                "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967",
+                "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5",
+                "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437",
+                "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8",
+                "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab",
+                "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e",
+                "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a",
+                "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8",
+                "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08"
+            ],
+            "version": "==3.5.55"
+        },
        "scikit-learn": {
            "hashes": [
                "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e",
@ -482,6 +728,13 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==1.15.0"
        },
+        "sortedcontainers": {
+            "hashes": [
+                "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f",
+                "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1"
+            ],
+            "version": "==2.3.0"
+        },
        "sqlparse": {
            "hashes": [
                "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
@ -498,6 +751,14 @@
            "markers": "python_version >= '3.5'",
            "version": "==2.1.0"
        },
+        "tqdm": {
+            "hashes": [
+                "sha256:5c0d04e06ccc0da1bd3fa5ae4550effcce42fcad947b4a6cafa77bdc9b09ff22",
+                "sha256:9e7b8ab0ecbdbf0595adadd5f0ebbb9e69010e0bd48bbb0c15e550bf2a5292df"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==4.54.0"
+        },
        "tzlocal": {
            "hashes": [
                "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44",
@ -589,6 +850,7 @@
                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
            ],
+            "markers": "python_version >= '3.1'",
            "version": "==3.0.4"
        },
        "coverage": {
@ -711,22 +973,6 @@
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==1.2.0"
        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:030f3b1bdb823ecbe4a9659e14cc861ce5af403fe99863bae173ec5fe00ab132",
-                "sha256:caeee3603f5dcf567864d1be9b839b0bcfdf1383e3e7be33ce2dead8144ff19c"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==2.1.0"
-        },
-        "importlib-resources": {
-            "hashes": [
-                "sha256:7b51f0106c8ec564b1bef3d9c588bc694ce2b92125bbb6278f4f2f5b54ec3592",
-                "sha256:a3d34a8464ce1d5d7c92b0ea4e921e696d86f2aa212e684451cb1482c8d84ed5"
-            ],
-            "markers": "python_version < '3.7'",
-            "version": "==3.3.0"
-        },
        "iniconfig": {
            "hashes": [
                "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
@ -1038,14 +1284,6 @@
            ],
            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
            "version": "==20.2.1"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:102c24ef8f171fd729d46599845e95c7ab894a4cf45f5de11a44cc7444fb1108",
-                "sha256:ed5eee1974372595f9e416cc7bbeeb12335201d8081ca8a0743c954d4446e5cb"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.4.0"
        }
    }
 }
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -152,6 +152,117 @@ PAPERLESS_AUTO_LOGIN_USERNAME=<username>

    Defaults to none, which disables this feature.

+OCR settings
+############
+
+Paperless uses `OCRmyPDF <https://ocrmypdf.readthedocs.io/en/latest/>`_ for
+performing OCR on documents and images. Paperless uses sensible defaults for
+most settings, but all of them can be configured to your needs.
+
+
+PAPERLESS_OCR_LANGUAGE=<lang>
+    Customize the language that paperless will attempt to use when
+    parsing documents.
+
+    It should be a 3-letter language code consistent with ISO
+    639: https://www.loc.gov/standards/iso639-2/php/code_list.php
+
+    Set this to the language most of your documents are written in.
+
+    This can be a combination of multiple languages such as ``deu+eng``,
+    in which case tesseract will use whatever language matches best.
+    Keep in mind that tesseract uses much more CPU time with multiple
+    languages enabled.
+
+    Defaults to "eng".
+
+PAPERLESS_OCR_MODE=<mode>
+    Tell paperless when and how to perform OCR on your documents. Four modes
+    are available:
+
+    *   ``skip``: Paperless skips pages that already contain text and performs
+        OCR only on pages without text. This is the safest and fastest option.
+    *   ``skip_noarchive``: In addition to skip, paperless won't create an
+        archived version of your documents when it finds any text in them.
+    *   ``redo``: Paperless will OCR all pages of your documents and attempt to
+        replace any existing text layers with new text. This will be useful for
+        documents from scanners that already performed OCR with insufficient
+        results. It will also perform OCR on purely digital documents.
+
+        This option may fail on some documents that have features that cannot
+        be removed, such as forms. In this case, the text from the document is
+        used instead.
+    *   ``force``: Paperless rasterizes your documents, converting any text
+        into images and puts the OCRed text on top. This works for all documents,
+        however, the resulting document may be significantly larger and text
+        won't appear as sharp when zoomed in.
+    
+    The default is ``skip``, which only performs OCR when necessary.
+
+PAPERLESS_OCR_OUTPUT_TYPE=<type>
+    Specify the type of PDF documents that paperless should produce.
+    
+    *   ``pdf``: Modify the PDF document as little as possible.
+    *   ``pdfa``: Convert PDF documents into PDF/A-2b documents, which is a
+        subset of the entire PDF specification and meant for storing
+        documents long term.
+    *   ``pdfa-1``, ``pdfa-2``, ``pdfa-3`` to specify the exact version of
+        PDF/A you wish to use.
+    
+    If not specified, ``pdfa`` is used. Remember that paperless also keeps
+    the original input file as well as the archived version.
+
+
+PAPERLESS_OCR_PAGES=<num>
+    Tells paperless to use only the specified number of pages for OCR. Documents
+    with fewer than the specified number of pages get OCR'ed completely.
+
+    Specifying 1 here will only use the first page.
+
+    When combined with ``PAPERLESS_OCR_MODE=redo`` or ``PAPERLESS_OCR_MODE=force``,
+    paperless will not modify any text it finds on excluded pages and copy it
+    verbatim.
+
+    Defaults to 0, which disables this feature and always uses all pages.
+
+
+PAPERLESS_OCR_IMAGE_DPI=<num>
+    Paperless will OCR any images you put into the system and convert them
+    into PDF documents. This is useful if your scanner produces images.
+    In order to do so, paperless needs to know the DPI of the image.
+    Most images from scanners will have this information embedded and
+    paperless will detect and use that information. In case this fails, it
+    uses this value as a fallback.
+
+    Set this to the DPI your scanner produces images at.
+
+    Default is none, which causes paperless to fail if no DPI information is
+    present in an image.
+
+
+PAPERLESS_OCR_USER_ARG=<json>
+    OCRmyPDF offers many more options. Use this parameter to specify any
+    additional arguments you wish to pass to OCRmyPDF. Since Paperless uses
+    the API of OCRmyPDF, you have to specify these in a format that can be
+    passed to the API. See `the API reference <https://ocrmypdf.readthedocs.io/en/latest/api.html#reference>`_
+    for valid parameters. All command line options are supported, but they
+    use underscores instead of dashes.
+
+    .. caution::
+
+        Paperless has been tested to work with the OCR options provided
+        above. There are many options that are incompatible with each other,
+        so specifying invalid options may prevent paperless from consuming
+        any documents.
+
+    Specify arguments as a JSON dictionary. Take note of the lowercase booleans
+    and the double-quoted parameter names and strings. Example:
+
+    .. code:: json
+
+        {"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}    
+    
+    
 Software tweaks
 ###############

@ -193,37 +304,6 @@ PAPERLESS_TIME_ZONE=<timezone>
    Defaults to UTC.


-
-PAPERLESS_OCR_PAGES=<num>
-    Tells paperless to use only the specified amount of pages for OCR. Documents
-    with less than the specified amount of pages get OCR'ed completely.
-
-    Specifying 1 here will only use the first page.
-
-    Defaults to 0, which disables this feature and always uses all pages.
-
-
-
-PAPERLESS_OCR_LANGUAGE=<lang>
-    Customize the default language that tesseract will attempt to use when
-    parsing documents. The default language is used whenever
-
-    * No language could be detected on a document
-    * No tesseract data files are available for the detected language
-
-    It should be a 3-letter language code consistent with ISO
-    639: https://www.loc.gov/standards/iso639-2/php/code_list.php
-
-    Set this to the language most of your documents are written in.
-
-    Defaults to "eng".
-
-PAPERLESS_OCR_ALWAYS=<bool>
-    By default Paperless does not OCR a document if the text can be retrieved from
-    the document directly. Set to true to always OCR documents.
-
-    Defaults to false.
-
 PAPERLESS_CONSUMER_POLLING=<num>
    If paperless won't find documents added to your consume folder, it might
    not be able to automatically detect filesystem changes. In that case,
@ -261,18 +341,6 @@ PAPERLESS_CONVERT_TMPDIR=<path>

    Default is none, which disables the temporary directory.

-PAPERLESS_CONVERT_DENSITY=<num>
-    This setting has a high impact on the physical size of tmp page files,
-    the speed of document conversion, and can affect the accuracy of OCR
-    results. Individual results can vary and this setting should be tested
-    thoroughly against the documents you are importing to see if it has any
-    impacts either negative or positive.
-    Testing on limited document sets has shown a setting of 200 can cut the
-    size of tmp files by 1/3, and speed up conversion by up to 4x
-    with little impact to OCR accuracy.
-
-    Default is 300.
-
 PAPERLESS_OPTIMIZE_THUMBNAILS=<bool>
    Use optipng to optimize thumbnails. This usually reduces the size of
    thumbnails by about 20%, but uses considerable compute time during
@ -319,8 +387,5 @@ PAPERLESS_CONVERT_BINARY=<path>
 PAPERLESS_GS_BINARY=<path>
    Defaults to "/usr/bin/gs".

-PAPERLESS_UNPAPER_BINARY=<path>
-    Defaults to "/usr/bin/unpaper".
-
 PAPERLESS_OPTIPNG_BINARY=<path>
    Defaults to "/usr/bin/optipng".
--- a/paperless.conf.example
+++ b/paperless.conf.example
@ -31,19 +31,24 @@
 #PAPERLESS_STATIC_URL=/static/
 #PAPERLESS_AUTO_LOGIN_USERNAME=

+# OCR settings
+
+#PAPERLESS_OCR_LANGUAGE=eng
+#PAPERLESS_OCR_MODE=skip
+#PAPERLESS_OCR_OUTPUT_TYPE=pdfa
+#PAPERLESS_OCR_PAGES=1
+#PAPERLESS_OCR_IMAGE_DPI=300
+#PAPERLESS_OCR_USER_ARG={}
+#PAPERLESS_CONVERT_MEMORY_LIMIT=0
+#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
+
 # Software tweaks

 #PAPERLESS_TASK_WORKERS=1
 #PAPERLESS_THREADS_PER_WORKER=1
 #PAPERLESS_TIME_ZONE=UTC
-#PAPERLESS_OCR_PAGES=1
-#PAPERLESS_OCR_LANGUAGE=eng
-#PAPERLESS_OCR_ALWAYS=false
 #PAPERLESS_CONSUMER_POLLING=10
 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false
-#PAPERLESS_CONVERT_MEMORY_LIMIT=0
-#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
-#PAPERLESS_CONVERT_DENSITY=300
 #PAPERLESS_OPTIMIZE_THUMBNAILS=true
 #PAPERLESS_POST_CONSUME_SCRIPT=/path/to/an/arbitrary/script.sh
 #PAPERLESS_FILENAME_DATE_ORDER=YMD
@ -53,5 +58,4 @@

 #PAPERLESS_CONVERT_BINARY=/usr/bin/convert
 #PAPERLESS_GS_BINARY=/usr/bin/gs
-#PAPERLESS_UNPAPER_BINARY=/usr/bin/unpaper
 #PAPERLESS_OPTIPNG_BINARY=/usr/bin/optipng
--- a/src-ui/src/app/components/document-detail/document-detail.component.html
+++ b/src-ui/src/app/components/document-detail/document-detail.component.html
@ -5,12 +5,26 @@
        </svg>
        <span class="d-none d-lg-inline"> Delete</span>
    </button>
-    <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary mr-2">
-        <svg class="buttonicon" fill="currentColor">
-            <use xlink:href="assets/bootstrap-icons.svg#download" />
-        </svg>
-        <span class="d-none d-lg-inline"> Download</span>
-    </a>
+
+    <div class="btn-group mr-2">
+
+        <a [href]="downloadUrl" class="btn btn-sm btn-outline-primary">
+            <svg class="buttonicon" fill="currentColor">
+                <use xlink:href="assets/bootstrap-icons.svg#download" />
+            </svg>
+            <span class="d-none d-lg-inline"> Download</span>
+        </a>
+    
+        <div class="btn-group" ngbDropdown role="group" *ngIf="metadata?.paperless__has_archive_version">
+          <button class="btn btn-sm btn-outline-primary dropdown-toggle-split" ngbDropdownToggle></button>
+          <div class="dropdown-menu" ngbDropdownMenu>
+            <a ngbDropdownItem [href]="downloadOriginalUrl">Download original</a>
+          </div>
+        </div>
+    
+      </div>
+
+
    <button type="button" class="btn btn-sm btn-outline-primary" (click)="close()">
        <svg class="buttonicon" fill="currentColor">
            <use xlink:href="assets/bootstrap-icons.svg#x" />
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@ -4,6 +4,7 @@ import { ActivatedRoute, Router } from '@angular/router';
 import { NgbModal } from '@ng-bootstrap/ng-bootstrap';
 import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent';
 import { PaperlessDocument } from 'src/app/data/paperless-document';
+import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata';
 import { PaperlessDocumentType } from 'src/app/data/paperless-document-type';
 import { DocumentListViewService } from 'src/app/services/document-list-view.service';
 import { OpenDocumentsService } from 'src/app/services/open-documents.service';
@ -23,9 +24,11 @@ export class DocumentDetailComponent implements OnInit {

  documentId: number
  document: PaperlessDocument
+  metadata: PaperlessDocumentMetadata
  title: string
  previewUrl: string
  downloadUrl: string
+  downloadOriginalUrl: string

  correspondents: PaperlessCorrespondent[]
  documentTypes: PaperlessDocumentType[]
@ -62,6 +65,7 @@ export class DocumentDetailComponent implements OnInit {
      this.documentId = +paramMap.get('id')
      this.previewUrl = this.documentsService.getPreviewUrl(this.documentId)
      this.downloadUrl = this.documentsService.getDownloadUrl(this.documentId)
+      this.downloadOriginalUrl = this.documentsService.getDownloadUrl(this.documentId, true)
      if (this.openDocumentService.getOpenDocument(this.documentId)) {
        this.updateComponent(this.openDocumentService.getOpenDocument(this.documentId))
      } else {
@ -76,6 +80,9 @@ export class DocumentDetailComponent implements OnInit {

  updateComponent(doc: PaperlessDocument) {
    this.document = doc
+    this.documentsService.getMetadata(doc.id).subscribe(result => {
+      this.metadata = result
+    })
    this.title = doc.title
    this.documentForm.patchValue(doc)
  }
--- a/src-ui/src/app/data/paperless-document-metadata.ts
+++ b/src-ui/src/app/data/paperless-document-metadata.ts
@ -0,0 +1,11 @@
+export interface PaperlessDocumentMetadata {
+    
+  paperless__checksum?: string
+
+  paperless__mime_type?: string
+
+  paperless__filename?: string
+
+  paperless__has_archive_version?: boolean
+
+}
--- a/src-ui/src/app/services/rest/document.service.ts
+++ b/src-ui/src/app/services/rest/document.service.ts
@ -1,5 +1,6 @@
 import { Injectable } from '@angular/core';
 import { PaperlessDocument } from 'src/app/data/paperless-document';
+import { PaperlessDocumentMetadata } from 'src/app/data/paperless-document-metadata';
 import { AbstractPaperlessService } from './abstract-paperless-service';
 import { HttpClient } from '@angular/common/http';
 import { Observable } from 'rxjs';
@ -50,20 +51,32 @@ export class DocumentService extends AbstractPaperlessService<PaperlessDocument>
    return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules))
  }

-  getPreviewUrl(id: number): string {
-    return this.getResourceUrl(id, 'preview')
+  getPreviewUrl(id: number, original: boolean = false): string {
+    let url = this.getResourceUrl(id, 'preview')
+    if (original) {
+      url += "?original=true"
+    }
+    return url
  }

  getThumbUrl(id: number): string {
    return this.getResourceUrl(id, 'thumb')
  }

-  getDownloadUrl(id: number): string {
-    return this.getResourceUrl(id, 'download')
+  getDownloadUrl(id: number, original: boolean = false): string {
+    let url = this.getResourceUrl(id, 'download')
+    if (original) {
+      url += "?original=true"
+    }
+    return url
  }

  uploadDocument(formData) {
    return this.http.post(this.getResourceUrl(null, 'post_document'), formData)
  }

+  getMetadata(id: number): Observable<PaperlessDocumentMetadata> {
+    return this.http.get<PaperlessDocumentMetadata>(this.getResourceUrl(id, 'metadata'))
+  }
+
 }
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@ -6,6 +6,7 @@ import os
 import magic
 from django.conf import settings
 from django.db import transaction
+from django.db.models import Q
 from django.utils import timezone

 from .classifier import DocumentClassifier, IncompatibleClassifierVersionError
@ -13,7 +14,7 @@ from .file_handling import create_source_path_directory
 from .loggers import LoggingMixin
 from .models import Document, FileInfo, Correspondent, DocumentType, Tag
 from .parsers import ParseError, get_parser_class_for_mime_type, \
-    get_supported_file_extensions
+    get_supported_file_extensions, parse_date
 from .signals import (
    document_consumption_finished,
    document_consumption_started
@ -58,7 +59,7 @@ class Consumer(LoggingMixin):
    def pre_check_duplicate(self):
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
-        if Document.objects.filter(checksum=checksum).exists():
+        if Document.objects.filter(Q(checksum=checksum) | Q(archive_checksum=checksum)).exists():  # NOQA: E501
            if settings.CONSUMER_DELETE_DUPLICATES:
                os.unlink(self.path)
            raise ConsumerError(
@ -69,6 +70,7 @@ class Consumer(LoggingMixin):
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
+        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

    def try_consume_file(self,
                         path,
@ -124,7 +126,7 @@ class Consumer(LoggingMixin):

        # This doesn't parse the document yet, but gives us a parser.

-        document_parser = parser_class(self.path, self.logging_group)
+        document_parser = parser_class(self.logging_group)

        # However, this already created working directories which we have to
        # clean up.
@ -132,13 +134,24 @@ class Consumer(LoggingMixin):
        # Parse the document. This may take some time.

        try:
-            self.log("debug", f"Generating thumbnail for {self.filename}...")
-            thumbnail = document_parser.get_optimised_thumbnail()
            self.log("debug", "Parsing {}...".format(self.filename))
+            document_parser.parse(self.path, mime_type)
+
+            self.log("debug", f"Generating thumbnail for {self.filename}...")
+            thumbnail = document_parser.get_optimised_thumbnail(
+                self.path, mime_type)
+
            text = document_parser.get_text()
            date = document_parser.get_date()
+            if not date:
+                date = parse_date(self.filename, text)
+            archive_path = document_parser.get_archive_path()
+
        except ParseError as e:
            document_parser.cleanup()
+            self.log(
+                "error",
+                f"Error while consuming document {self.filename}: {e}")
            raise ConsumerError(e)

        # Prepare the document classifier.
@ -180,9 +193,24 @@ class Consumer(LoggingMixin):
                # After everything is in the database, copy the files into
                # place. If this fails, we'll also rollback the transaction.

+                # TODO: not required, since this is done by the file handling
+                #  logic
                create_source_path_directory(document.source_path)
-                self._write(document, self.path, document.source_path)
-                self._write(document, thumbnail, document.thumbnail_path)
+
+                self._write(document.storage_type,
+                            self.path, document.source_path)
+
+                self._write(document.storage_type,
+                            thumbnail, document.thumbnail_path)
+
+                if archive_path and os.path.isfile(archive_path):
+                    self._write(document.storage_type,
+                                archive_path, document.archive_path)
+
+                    with open(archive_path, 'rb') as f:
+                        document.archive_checksum = hashlib.md5(
+                            f.read()).hexdigest()
+                        document.save()

                # Afte performing all database operations and moving files
                # into place, tell paperless where the file is.
@ -195,6 +223,11 @@ class Consumer(LoggingMixin):
                self.log("debug", "Deleting file {}".format(self.path))
                os.unlink(self.path)
        except Exception as e:
+            self.log(
+                "error",
+                f"The following error occured while consuming "
+                f"{self.filename}: {e}"
+            )
            raise ConsumerError(e)
        finally:
            document_parser.cleanup()
@ -259,7 +292,7 @@ class Consumer(LoggingMixin):
            for tag_id in self.override_tag_ids:
                document.tags.add(Tag.objects.get(pk=tag_id))

-    def _write(self, document, source, target):
+    def _write(self, storage_type, source, target):
        with open(source, "rb") as read_file:
            with open(target, "wb") as write_file:
                write_file.write(read_file.read())
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@ -10,10 +10,13 @@ def create_source_path_directory(source_path):
    os.makedirs(os.path.dirname(source_path), exist_ok=True)


-def delete_empty_directories(directory):
+def delete_empty_directories(directory, root):
+    if not os.path.isdir(directory):
+        return
+
    # Go up in the directory hierarchy and try to delete all directories
    directory = os.path.normpath(directory)
-    root = os.path.normpath(settings.ORIGINALS_DIR)
+    root = os.path.normpath(root)

    if not directory.startswith(root + os.path.sep):
        # don't do anything outside our originals folder.
@ -101,3 +104,8 @@ def generate_filename(doc):
        filename += ".gpg"

    return filename
+
+
+def archive_name_from_filename(filename):
+
+    return os.path.splitext(filename)[0] + ".pdf"
--- a/src/documents/management/commands/document_archiver.py
+++ b/src/documents/management/commands/document_archiver.py
@ -0,0 +1,89 @@
+import hashlib
+import multiprocessing
+
+import logging
+import os
+import shutil
+import uuid
+
+from django.conf import settings
+from django.core.management.base import BaseCommand
+from whoosh.writing import AsyncWriter
+
+from documents.models import Document
+from ... import index
+from ...mixins import Renderable
+from ...parsers import get_parser_class_for_mime_type
+
+
+def handle_document(document):
+    mime_type = document.mime_type
+
+    parser_class = get_parser_class_for_mime_type(mime_type)
+
+    parser = parser_class(logging_group=uuid.uuid4())
+    parser.parse(document.source_path, mime_type)
+    if parser.get_archive_path():
+        shutil.copy(parser.get_archive_path(), document.archive_path)
+        with document.archive_file as f:
+            document.archive_checksum = hashlib.md5(f.read()).hexdigest()
+    else:
+        logging.getLogger(__name__).warning(
+            f"Parser {parser} did not produce an archived document "
+            f"for {document.file_name}"
+        )
+
+    if parser.get_text():
+        document.content = parser.get_text()
+    document.save()
+
+    parser.cleanup()
+
+
+class Command(Renderable, BaseCommand):
+
+    help = """
+        Using the current classification model, assigns correspondents, tags
+        and document types to all documents, effectively allowing you to
+        back-tag all previously indexed documents with metadata created (or
+        modified) after their initial import.
+    """.replace("    ", "")
+
+    def __init__(self, *args, **kwargs):
+        self.verbosity = 0
+        BaseCommand.__init__(self, *args, **kwargs)
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "-f", "--overwrite",
+            default=False,
+            action="store_true",
+            help="Recreates the archived document for documents that already "
+                 "have an archived version."
+        )
+
+    def handle(self, *args, **options):
+
+        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
+
+        overwrite = options["overwrite"]
+
+        documents = Document.objects.all()
+
+        documents_to_process = filter(
+            lambda d: overwrite or not os.path.exists(d.archive_path),
+            documents
+        )
+
+        with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
+            list(
+                pool.imap(
+                    handle_document,
+                    list(documents_to_process)
+                )
+            )
+
+        ix = index.open_index()
+        with AsyncWriter(ix) as writer:
+            for d in documents_to_process:
+                index.update_document(writer, d)
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@ -7,7 +7,8 @@ from django.core import serializers
 from django.core.management.base import BaseCommand, CommandError

 from documents.models import Document, Correspondent, Tag, DocumentType
-from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
+from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
+    EXPORTER_ARCHIVE_NAME
 from paperless.db import GnuPG
 from ...mixins import Renderable

@ -54,7 +55,6 @@ class Command(Renderable, BaseCommand):
            document = document_map[document_dict["pk"]]

            unique_filename = f"{document.pk:07}_{document.file_name}"
-
            file_target = os.path.join(self.target, unique_filename)

            thumbnail_name = unique_filename + "-thumbnail.png"
@ -63,6 +63,14 @@ class Command(Renderable, BaseCommand):
            document_dict[EXPORTER_FILE_NAME] = unique_filename
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

+            if os.path.exists(document.archive_path):
+                archive_name = \
+                    f"{document.pk:07}_archive_{document.archive_file_name}"
+                archive_target = os.path.join(self.target, archive_name)
+                document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
+            else:
+                archive_target = None
+
            print(f"Exporting: {file_target}")

            t = int(time.mktime(document.created.timetuple()))
@ -76,11 +84,18 @@ class Command(Renderable, BaseCommand):
                    f.write(GnuPG.decrypted(document.thumbnail_file))
                    os.utime(thumbnail_target, times=(t, t))

+                if archive_target:
+                    with open(archive_target, "wb") as f:
+                        f.write(GnuPG.decrypted(document.archive_path))
+                        os.utime(archive_target, times=(t, t))
            else:

                shutil.copy(document.source_path, file_target)
                shutil.copy(document.thumbnail_path, thumbnail_target)

+                if archive_target:
+                    shutil.copy(document.archive_path, archive_target)
+
        manifest += json.loads(
            serializers.serialize("json", Correspondent.objects.all()))

--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@ -7,8 +7,8 @@ from django.core.management import call_command
 from django.core.management.base import BaseCommand, CommandError

 from documents.models import Document
-from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME
-from paperless.db import GnuPG
+from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
+    EXPORTER_ARCHIVE_NAME
 from ...file_handling import generate_filename, create_source_path_directory
 from ...mixins import Renderable

@ -79,23 +79,41 @@ class Command(Renderable, BaseCommand):
                    'appear to be in the source directory.'.format(doc_file)
                )

+            if EXPORTER_ARCHIVE_NAME in record:
+                archive_file = record[EXPORTER_ARCHIVE_NAME]
+                if not os.path.exists(os.path.join(self.source, archive_file)):
+                    raise CommandError(
+                        f"The manifest file refers to {archive_file} which "
+                        f"does not appear to be in the source directory."
+                    )
+
    def _import_files_from_manifest(self):

-        storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
+        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
+        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

        for record in self.manifest:

            if not record["model"] == "documents.document":
                continue

-            doc_file = record[EXPORTER_FILE_NAME]
-            thumb_file = record[EXPORTER_THUMBNAIL_NAME]
            document = Document.objects.get(pk=record["pk"])

+            doc_file = record[EXPORTER_FILE_NAME]
            document_path = os.path.join(self.source, doc_file)
+
+            thumb_file = record[EXPORTER_THUMBNAIL_NAME]
            thumbnail_path = os.path.join(self.source, thumb_file)

-            document.storage_type = storage_type
+            if EXPORTER_ARCHIVE_NAME in record:
+                archive_file = record[EXPORTER_ARCHIVE_NAME]
+                archive_path = os.path.join(self.source, archive_file)
+            else:
+                archive_path = None
+
+            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
+
            document.filename = generate_filename(document)

            if os.path.isfile(document.source_path):
@ -106,5 +124,7 @@ class Command(Renderable, BaseCommand):
            print(f"Moving {document_path} to {document.source_path}")
            shutil.copy(document_path, document.source_path)
            shutil.copy(thumbnail_path, document.thumbnail_path)
+            if archive_path:
+                shutil.copy(archive_path, document.archive_path)

            document.save()
--- a/src/documents/migrations/1005_checksums.py
+++ b/src/documents/migrations/1005_checksums.py
@ -0,0 +1,23 @@
+# Generated by Django 3.1.3 on 2020-11-29 00:48
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1004_sanity_check_schedule'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='archive_checksum',
+            field=models.CharField(blank=True, editable=False, help_text='The checksum of the archived document.', max_length=32, null=True),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='checksum',
+            field=models.CharField(editable=False, help_text='The checksum of the original document.', max_length=32, unique=True),
+        ),
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@ -11,6 +11,7 @@ from django.db import models
 from django.utils import timezone
 from django.utils.text import slugify

+from documents.file_handling import archive_name_from_filename
 from documents.parsers import get_default_file_extension


@ -158,9 +159,15 @@ class Document(models.Model):
        max_length=32,
        editable=False,
        unique=True,
-        help_text="The checksum of the original document (before it was "
-                  "encrypted).  We use this to prevent duplicate document "
-                  "imports."
+        help_text="The checksum of the original document."
+    )
+
+    archive_checksum = models.CharField(
+        max_length=32,
+        editable=False,
+        blank=True,
+        null=True,
+        help_text="The checksum of the archived document."
    )

    created = models.DateTimeField(
@ -225,10 +232,30 @@ class Document(models.Model):
    def source_file(self):
        return open(self.source_path, "rb")

+    @property
+    def archive_path(self):
+        if self.filename:
+            fname = archive_name_from_filename(self.filename)
+        else:
+            fname = "{:07}.pdf".format(self.pk)
+
+        return os.path.join(
+            settings.ARCHIVE_DIR,
+            fname
+        )
+
+    @property
+    def archive_file(self):
+        return open(self.archive_path, "rb")
+
    @property
    def file_name(self):
        return slugify(str(self)) + self.file_type

+    @property
+    def archive_file_name(self):
+        return slugify(str(self)) + ".pdf"
+
    @property
    def file_type(self):
        return get_default_file_extension(self.mime_type)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@ -131,21 +131,59 @@ def run_convert(input_file,
        raise ParseError("Convert failed at {}".format(args))


-def run_unpaper(pnm, logging_group=None):
-    pnm_out = pnm.replace(".pnm", ".unpaper.pnm")
+def parse_date(filename, text):
+    """
+    Returns the date of the document.
+    """

-    command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm,
-                    pnm_out)
+    def __parser(ds, date_order):
+        """
+        Call dateparser.parse with a particular date ordering
+        """
+        return dateparser.parse(
+            ds,
+            settings={
+                "DATE_ORDER": date_order,
+                "PREFER_DAY_OF_MONTH": "first",
+                "RETURN_AS_TIMEZONE_AWARE":
+                True
+            }
+        )

-    logger.debug(f"Execute: {' '.join(command_args)}",
-                 extra={'group': logging_group})
+    date = None

-    if not subprocess.Popen(command_args,
-                            stdout=subprocess.DEVNULL,
-                            stderr=subprocess.DEVNULL).wait() == 0:
-        raise ParseError(f"Unpaper failed at {command_args}")
+    next_year = timezone.now().year + 5  # Arbitrary 5 year future limit

-    return pnm_out
+    # if filename date parsing is enabled, search there first:
+    if settings.FILENAME_DATE_ORDER:
+        for m in re.finditer(DATE_REGEX, filename):
+            date_string = m.group(0)
+
+            try:
+                date = __parser(date_string, settings.FILENAME_DATE_ORDER)
+            except (TypeError, ValueError):
+                # Skip all matches that do not parse to a proper date
+                continue
+
+            if date is not None and next_year > date.year > 1900:
+                return date
+
+    # Iterate through all regex matches in text and try to parse the date
+    for m in re.finditer(DATE_REGEX, text):
+        date_string = m.group(0)
+
+        try:
+            date = __parser(date_string, settings.DATE_ORDER)
+        except (TypeError, ValueError):
+            # Skip all matches that do not parse to a proper date
+            continue
+
+        if date is not None and next_year > date.year > 1900:
+            break
+        else:
+            date = None
+
+    return date


 class ParseError(Exception):
@ -158,26 +196,35 @@ class DocumentParser(LoggingMixin):
    `paperless_tesseract.parsers` for inspiration.
    """

-    def __init__(self, path, logging_group):
+    def __init__(self, logging_group):
        super().__init__()
        self.logging_group = logging_group
-        self.document_path = path
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)

-    def get_thumbnail(self):
+        self.archive_path = None
+        self.text = None
+        self.date = None
+
+    def parse(self, document_path, mime_type):
+        raise NotImplementedError()
+
+    def get_archive_path(self):
+        return self.archive_path
+
+    def get_thumbnail(self, document_path, mime_type):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

-    def optimise_thumbnail(self, in_path):
-
+    def get_optimised_thumbnail(self, document_path, mime_type):
+        thumbnail = self.get_thumbnail(document_path, mime_type)
        if settings.OPTIMIZE_THUMBNAILS:
-            out_path = os.path.join(self.tempdir, "optipng.png")
+            out_path = os.path.join(self.tempdir, "thumb_optipng.png")

            args = (settings.OPTIPNG_BINARY,
-                    "-silent", "-o5", in_path, "-out", out_path)
+                    "-silent", "-o5", thumbnail, "-out", out_path)

            self.log('debug', f"Execute: {' '.join(args)}")

@ -186,97 +233,13 @@ class DocumentParser(LoggingMixin):

            return out_path
        else:
-            return in_path
-
-    def get_optimised_thumbnail(self):
-        return self.optimise_thumbnail(self.get_thumbnail())
+            return thumbnail

    def get_text(self):
-        """
-        Returns the text from the document and only the text.
-        """
-        raise NotImplementedError()
+        return self.text

    def get_date(self):
-        """
-        Returns the date of the document.
-        """
-
-        def __parser(ds, date_order):
-            """
-            Call dateparser.parse with a particular date ordering
-            """
-            return dateparser.parse(
-                ds,
-                settings={
-                    "DATE_ORDER": date_order,
-                    "PREFER_DAY_OF_MONTH": "first",
-                    "RETURN_AS_TIMEZONE_AWARE":
-                    True
-                }
-            )
-
-        date = None
-        date_string = None
-
-        next_year = timezone.now().year + 5  # Arbitrary 5 year future limit
-        title = os.path.basename(self.document_path)
-
-        # if filename date parsing is enabled, search there first:
-        if settings.FILENAME_DATE_ORDER:
-            self.log("info", "Checking document title for date")
-            for m in re.finditer(DATE_REGEX, title):
-                date_string = m.group(0)
-
-                try:
-                    date = __parser(date_string, settings.FILENAME_DATE_ORDER)
-                except (TypeError, ValueError):
-                    # Skip all matches that do not parse to a proper date
-                    continue
-
-                if date is not None and next_year > date.year > 1900:
-                    self.log(
-                        "info",
-                        "Detected document date {} based on string {} "
-                        "from document title"
-                        "".format(date.isoformat(), date_string)
-                    )
-                    return date
-
-        try:
-            # getting text after checking filename will save time if only
-            # looking at the filename instead of the whole text
-            text = self.get_text()
-        except ParseError:
-            return None
-
-        # Iterate through all regex matches in text and try to parse the date
-        for m in re.finditer(DATE_REGEX, text):
-            date_string = m.group(0)
-
-            try:
-                date = __parser(date_string, settings.DATE_ORDER)
-            except (TypeError, ValueError):
-                # Skip all matches that do not parse to a proper date
-                continue
-
-            if date is not None and next_year > date.year > 1900:
-                break
-            else:
-                date = None
-
-        if date is not None:
-            self.log(
-                "info",
-                "Detected document date {} based on string {}".format(
-                    date.isoformat(),
-                    date_string
-                )
-            )
-        else:
-            self.log("info", "Unable to detect date for document")
-
-        return date
+        return self.date

    def cleanup(self):
        self.log("debug", "Deleting directory {}".format(self.tempdir))
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@ -67,19 +67,34 @@ def check_sanity():
                f"Original of document {doc.pk} does not exist."))
        else:
            present_files.remove(os.path.normpath(doc.source_path))
-            checksum = None
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))
+            else:
+                if not checksum == doc.checksum:
+                    messages.append(SanityError(
+                        f"Checksum mismatch of document {doc.pk}. "
+                        f"Stored: {doc.checksum}, actual: {checksum}."
+                    ))

-            if checksum and not checksum == doc.checksum:
+        if os.path.isfile(doc.archive_path):
+            present_files.remove(os.path.normpath(doc.archive_path))
+            try:
+                with doc.archive_file as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+            except OSError as e:
                messages.append(SanityError(
-                    f"Checksum mismatch of document {doc.pk}. "
-                    f"Stored: {doc.checksum}, actual: {checksum}."
+                    f"Cannot read archive file of document {doc.pk}: {e}"
                ))
+            else:
+                # Report the archive's own stored checksum (not the
+                # original's) so the message matches the comparison made.
+                if not checksum == doc.archive_checksum:
+                    messages.append(SanityError(
+                        f"Checksum mismatch of archive {doc.pk}. "
+                        f"Stored: {doc.archive_checksum}, actual: {checksum}."
+                    ))

        if not doc.content:
            messages.append(SanityWarning(
--- a/src/documents/settings.py
+++ b/src/documents/settings.py
@ -2,3 +2,4 @@
 # for exporting/importing commands
 EXPORTER_FILE_NAME = "__exported_file_name__"
 EXPORTER_THUMBNAIL_NAME = "__exported_thumbnail_name__"
+EXPORTER_ARCHIVE_NAME = "__exported_archive_name__"
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@ -13,7 +13,7 @@ from rest_framework.reverse import reverse

 from .. import index, matching
 from ..file_handling import delete_empty_directories, generate_filename, \
-    create_source_path_directory
+    create_source_path_directory, archive_name_from_filename
 from ..models import Document, Tag


@ -169,13 +169,46 @@ def run_post_consume_script(sender, document, **kwargs):

@receiver(models.signals.post_delete, sender=Document)
 def cleanup_document_deletion(sender, instance, using, **kwargs):
-    for f in (instance.source_path, instance.thumbnail_path):
-        try:
-            os.unlink(f)
-        except FileNotFoundError:
-            pass  # The file's already gone, so we're cool with it.
+    for f in (instance.source_path,
+              instance.archive_path,
+              instance.thumbnail_path):
+        if os.path.isfile(f):
+            try:
+                os.unlink(f)
+                logging.getLogger(__name__).debug(
+                    f"Deleted file {f}.")
+            except OSError as e:
+                logging.getLogger(__name__).warning(
+                    f"While deleting document {instance.file_name}, the file "
+                    f"{f} could not be deleted: {e}"
+                )

-    delete_empty_directories(os.path.dirname(instance.source_path))
+    delete_empty_directories(
+        os.path.dirname(instance.source_path),
+        root=settings.ORIGINALS_DIR
+    )
+
+    delete_empty_directories(
+        os.path.dirname(instance.archive_path),
+        root=settings.ARCHIVE_DIR
+    )
+
+
+def validate_move(instance, old_path, new_path):
+    """Tell whether a file of a document may be renamed.
+
+    Returns True only if old_path is an existing file and there is no file
+    at new_path yet. In every other case the reason is logged and False is
+    returned; nothing on disk is touched here.
+    """
+    if os.path.isfile(old_path):
+        if not os.path.isfile(new_path):
+            return True
+
+        # Can't do anything if the new file already exists. Skip updating file.
+        logging.getLogger(__name__).warning(
+            f"Document {str(instance)}: Cannot rename file "
+            f"since target path {new_path} already exists.")
+    else:
+        # Can't do anything if the old file does not exist anymore.
+        logging.getLogger(__name__).fatal(
+            f"Document {str(instance)}: File {old_path} has gone.")
+
+    return False


@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@ -183,55 +216,90 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
 def update_filename_and_move_files(sender, instance, **kwargs):

    if not instance.filename:
-        # Can't update the filename if there is not filename to begin with
-        # This happens after the consumer creates a new document.
-        # The PK needs to be set first by saving the document once. When this
-        # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be
-        # renamed anyway. In all other cases, instance.filename will be set.
+        # Can't update the filename if there is no filename to begin with
+        # This happens when the consumer creates a new document.
+        # The document is modified and saved multiple times, and only after
+        # everything is done (i.e., the generated filename is final),
+        # filename will be set to the location where the consumer has put
+        # the file.
+        #
+        # This will in turn cause this logic to move the file where it belongs.
        return

    old_filename = instance.filename
-    old_path = instance.source_path
    new_filename = generate_filename(instance)

    if new_filename == instance.filename:
        # Don't do anything if its the same.
        return

-    new_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
+    old_source_path = instance.source_path
+    new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)

-    if not os.path.isfile(old_path):
-        # Can't do anything if the old file does not exist anymore.
-        logging.getLogger(__name__).fatal(
-            f"Document {str(instance)}: File {old_path} has gone.")
+    if not validate_move(instance, old_source_path, new_source_path):
        return

-    if os.path.isfile(new_path):
-        # Can't do anything if the new file already exists. Skip updating file.
-        logging.getLogger(__name__).warning(
-            f"Document {str(instance)}: Cannot rename file "
-            f"since target path {new_path} already exists.")
-        return
+    # archive files are optional, archive checksum tells us if we have one,
+    # since this is None for documents without archived files.
+    if instance.archive_checksum:
+        new_archive_filename = archive_name_from_filename(new_filename)
+        old_archive_path = instance.archive_path
+        new_archive_path = os.path.join(settings.ARCHIVE_DIR,
+                                        new_archive_filename)

-    create_source_path_directory(new_path)
+        if not validate_move(instance, old_archive_path, new_archive_path):
+            return
+
+        create_source_path_directory(new_archive_path)
+    else:
+        old_archive_path = None
+        new_archive_path = None
+
+    create_source_path_directory(new_source_path)

    try:
-        os.rename(old_path, new_path)
+        os.rename(old_source_path, new_source_path)
+        if instance.archive_checksum:
+            os.rename(old_archive_path, new_archive_path)
        instance.filename = new_filename
        # Don't save here to prevent infinite recursion.
        Document.objects.filter(pk=instance.pk).update(filename=new_filename)

        logging.getLogger(__name__).debug(
-            f"Moved file {old_path} to {new_path}.")
+            f"Moved file {old_source_path} to {new_source_path}.")
+
+        # Only documents with an archived version have archive paths; for
+        # all others both paths are None and there is nothing to report.
+        if instance.archive_checksum:
+            logging.getLogger(__name__).debug(
+                f"Moved file {old_archive_path} to {new_archive_path}.")

    except OSError as e:
        instance.filename = old_filename
+        # This happens when we can't move a file. If that's the case for the
+        # archive file, we try our best to revert the changes.
+        try:
+            os.rename(new_source_path, old_source_path)
+            # The archive paths are None for documents without an archived
+            # version, so only touch them when an archive exists.
+            if instance.archive_checksum:
+                os.rename(new_archive_path, old_archive_path)
+        except OSError:
+            # This is fine, since:
+            # A: if we managed to move source from A to B, we will also manage
+            #  to move it from B to A. If not, we have a serious issue
+            #  that's going to get caught by the sanity checker.
+            #  All files remain in place and will never be overwritten,
+            #  so this is not the end of the world.
+            # B: if moving the original file failed, nothing has changed
+            #  anyway.
+            pass
    except DatabaseError as e:
-        os.rename(new_path, old_path)
+        os.rename(new_source_path, old_source_path)
+        if instance.archive_checksum:
+            os.rename(new_archive_path, old_archive_path)
        instance.filename = old_filename

-    if not os.path.isfile(old_path):
-        delete_empty_directories(os.path.dirname(old_path))
+    if not os.path.isfile(old_source_path):
+        delete_empty_directories(os.path.dirname(old_source_path),
+                                 root=settings.ORIGINALS_DIR)
+
+    if old_archive_path and not os.path.isfile(old_archive_path):
+        delete_empty_directories(os.path.dirname(old_archive_path),
+                                 root=settings.ARCHIVE_DIR)


 def set_log_entry(sender, document=None, logging_group=None, **kwargs):
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@ -12,7 +12,9 @@ from documents.sanity_checker import SanityFailedError


 def index_optimize():
-    index.open_index().optimize()
+    ix = index.open_index()
+    writer = AsyncWriter(ix)
+    writer.commit(optimize=True)


 def index_reindex():
--- a/src/paperless_tesseract/tests/samples/no-text.png
+++ b/src/paperless_tesseract/tests/samples/no-text.png
--- a/src/documents/tests/samples/documents/archive/0000001.pdf
+++ b/src/documents/tests/samples/documents/archive/0000001.pdf
--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@ -100,6 +100,44 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.content, content_thumbnail)

+    def test_download_with_archive(self):
+        """Serve the archive by default and the original on request.
+
+        Both the download and the preview endpoint must return the archived
+        version unless ``?original=true`` is passed.
+        """
+        # mkstemp returns an already opened OS-level handle. Close it right
+        # away so it is not leaked; the file is reopened by name below.
+        fd, filename = tempfile.mkstemp(dir=self.dirs.originals_dir)
+        os.close(fd)
+
+        content = b"This is a test"
+        content_archive = b"This is the same test but archived"
+
+        with open(filename, "wb") as f:
+            f.write(content)
+
+        filename = os.path.basename(filename)
+
+        doc = Document.objects.create(title="none", filename=filename,
+                                      mime_type="application/pdf")
+
+        with open(doc.archive_path, "wb") as f:
+            f.write(content_archive)
+
+        # URL template -> body the endpoint is expected to deliver.
+        expectations = (
+            ('/api/documents/{}/download/', content_archive),
+            ('/api/documents/{}/download/?original=true', content),
+            ('/api/documents/{}/preview/', content_archive),
+            ('/api/documents/{}/preview/?original=true', content),
+        )
+
+        for url, expected in expectations:
+            with self.subTest(url=url):
+                response = self.client.get(url.format(doc.pk))
+
+                self.assertEqual(response.status_code, 200)
+                self.assertEqual(response.content, expected)
+
+
    def test_document_actions_not_existing_file(self):

        doc = Document.objects.create(title="none", filename=os.path.basename("asd"), mime_type="application/pdf")
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@ -1,5 +1,6 @@
 import os
 import re
+import shutil
 import tempfile
 from unittest import mock
 from unittest.mock import MagicMock
@ -364,35 +365,36 @@ class TestFieldPermutations(TestCase):

 class DummyParser(DocumentParser):

-    def get_thumbnail(self):
+    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

-    def __init__(self, path, logging_group, scratch_dir):
-        super(DummyParser, self).__init__(path, logging_group)
+    def __init__(self, logging_group, scratch_dir, archive_path):
+        super(DummyParser, self).__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
+        self.archive_path = archive_path

-    def get_optimised_thumbnail(self):
+    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

-    def get_text(self):
-        return "The Text"
+    def parse(self, document_path, mime_type):
+        self.text = "The Text"


 class FaultyParser(DocumentParser):

-    def get_thumbnail(self):
+    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

-    def __init__(self, path, logging_group, scratch_dir):
-        super(FaultyParser, self).__init__(path, logging_group)
+    def __init__(self, logging_group, scratch_dir):
+        super(FaultyParser, self).__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)

-    def get_optimised_thumbnail(self):
+    def get_optimised_thumbnail(self, document_path, mime_type):
        return self.fake_thumb

-    def get_text(self):
+    def parse(self, document_path, mime_type):
        raise ParseError("Does not compute.")


@ -410,11 +412,11 @@ def fake_magic_from_file(file, mime=False):
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
 class TestConsumer(DirectoriesMixin, TestCase):

-    def make_dummy_parser(self, path, logging_group):
-        return DummyParser(path, logging_group, self.dirs.scratch_dir)
+    def make_dummy_parser(self, logging_group):
+        return DummyParser(logging_group, self.dirs.scratch_dir, self.get_test_archive_file())

-    def make_faulty_parser(self, path, logging_group):
-        return FaultyParser(path, logging_group, self.dirs.scratch_dir)
+    def make_faulty_parser(self, logging_group):
+        return FaultyParser(logging_group, self.dirs.scratch_dir)

    def setUp(self):
        super(TestConsumer, self).setUp()
@ -432,8 +434,16 @@ class TestConsumer(DirectoriesMixin, TestCase):
        self.consumer = Consumer()

    def get_test_file(self):
-        fd, f = tempfile.mkstemp(suffix=".pdf", dir=self.dirs.scratch_dir)
-        return f
+        src = os.path.join(os.path.dirname(__file__), "samples", "documents", "originals", "0000001.pdf")
+        dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
+        shutil.copy(src, dst)
+        return dst
+
+    def get_test_archive_file(self):
+        src = os.path.join(os.path.dirname(__file__), "samples", "documents", "archive", "0000001.pdf")
+        dst = os.path.join(self.dirs.scratch_dir, "sample_archive.pdf")
+        shutil.copy(src, dst)
+        return dst

    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
    def testNormalOperation(self):
@ -455,6 +465,13 @@ class TestConsumer(DirectoriesMixin, TestCase):
            document.thumbnail_path
        ))

+        self.assertTrue(os.path.isfile(
+            document.archive_path
+        ))
+
+        self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
+        self.assertEqual(document.archive_checksum, "62acb0bcbfbcaa62ca6ad3668e4e404b")
+
        self.assertFalse(os.path.isfile(filename))

    def testOverrideFilename(self):
@ -502,7 +519,7 @@ class TestConsumer(DirectoriesMixin, TestCase):

        self.fail("Should throw exception")

-    def testDuplicates(self):
+    def testDuplicates1(self):
        self.consumer.try_consume_file(self.get_test_file())

        try:
@ -513,6 +530,21 @@ class TestConsumer(DirectoriesMixin, TestCase):

        self.fail("Should throw exception")

+    def testDuplicates2(self):
+        self.consumer.try_consume_file(self.get_test_file())
+
+        try:
+            self.consumer.try_consume_file(self.get_test_archive_file())
+        except ConsumerError as e:
+            self.assertTrue(str(e).endswith("It is a duplicate."))
+            return
+
+        self.fail("Should throw exception")
+
+    def testDuplicates3(self):
+        self.consumer.try_consume_file(self.get_test_archive_file())
+        self.consumer.try_consume_file(self.get_test_file())
+
    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testNoParsers(self, m):
        m.return_value = []
--- a/src/documents/tests/test_date_parsing.py
+++ b/src/documents/tests/test_date_parsing.py
@ -0,0 +1,140 @@
+import datetime
+import os
+import shutil
+from unittest import mock
+from uuid import uuid4
+
+from dateutil import tz
+from django.conf import settings
+from django.test import TestCase, override_settings
+
+from documents.parsers import parse_date
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class TestDate(TestCase):
+
+    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "../../paperless_tesseract/tests/samples")
+    SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
+
+    def setUp(self):
+        os.makedirs(self.SCRATCH, exist_ok=True)
+
+    def tearDown(self):
+        shutil.rmtree(self.SCRATCH)
+
+    def test_date_format_1(self):
+        text = "lorem ipsum 130218 lorem ipsum"
+        self.assertEqual(parse_date("", text), None)
+
+    def test_date_format_2(self):
+        text = "lorem ipsum 2018 lorem ipsum"
+        self.assertEqual(parse_date("", text), None)
+
+    def test_date_format_3(self):
+        text = "lorem ipsum 20180213 lorem ipsum"
+        self.assertEqual(parse_date("", text), None)
+
+    def test_date_format_4(self):
+        text = "lorem ipsum 13.02.2018 lorem ipsum"
+        date = parse_date("", text)
+        self.assertEqual(
+            date,
+            datetime.datetime(
+                2018, 2, 13, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )
+
+    def test_date_format_5(self):
+        text = (
+            "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
+            "ipsum"
+        )
+        date = parse_date("", text)
+        self.assertEqual(
+            date,
+            datetime.datetime(
+                2018, 2, 13, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )
+
+    def test_date_format_6(self):
+        text = (
+            "lorem ipsum\n"
+            "Wohnort\n"
+            "3100\n"
+            "IBAN\n"
+            "AT87 4534\n"
+            "1234\n"
+            "1234 5678\n"
+            "BIC\n"
+            "lorem ipsum"
+        )
+        self.assertEqual(parse_date("", text), None)
+
+    def test_date_format_7(self):
+        text = (
+            "lorem ipsum\n"
+            "März 2019\n"
+            "lorem ipsum"
+        )
+        date = parse_date("", text)
+        self.assertEqual(
+            date,
+            datetime.datetime(
+                2019, 3, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )
+
+    def test_date_format_8(self):
+        text = (
+            "lorem ipsum\n"
+            "Wohnort\n"
+            "3100\n"
+            "IBAN\n"
+            "AT87 4534\n"
+            "1234\n"
+            "1234 5678\n"
+            "BIC\n"
+            "lorem ipsum\n"
+            "März 2020"
+        )
+        self.assertEqual(
+            parse_date("", text),
+            datetime.datetime(
+                2020, 3, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )
+
+    @override_settings(SCRATCH_DIR=SCRATCH)
+    def test_date_format_9(self):
+        text = (
+            "lorem ipsum\n"
+            "27. Nullmonth 2020\n"
+            "März 2020\n"
+            "lorem ipsum"
+        )
+        self.assertEqual(
+            parse_date("", text),
+            datetime.datetime(
+                2020, 3, 1, 0, 0,
+                tzinfo=tz.gettz(settings.TIME_ZONE)
+            )
+        )
+
+    def test_crazy_date_past(self, *args):
+        self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))
+
+    def test_crazy_date_future(self, *args):
+        self.assertIsNone(parse_date("", "01-07-2350 00:00:00"))
+
+    def test_crazy_date_with_spaces(self, *args):
+        self.assertIsNone(parse_date("", "20 408000l 2475"))
+
+    @override_settings(FILENAME_DATE_ORDER="YMD")
+    def test_filename_date_parse_invalid(self, *args):
+        self.assertIsNone(parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"))
--- a/src/documents/tests/test_document_model.py
+++ b/src/documents/tests/test_document_model.py
@ -1,12 +1,29 @@
+import os
+import shutil
+import tempfile
+from pathlib import Path
 from unittest import mock

-from django.test import TestCase
+from django.test import TestCase, override_settings

 from ..models import Document, Correspondent


 class TestDocument(TestCase):

+    def setUp(self) -> None:
+        """Point the originals and thumbnail directories at temp folders."""
+        self.originals_dir = tempfile.mkdtemp()
+        self.thumb_dir = tempfile.mkdtemp()
+
+        # Keep a handle on the override so tearDown can undo it; otherwise
+        # the temporary directories stay configured for later tests.
+        self._settings_override = override_settings(
+            ORIGINALS_DIR=self.originals_dir,
+            THUMBNAIL_DIR=self.thumb_dir,
+        )
+        self._settings_override.enable()
+
+    def tearDown(self) -> None:
+        """Restore the settings and remove the temporary directories."""
+        self._settings_override.disable()
+        shutil.rmtree(self.originals_dir)
+        shutil.rmtree(self.thumb_dir)
+
    def test_file_deletion(self):
        document = Document.objects.create(
            correspondent=Correspondent.objects.create(name="Test0"),
@ -19,6 +36,9 @@ class TestDocument(TestCase):
        file_path = document.source_path
        thumb_path = document.thumbnail_path

+        Path(file_path).touch()
+        Path(thumb_path).touch()
+
        with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
            document.delete()
            mock_unlink.assert_any_call(file_path)
--- a/src/documents/tests/test_file_handling.py
+++ b/src/documents/tests/test_file_handling.py
@ -2,32 +2,17 @@ import os
 import shutil
 from pathlib import Path
 from unittest import mock
-from uuid import uuid4

 from django.conf import settings
 from django.db import DatabaseError
 from django.test import TestCase, override_settings

+from .utils import DirectoriesMixin
 from ..file_handling import generate_filename, create_source_path_directory, delete_empty_directories
 from ..models import Document, Correspondent


-class TestDate(TestCase):
-    deletion_list = []
-
-    def add_to_deletion_list(self, dirname):
-        self.deletion_list.append(dirname)
-
-    def setUp(self):
-        folder = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
-        os.makedirs(folder + "/documents/originals")
-        override_settings(MEDIA_ROOT=folder).enable()
-        override_settings(ORIGINALS_DIR=folder + "/documents/originals").enable()
-        self.add_to_deletion_list(folder)
-
-    def tearDown(self):
-        for dirname in self.deletion_list:
-            shutil.rmtree(dirname, ignore_errors=True)
+class TestFileHandling(DirectoriesMixin, TestCase):

    @override_settings(PAPERLESS_FILENAME_FORMAT="")
    def test_generate_source_filename(self):
@ -104,7 +89,7 @@ class TestDate(TestCase):
        document.save()

        # Check proper handling of files
-        self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
+        self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
        self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

        os.chmod(settings.ORIGINALS_DIR + "/none", 0o777)
@ -140,7 +125,7 @@ class TestDate(TestCase):

            # Check proper handling of files
            self.assertTrue(os.path.isfile(document.source_path))
-            self.assertEqual(os.path.isfile(settings.MEDIA_ROOT + "/documents/originals/none/none-{:07d}.pdf".format(document.pk)), True)
+            self.assertEqual(os.path.isfile(settings.ORIGINALS_DIR + "/none/none-{:07d}.pdf".format(document.pk)), True)
            self.assertEqual(document.filename, "none/none-{:07d}.pdf".format(document.pk))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{correspondent}")
@ -196,8 +181,8 @@ class TestDate(TestCase):
        document.save()

        # Check proper handling of files
-        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/test"), True)
-        self.assertEqual(os.path.isdir(settings.MEDIA_ROOT + "/documents/originals/none"), True)
+        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/test"), True)
+        self.assertEqual(os.path.isdir(settings.ORIGINALS_DIR + "/none"), True)
        self.assertTrue(os.path.isfile(important_file))

    @override_settings(PAPERLESS_FILENAME_FORMAT="{tags[type]}")
@ -315,13 +300,12 @@ class TestDate(TestCase):
        # Create our working directory
        tmp = os.path.join(settings.ORIGINALS_DIR, "test_delete_empty")
        os.makedirs(tmp)
-        self.add_to_deletion_list(tmp)

        os.makedirs(os.path.join(tmp, "notempty"))
        Path(os.path.join(tmp, "notempty", "file")).touch()
        os.makedirs(os.path.join(tmp, "notempty", "empty"))

-        delete_empty_directories(os.path.join(tmp, "notempty", "empty"))
+        delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR)
        self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
        self.assertEqual(os.path.isfile(
            os.path.join(tmp, "notempty", "file")), True)
@ -345,3 +329,159 @@ class TestDate(TestCase):
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        self.assertEqual(generate_filename(document), "0000001.pdf")
+
+
+class TestFileHandlingWithArchive(DirectoriesMixin, TestCase):
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT=None)
+    def test_create_no_format(self):
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        Path(archive).touch()
+        doc = Document.objects.create(mime_type="application/pdf", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        self.assertTrue(os.path.isfile(original))
+        self.assertTrue(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    def test_create_with_format(self):
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        Path(archive).touch()
+        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        self.assertFalse(os.path.isfile(original))
+        self.assertFalse(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
+        self.assertEqual(doc.source_path, os.path.join(settings.ORIGINALS_DIR, "none", "my_doc-0000001.pdf"))
+        self.assertEqual(doc.archive_path, os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf"))
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    def test_move_archive_gone(self):
+        """Nothing is moved when the archive file is missing on disk."""
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        # The archive file is deliberately not created, although the document
+        # below claims to have one (archive_checksum is set).
+        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        # The original is still at its initial location and no archive file
+        # has appeared anywhere.
+        self.assertTrue(os.path.isfile(original))
+        self.assertFalse(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertFalse(os.path.isfile(doc.archive_path))
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    def test_move_archive_exists(self):
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        Path(archive).touch()
+        os.makedirs(os.path.join(settings.ARCHIVE_DIR, "none"))
+        Path(os.path.join(settings.ARCHIVE_DIR, "none", "my_doc-0000001.pdf")).touch()
+        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        self.assertTrue(os.path.isfile(original))
+        self.assertTrue(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    @mock.patch("documents.signals.handlers.os.rename")
+    def test_move_archive_error(self, m):
+
+        def fake_rename(src, dst):
+            if "archive" in src:
+                raise OSError()
+            else:
+                os.remove(src)
+                Path(dst).touch()
+
+        m.side_effect = fake_rename
+
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        Path(archive).touch()
+        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        self.assertTrue(os.path.isfile(original))
+        self.assertTrue(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    def test_move_file_gone(self):
+        """Nothing is moved when the original file is missing on disk."""
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        # The original file is deliberately not created; only the archive
+        # exists on disk.
+        Path(archive).touch()
+        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        # The original is still absent and the archive is still at its
+        # initial location.
+        self.assertFalse(os.path.isfile(original))
+        self.assertTrue(os.path.isfile(archive))
+        self.assertFalse(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    @mock.patch("documents.signals.handlers.os.rename")
+    def test_move_file_error(self, m):
+
+        def fake_rename(src, dst):
+            if "original" in src:
+                raise OSError()
+            else:
+                os.remove(src)
+                Path(dst).touch()
+
+        m.side_effect = fake_rename
+
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        Path(archive).touch()
+        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        self.assertTrue(os.path.isfile(original))
+        self.assertTrue(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
+
+    def test_archive_deleted(self):
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        Path(archive).touch()
+        doc = Document.objects.create(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+
+        self.assertTrue(os.path.isfile(original))
+        self.assertTrue(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
+
+        doc.delete()
+
+        self.assertFalse(os.path.isfile(original))
+        self.assertFalse(os.path.isfile(archive))
+        self.assertFalse(os.path.isfile(doc.source_path))
+        self.assertFalse(os.path.isfile(doc.archive_path))
+
+    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
+    def test_database_error(self):
+
+        original = os.path.join(settings.ORIGINALS_DIR, "0000001.pdf")
+        archive = os.path.join(settings.ARCHIVE_DIR, "0000001.pdf")
+        Path(original).touch()
+        Path(archive).touch()
+        doc = Document(mime_type="application/pdf", title="my_doc", filename="0000001.pdf", checksum="A", archive_checksum="B")
+        with mock.patch("documents.signals.handlers.Document.objects.filter") as m:
+            m.side_effect = DatabaseError()
+            doc.save()
+
+        self.assertTrue(os.path.isfile(original))
+        self.assertTrue(os.path.isfile(archive))
+        self.assertTrue(os.path.isfile(doc.source_path))
+        self.assertTrue(os.path.isfile(doc.archive_path))
--- a/src/documents/tests/test_management_archiver.py
+++ b/src/documents/tests/test_management_archiver.py
@ -0,0 +1,42 @@
+import filecmp
+import os
+import shutil
+
+from django.core.management import call_command
+from django.test import TestCase
+
+from documents.management.commands.document_archiver import handle_document
+from documents.models import Document
+from documents.tests.utils import DirectoriesMixin
+
+
+sample_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
+
+
+class TestArchiver(DirectoriesMixin, TestCase):
+    """Run the document archiver against a single sample PDF."""
+
+    def make_models(self):
+        # A single PDF document with primary key 1.
+        self.d1 = Document.objects.create(
+            checksum="A", title="A", content="first document", pk=1,
+            mime_type="application/pdf")
+
+    def test_archiver(self):
+        # Smoke test: the management command must run without raising.
+        target = os.path.join(self.dirs.originals_dir, "0000001.pdf")
+        shutil.copy(sample_file, target)
+        self.make_models()
+        call_command('document_archiver')
+
+    def test_handle_document(self):
+        # Archiving creates the archive file and leaves the original intact.
+        target = os.path.join(self.dirs.originals_dir, "0000001.pdf")
+        shutil.copy(sample_file, target)
+        self.make_models()
+        handle_document(self.d1)
+        archived = Document.objects.get(id=self.d1.id)
+        self.assertIsNotNone(archived.checksum)
+        self.assertTrue(os.path.isfile(archived.archive_path))
+        self.assertTrue(os.path.isfile(archived.source_path))
+        self.assertTrue(filecmp.cmp(sample_file, archived.source_path))
--- a/src/documents/tests/test_management_exporter.py
+++ b/src/documents/tests/test_management_exporter.py
@ -23,10 +23,7 @@ class TestExporter(DirectoriesMixin, TestCase):

        file = os.path.join(self.dirs.originals_dir, "0000001.pdf")

-        with open(file, "rb") as f:
-            checksum = hashlib.md5(f.read()).hexdigest()
-
-        Document.objects.create(checksum=checksum, title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
+        Document.objects.create(checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf")
        Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
        Tag.objects.create(name="t")
        DocumentType.objects.create(name="dt")
@ -51,6 +48,14 @@ class TestExporter(DirectoriesMixin, TestCase):
                    checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['checksum'])

+                if document_exporter.EXPORTER_ARCHIVE_NAME in element:
+                    fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
+                    self.assertTrue(os.path.exists(fname))
+
+                    with open(fname, "rb") as f:
+                        checksum = hashlib.md5(f.read()).hexdigest()
+                    self.assertEqual(checksum, element['fields']['archive_checksum'])
+
        Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf")

        self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target)
--- a/src/documents/tests/test_parsers.py
+++ b/src/documents/tests/test_parsers.py
@ -1,11 +1,13 @@
 import os
+import shutil
+import tempfile
 from tempfile import TemporaryDirectory
 from unittest import mock

-from django.test import TestCase
+from django.test import TestCase, override_settings

 from documents.parsers import get_parser_class, get_supported_file_extensions, get_default_file_extension, \
-    get_parser_class_for_mime_type
+    get_parser_class_for_mime_type, DocumentParser
 from paperless_tesseract.parsers import RasterisedDocumentParser
 from paperless_text.parsers import TextDocumentParser

@ -66,6 +68,38 @@ class TestParserDiscovery(TestCase):
            )


+def fake_get_thumbnail(self, path, mimetype):
+    """Stand-in for DocumentParser.get_thumbnail; ignores its arguments."""
+    here = os.path.dirname(__file__)
+    return os.path.join(here, "examples", "no-text.png")
+
+
+class TestBaseParser(TestCase):
+    """Tests for ``DocumentParser.get_optimised_thumbnail``."""
+
+    def setUp(self) -> None:
+        # Give every test its own scratch directory. The override object is
+        # kept so that tearDown can switch it off again.
+        self.scratch = tempfile.mkdtemp()
+        self._scratch_override = override_settings(SCRATCH_DIR=self.scratch)
+        self._scratch_override.enable()
+
+    def tearDown(self) -> None:
+        # Restore the previous SCRATCH_DIR before deleting the directory;
+        # otherwise later tests would keep using a path that no longer exists.
+        self._scratch_override.disable()
+        shutil.rmtree(self.scratch)
+
+    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
+    @override_settings(OPTIMIZE_THUMBNAILS=True)
+    def test_get_optimised_thumbnail(self):
+        # Smoke test: with optimisation enabled the call must not raise.
+        parser = DocumentParser(None)
+
+        parser.get_optimised_thumbnail("any", "not important")
+
+    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
+    @override_settings(OPTIMIZE_THUMBNAILS=False)
+    def test_get_optimised_thumb_disabled(self):
+        # With optimisation disabled the thumbnail path is returned unchanged.
+        parser = DocumentParser(None)
+
+        path = parser.get_optimised_thumbnail("any", "not important")
+        self.assertEqual(path, fake_get_thumbnail(None, None, None))
+
+
 class TestParserAvailability(TestCase):

    def test_file_extensions(self):
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@ -17,10 +17,12 @@ def setup_directories():
    dirs.index_dir = os.path.join(dirs.data_dir, "index")
    dirs.originals_dir = os.path.join(dirs.media_dir, "documents", "originals")
    dirs.thumbnail_dir = os.path.join(dirs.media_dir, "documents", "thumbnails")
+    dirs.archive_dir = os.path.join(dirs.media_dir, "documents", "archive")

    os.makedirs(dirs.index_dir, exist_ok=True)
    os.makedirs(dirs.originals_dir, exist_ok=True)
    os.makedirs(dirs.thumbnail_dir, exist_ok=True)
+    os.makedirs(dirs.archive_dir, exist_ok=True)

    override_settings(
        DATA_DIR=dirs.data_dir,
@ -28,6 +30,7 @@ def setup_directories():
        MEDIA_ROOT=dirs.media_dir,
        ORIGINALS_DIR=dirs.originals_dir,
        THUMBNAIL_DIR=dirs.thumbnail_dir,
+        ARCHIVE_DIR=dirs.archive_dir,
        CONSUMPTION_DIR=dirs.consumption_dir,
        INDEX_DIR=dirs.index_dir,
        MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle")
--- a/src/documents/views.py
+++ b/src/documents/views.py
@ -1,3 +1,5 @@
+import os
+
 from django.db.models import Count, Max
 from django.http import HttpResponse, HttpResponseBadRequest, Http404
 from django.views.decorators.cache import cache_control
@ -126,17 +128,30 @@ class DocumentViewSet(RetrieveModelMixin,
        index.remove_document_from_index(self.get_object())
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

-    def file_response(self, pk, disposition):
+    @staticmethod
+    def original_requested(request):
+        """Return True if the query string contains ``original=true``."""
+        return request.query_params.get('original') == 'true'
+
+    def file_response(self, pk, request, disposition):
+        # Build the HTTP response that serves the file of document `pk`.
+        # `disposition` becomes the Content-Disposition type; the callers in
+        # this class pass "inline" or "attachment".
        doc = Document.objects.get(id=pk)
-
-        if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
-            file_handle = doc.source_file
+        # Serve the archived PDF when it exists on disk, unless the request
+        # explicitly asks for the original file.
+        if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501
+            file_handle = doc.archive_file
+            filename = doc.archive_file_name
+            mime_type = 'application/pdf'
        else:
-            file_handle = GnuPG.decrypted(doc.source_file)
+            file_handle = doc.source_file
+            filename = doc.file_name
+            mime_type = doc.mime_type

-        response = HttpResponse(file_handle, content_type=doc.mime_type)
+        # Decryption applies to whichever file was selected above.
+        if doc.storage_type == Document.STORAGE_TYPE_GPG:
+            file_handle = GnuPG.decrypted(file_handle)
+
+        response = HttpResponse(file_handle, content_type=mime_type)
        response["Content-Disposition"] = '{}; filename="{}"'.format(
-            disposition, doc.file_name)
+            disposition, filename)
        return response

    @action(methods=['post'], detail=False)
@ -157,6 +172,8 @@ class DocumentViewSet(RetrieveModelMixin,
                "paperless__checksum": doc.checksum,
                "paperless__mime_type": doc.mime_type,
                "paperless__filename": doc.filename,
+                "paperless__has_archive_version":
+                    os.path.isfile(doc.archive_path)
            })
        except Document.DoesNotExist:
            raise Http404()
@ -164,7 +181,8 @@ class DocumentViewSet(RetrieveModelMixin,
    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        try:
-            response = self.file_response(pk, "inline")
+            response = self.file_response(
+                pk, request, "inline")
            return response
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()
@ -181,7 +199,8 @@ class DocumentViewSet(RetrieveModelMixin,
    @action(methods=['get'], detail=True)
    def download(self, request, pk=None):
        try:
-            return self.file_response(pk, "attachment")
+            return self.file_response(
+                pk, request, "attachment")
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()

--- a/src/paperless/checks.py
+++ b/src/paperless/checks.py
@ -57,7 +57,6 @@ def binaries_check(app_configs, **kwargs):
    binaries = (
        settings.CONVERT_BINARY,
        settings.OPTIPNG_BINARY,
-        settings.UNPAPER_BINARY,
        "tesseract"
    )

--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta

 MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media"))
 ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals")
+ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive")
 THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")

 DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data"))
@ -348,9 +349,17 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0))
 # documents.  It should be a 3-letter language code consistent with ISO 639.
 OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")

+# Output type of the archive PDF; any OCRmyPDF --output-type value is valid.
+# TODO: validate this setting.
+OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")

-# OCR all documents?
-OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false")
+# One of: skip, skip_noarchive, redo, force.
+# TODO: validate this.
+OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
+
+OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
+
+OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")

 # GNUPG needs a home directory for some reason
 GNUPG_HOME = os.getenv("HOME", "/tmp")
@ -359,11 +368,10 @@ GNUPG_HOME = os.getenv("HOME", "/tmp")
 CONVERT_BINARY = os.getenv("PAPERLESS_CONVERT_BINARY", "convert")
 CONVERT_TMPDIR = os.getenv("PAPERLESS_CONVERT_TMPDIR")
 CONVERT_MEMORY_LIMIT = os.getenv("PAPERLESS_CONVERT_MEMORY_LIMIT")
-CONVERT_DENSITY = int(os.getenv("PAPERLESS_CONVERT_DENSITY", 300))

 GS_BINARY = os.getenv("PAPERLESS_GS_BINARY", "gs")
+
 OPTIPNG_BINARY = os.getenv("PAPERLESS_OPTIPNG_BINARY", "optipng")
-UNPAPER_BINARY = os.getenv("PAPERLESS_UNPAPER_BINARY", "unpaper")


 # Pre-2.x versions of Paperless stored your documents locally with GPG
--- a/src/paperless_tesseract/checks.py
+++ b/src/paperless_tesseract/checks.py
@ -14,12 +14,21 @@ def get_tesseract_langs():

@register()
 def check_default_language_available(app_configs, **kwargs):
+    # System check: every language named in settings.OCR_LANGUAGE (several
+    # may be joined with "+") must be among those returned by
+    # get_tesseract_langs(). Returns a list of check messages; empty if fine.
-    langs = get_tesseract_langs()
+    installed_langs = get_tesseract_langs()

-    if settings.OCR_LANGUAGE not in langs:
-        return [Error(
-            f"The default ocr language {settings.OCR_LANGUAGE} is "
-            f"not installed. Paperless cannot OCR your documents "
-            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
-    else:
-        return []
+    if not settings.OCR_LANGUAGE:
+        # NOTE(review): Warning must be django.core.checks.Warning, not the
+        # builtin -- its import is not visible in this hunk; confirm.
+        return [Warning(
+            "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
+            "This means that tesseract will fallback to english."
+        )]
+
+    specified_langs = settings.OCR_LANGUAGE.split("+")
+
+    # Only the first missing language is reported.
+    for lang in specified_langs:
+        if lang not in installed_langs:
+            return [Error(
+                f"The selected ocr language {lang} is "
+                f"not installed. Paperless cannot OCR your documents "
+                f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")]
+
+    return []
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@ -1,23 +1,15 @@
-import itertools
+import json
 import os
 import re
 import subprocess
-from multiprocessing.pool import ThreadPool

-import langdetect
+import ocrmypdf
 import pdftotext
-import pyocr
 from PIL import Image
 from django.conf import settings
-from pyocr import PyocrException
+from ocrmypdf import InputFileError

-from documents.parsers import DocumentParser, ParseError, run_unpaper, \
-    run_convert
-from .languages import ISO639
-
-
-class OCRError(Exception):
-    pass
+from documents.parsers import DocumentParser, ParseError, run_convert


 class RasterisedDocumentParser(DocumentParser):
@ -26,11 +18,7 @@ class RasterisedDocumentParser(DocumentParser):
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

-    def __init__(self, path, logging_group):
-        super().__init__(path, logging_group)
-        self._text = None
-
-    def get_thumbnail(self):
+    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """
@ -44,7 +32,7 @@ class RasterisedDocumentParser(DocumentParser):
                        alpha="remove",
                        strip=True,
                        trim=True,
-                        input_file="{}[0]".format(self.document_path),
+                        input_file="{}[0]".format(document_path),
                        output_file=out_path,
                        logging_group=self.logging_group)
        except ParseError:
@ -59,7 +47,7 @@ class RasterisedDocumentParser(DocumentParser):
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
-                   self.document_path]
+                   document_path]
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
@ -74,169 +62,126 @@ class RasterisedDocumentParser(DocumentParser):

        return out_path

-    def _is_ocred(self):
-
-        # Extract text from PDF using pdftotext
-        text = get_text_from_pdf(self.document_path)
-
-        # We assume, that a PDF with at least 50 characters contains text
-        # (so no OCR required)
-        return len(text) > 50
-
-    def get_text(self):
-
-        if self._text is not None:
-            return self._text
-
-        if not settings.OCR_ALWAYS and self._is_ocred():
-            self.log("debug", "Skipping OCR, using Text from PDF")
-            self._text = get_text_from_pdf(self.document_path)
-            return self._text
-
-        images = self._get_greyscale()
-
-        if not images:
-            raise ParseError("Empty document, nothing to do.")
+    def is_image(self, mime_type):
+        """Return True for the image MIME types checked here (PNG, JPEG)."""
+        image_types = ("image/png", "image/jpeg")
+        return mime_type in image_types

+    def get_dpi(self, image):
        try:
-
-            sample_page_index = int(len(images) / 2)
-            self.log(
-                "debug",
-                f"Attempting language detection on page "
-                f"{sample_page_index + 1} of {len(images)}...")
-
-            sample_page_text = self._ocr([images[sample_page_index]],
-                                         settings.OCR_LANGUAGE)[0]
-            guessed_language = self._guess_language(sample_page_text)
-
-            if not guessed_language or guessed_language not in ISO639:
-                self.log("warning", "Language detection failed.")
-                ocr_pages = self._complete_ocr_default_language(
-                    images, sample_page_index, sample_page_text)
-
-            elif ISO639[guessed_language] == settings.OCR_LANGUAGE:
-                self.log(
-                    "debug",
-                    f"Detected language: {guessed_language} "
-                    f"(default language)")
-                ocr_pages = self._complete_ocr_default_language(
-                    images, sample_page_index, sample_page_text)
-
-            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501
-                self.log(
-                    "warning",
-                    f"Detected language {guessed_language} is not available "
-                    f"on this system.")
-                ocr_pages = self._complete_ocr_default_language(
-                    images, sample_page_index, sample_page_text)
-
-            else:
-                self.log("debug", f"Detected language: {guessed_language}")
-                ocr_pages = self._ocr(images, ISO639[guessed_language])
-
-            self.log("debug", "OCR completed.")
-            self._text = strip_excess_whitespace(" ".join(ocr_pages))
-            return self._text
-
-        except OCRError as e:
-            raise ParseError(e)
-
-    def _get_greyscale(self):
-        """
-        Greyscale images are easier for Tesseract to OCR
-        """
-
-        # Convert PDF to multiple PNMs
-        input_file = self.document_path
-
-        if settings.OCR_PAGES == 1:
-            input_file += "[0]"
-        elif settings.OCR_PAGES > 1:
-            input_file += f"[0-{settings.OCR_PAGES - 1}]"
-
-        self.log(
-            "debug",
-            f"Converting document {input_file} into greyscale images")
-
-        output_files = os.path.join(self.tempdir, "convert-%04d.pnm")
-
-        run_convert(density=settings.CONVERT_DENSITY,
-                    depth="8",
-                    type="grayscale",
-                    input_file=input_file,
-                    output_file=output_files,
-                    logging_group=self.logging_group)
-
-        # Get a list of converted images
-        pnms = []
-        for f in os.listdir(self.tempdir):
-            if f.endswith(".pnm"):
-                pnms.append(os.path.join(self.tempdir, f))
-
-        self.log("debug", f"Running unpaper on {len(pnms)} pages...")
-
-        # Run unpaper in parallel on converted images
-        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
-            pnms = pool.map(run_unpaper, pnms)
-
-        return sorted(filter(lambda __: os.path.isfile(__), pnms))
-
-    def _guess_language(self, text):
-        try:
-            guess = langdetect.detect(text)
-            return guess
+            with Image.open(image) as im:
+                x, y = im.info['dpi']
+                return x
        except Exception as e:
-            self.log('warning', f"Language detection failed with: {e}")
+            self.log(
+                'warning',
+                f"Error while getting DPI from image {image}: {e}")
            return None

-    def _ocr(self, imgs, lang):
-        self.log(
-            "debug",
-            f"Performing OCR on {len(imgs)} page(s) with language {lang}")
-        with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool:
-            r = pool.map(image_to_string, itertools.product(imgs, [lang]))
-            return r
+    def parse(self, document_path, mime_type):
+        """
+        Run OCRmyPDF on the document and store the results on the parser.
+
+        On success ``self.archive_path`` is the PDF written to
+        ``self.tempdir`` and ``self.text`` is the text extracted from it.
+        In ``skip_noarchive`` mode a document that already contains more
+        than 50 characters of text is not processed: only ``self.text`` is
+        set and no archive file is produced.
+
+        Raises ParseError if an image carries no DPI information and
+        OCR_IMAGE_DPI is unset, if OCRmyPDF rejects the input and the
+        document has no text of its own, or on any other OCRmyPDF failure.
+        """
+        if settings.OCR_MODE == "skip_noarchive":
+            text = get_text_from_pdf(document_path)
+            if text and len(text) > 50:
+                self.text = text
+                return

-    def _complete_ocr_default_language(self,
-                                       images,
-                                       sample_page_index,
-                                       sample_page):
-        images_copy = list(images)
-        del images_copy[sample_page_index]
-        if images_copy:
-            self.log('debug', "Continuing ocr with default language.")
-            ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE)
-            ocr_pages.insert(sample_page_index, sample_page)
-            return ocr_pages
-        else:
-            return [sample_page]
+        archive_path = os.path.join(self.tempdir, "archive.pdf")
+
+        ocr_args = {
+            'input_file': document_path,
+            'output_file': archive_path,
+            'use_threads': True,
+            'jobs': settings.THREADS_PER_WORKER,
+            'language': settings.OCR_LANGUAGE,
+            'output_type': settings.OCR_OUTPUT_TYPE,
+            'progress_bar': False,
+            'clean': True
+        }
+
+        if settings.OCR_PAGES > 0:
+            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
+
+        # Map the configured mode onto the matching OCRmyPDF switch. Any
+        # other value sets none of them.
+        if settings.OCR_MODE in ['skip', 'skip_noarchive']:
+            ocr_args['skip_text'] = True
+        elif settings.OCR_MODE == 'redo':
+            ocr_args['redo_ocr'] = True
+        elif settings.OCR_MODE == 'force':
+            ocr_args['force_ocr'] = True
+
+        if self.is_image(mime_type):
+            # Prefer the DPI stored in the image; fall back to the setting.
+            dpi = self.get_dpi(document_path)
+            if dpi:
+                self.log(
+                    "debug",
+                    f"Detected DPI for image {document_path}: {dpi}"
+                )
+                ocr_args['image_dpi'] = dpi
+            elif settings.OCR_IMAGE_DPI:
+                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
+            else:
+                raise ParseError(
+                    f"Cannot produce archive PDF for image {document_path}, "
+                    f"no DPI information is present in this image and "
+                    f"OCR_IMAGE_DPI is not set.")
+
+        if settings.OCR_USER_ARGS:
+            # User supplied arguments (a JSON object) override the defaults.
+            try:
+                user_args = json.loads(settings.OCR_USER_ARGS)
+                ocr_args = {**ocr_args, **user_args}
+            except Exception as e:
+                self.log(
+                    "warning",
+                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
+                    f"they will not be used: {e}")
+
+        # This forces tesseract to use one core per page.
+        os.environ['OMP_THREAD_LIMIT'] = "1"
+
+        try:
+            self.log("debug",
+                     f"Calling OCRmyPDF with {str(ocr_args)}")
+            ocrmypdf.ocr(**ocr_args)
+            # success! announce results
+            self.archive_path = archive_path
+            self.text = get_text_from_pdf(archive_path)
+
+        except InputFileError as e:
+            # This happens with some PDFs when used with the redo_ocr option.
+            # This is not the end of the world, we'll just use what we already
+            # have in the document.
+            self.text = get_text_from_pdf(document_path)
+            # Also, no archived file.
+            if not self.text:
+                # However, if we don't have anything, fail:
+                raise ParseError(e) from e
+
+        except Exception as e:
+            # Anything else is probably serious.
+            raise ParseError(e) from e
+
+        if not self.text:
+            # This may happen for files that don't have any text.
+            self.log(
+                'warning',
+                f"Document {document_path} does not have any text. "
+                f"This is probably an error or you tried to add an image "
+                f"without text.")
+            self.text = ""


 def strip_excess_whitespace(text):
+    if not text:
+        return None
+
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", '', no_leading_whitespace)
-    return no_trailing_whitespace

-
-def image_to_string(args):
-    img, lang = args
-    ocr = pyocr.get_available_tools()[0]
-    with Image.open(img) as f:
-        if ocr.can_detect_orientation():
-            try:
-                orientation = ocr.detect_orientation(f, lang=lang)
-                f = f.rotate(orientation["angle"], expand=1)
-            except Exception:
-                # Rotation not possible, ignore
-                pass
-        try:
-            return ocr.image_to_string(f, lang=lang)
-        except PyocrException as e:
-            raise OCRError(e)
+    # TODO: this needs a rework
+    return no_trailing_whitespace.strip()


 def get_text_from_pdf(pdf_file):
@ -245,6 +190,9 @@ def get_text_from_pdf(pdf_file):
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
-            return ""
+            # might not be a PDF file
+            return None

-    return "\n".join(pdf)
+    text = "\n".join(pdf)
+
+    return strip_excess_whitespace(text)
--- a/src/paperless_tesseract/tests/samples/multi-page-digital.pdf
+++ b/src/paperless_tesseract/tests/samples/multi-page-digital.pdf
--- a/src/paperless_tesseract/tests/samples/multi-page-images.pdf
+++ b/src/paperless_tesseract/tests/samples/multi-page-images.pdf
--- a/src/paperless_tesseract/tests/samples/no-text-alpha.png
+++ b/src/paperless_tesseract/tests/samples/no-text-alpha.png
--- a/src/paperless_tesseract/tests/samples/simple-alpha.png
+++ b/src/paperless_tesseract/tests/samples/simple-alpha.png
--- a/src/paperless_tesseract/tests/samples/simple-digital.pdf
+++ b/src/paperless_tesseract/tests/samples/simple-digital.pdf
--- a/src/paperless_tesseract/tests/samples/simple-no-dpi.png
+++ b/src/paperless_tesseract/tests/samples/simple-no-dpi.png
--- a/src/paperless_tesseract/tests/samples/simple.png
+++ b/src/paperless_tesseract/tests/samples/simple.png
--- a/src/paperless_tesseract/tests/samples/with-form.pdf
+++ b/src/paperless_tesseract/tests/samples/with-form.pdf
--- a/src/paperless_tesseract/tests/test_date.py
+++ b/src/paperless_tesseract/tests/test_date.py
@ -1,193 +0,0 @@
-import datetime
-import os
-import shutil
-from unittest import mock
-from uuid import uuid4
-
-from dateutil import tz
-from django.conf import settings
-from django.test import TestCase, override_settings
-
-from ..parsers import RasterisedDocumentParser
-
-
-class TestDate(TestCase):
-
-    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
-    SCRATCH = "/tmp/paperless-tests-{}".format(str(uuid4())[:8])
-
-    def setUp(self):
-        os.makedirs(self.SCRATCH, exist_ok=True)
-
-    def tearDown(self):
-        shutil.rmtree(self.SCRATCH)
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_1(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = "lorem ipsum 130218 lorem ipsum"
-        self.assertEqual(document.get_date(), None)
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_2(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = "lorem ipsum 2018 lorem ipsum"
-        self.assertEqual(document.get_date(), None)
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_3(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = "lorem ipsum 20180213 lorem ipsum"
-        self.assertEqual(document.get_date(), None)
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_4(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = "lorem ipsum 13.02.2018 lorem ipsum"
-        date = document.get_date()
-        self.assertEqual(
-            date,
-            datetime.datetime(
-                2018, 2, 13, 0, 0,
-                tzinfo=tz.gettz(settings.TIME_ZONE)
-            )
-        )
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_5(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = (
-            "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem "
-            "ipsum"
-        )
-        date = document.get_date()
-        self.assertEqual(
-            date,
-            datetime.datetime(
-                2018, 2, 13, 0, 0,
-                tzinfo=tz.gettz(settings.TIME_ZONE)
-            )
-        )
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_6(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = (
-            "lorem ipsum\n"
-            "Wohnort\n"
-            "3100\n"
-            "IBAN\n"
-            "AT87 4534\n"
-            "1234\n"
-            "1234 5678\n"
-            "BIC\n"
-            "lorem ipsum"
-        )
-        self.assertEqual(document.get_date(), None)
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_7(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = (
-            "lorem ipsum\n"
-            "März 2019\n"
-            "lorem ipsum"
-        )
-        date = document.get_date()
-        self.assertEqual(
-            date,
-            datetime.datetime(
-                2019, 3, 1, 0, 0,
-                tzinfo=tz.gettz(settings.TIME_ZONE)
-            )
-        )
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_8(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = (
-            "lorem ipsum\n"
-            "Wohnort\n"
-            "3100\n"
-            "IBAN\n"
-            "AT87 4534\n"
-            "1234\n"
-            "1234 5678\n"
-            "BIC\n"
-            "lorem ipsum\n"
-            "März 2020"
-        )
-        self.assertEqual(
-            document.get_date(),
-            datetime.datetime(
-                2020, 3, 1, 0, 0,
-                tzinfo=tz.gettz(settings.TIME_ZONE)
-            )
-        )
-
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_date_format_9(self):
-        input_file = os.path.join(self.SAMPLE_FILES, "")
-        document = RasterisedDocumentParser(input_file, None)
-        document._text = (
-            "lorem ipsum\n"
-            "27. Nullmonth 2020\n"
-            "März 2020\n"
-            "lorem ipsum"
-        )
-        self.assertEqual(
-            document.get_date(),
-            datetime.datetime(
-                2020, 3, 1, 0, 0,
-                tzinfo=tz.gettz(settings.TIME_ZONE)
-            )
-        )
-
-    @mock.patch(
-        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
-        return_value="01-07-0590 00:00:00"
-    )
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_crazy_date_past(self, *args):
-        document = RasterisedDocumentParser("/dev/null", None)
-        document.get_text()
-        self.assertIsNone(document.get_date())
-
-    @mock.patch(
-        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
-        return_value="01-07-2350 00:00:00"
-    )
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_crazy_date_future(self, *args):
-        document = RasterisedDocumentParser("/dev/null", None)
-        document.get_text()
-        self.assertIsNone(document.get_date())
-
-    @mock.patch(
-        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
-        return_value="20 408000l 2475"
-    )
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_crazy_date_with_spaces(self, *args):
-        document = RasterisedDocumentParser("/dev/null", None)
-        document.get_text()
-        self.assertIsNone(document.get_date())
-
-    @mock.patch(
-        "paperless_tesseract.parsers.RasterisedDocumentParser.get_text",
-        return_value="No date in here"
-    )
-    @override_settings(FILENAME_DATE_ORDER="YMD")
-    @override_settings(SCRATCH_DIR=SCRATCH)
-    def test_filename_date_parse_invalid(self, *args):
-        document = RasterisedDocumentParser("/tmp/20 408000l 2475 - test.pdf", None)
-        document.get_text()
-        self.assertIsNone(document.get_date())
--- a/src/paperless_tesseract/tests/test_ocr.py
+++ b/src/paperless_tesseract/tests/test_ocr.py
@ -1,76 +0,0 @@
-import os
-from unittest import mock, skipIf
-
-import pyocr
-from django.test import TestCase
-from pyocr.libtesseract.tesseract_raw import \
-    TesseractError as OtherTesseractError
-
-from ..parsers import image_to_string, strip_excess_whitespace
-
-
-class FakeTesseract(object):
-
-    @staticmethod
-    def can_detect_orientation():
-        return True
-
-    @staticmethod
-    def detect_orientation(file_handle, lang):
-        raise OtherTesseractError("arbitrary status", "message")
-
-    @staticmethod
-    def image_to_string(file_handle, lang):
-        return "This is test text"
-
-
-class FakePyOcr(object):
-
-    @staticmethod
-    def get_available_tools():
-        return [FakeTesseract]
-
-
-class TestOCR(TestCase):
-
-    text_cases = [
-        ("simple     string", "simple string"),
-        (
-            "simple    newline\n   testing string",
-            "simple newline\ntesting string"
-        ),
-        (
-            "utf-8   строка с пробелами в конце  ",
-            "utf-8 строка с пробелами в конце"
-        )
-    ]
-
-    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
-    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
-
-    def test_strip_excess_whitespace(self):
-        for source, result in self.text_cases:
-            actual_result = strip_excess_whitespace(source)
-            self.assertEqual(
-                result,
-                actual_result,
-                "strip_exceess_whitespace({}) != '{}', but '{}'".format(
-                    source,
-                    result,
-                    actual_result
-                )
-            )
-
-    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
-    @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
-    def test_image_to_string_with_text_free_page(self):
-        """
-        This test is sort of silly, since it's really just reproducing an odd
-        exception thrown by pyocr when it encounters a page with no text.
-        Actually running this test against an installation of Tesseract results
-        in a segmentation fault rooted somewhere deep inside pyocr where I
-        don't care to dig.  Regardless, if you run the consumer normally,
-        text-free pages are now handled correctly so long as we work around
-        this weird exception.
-        """
-        image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"])
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@ -1,46 +1,17 @@
 import os
-import shutil
-import tempfile
 import uuid
 from typing import ContextManager
 from unittest import mock

 from django.test import TestCase, override_settings
-from pyocr.error import TesseractError

 from documents.parsers import ParseError, run_convert
-from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError
+from documents.tests.utils import DirectoriesMixin
+from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, strip_excess_whitespace

 image_to_string_calls = []


-class FakeTesseract(object):
-
-    @staticmethod
-    def can_detect_orientation():
-        return True
-
-    @staticmethod
-    def detect_orientation(file_handle, lang):
-        raise TesseractError("arbitrary status", "message")
-
-    @staticmethod
-    def get_available_languages():
-        return ['eng', 'deu']
-
-    @staticmethod
-    def image_to_string(file_handle, lang):
-        image_to_string_calls.append((file_handle.name, lang))
-        return file_handle.read()
-
-
-class FakePyOcr(object):
-
-    @staticmethod
-    def get_available_tools():
-        return [FakeTesseract]
-
-
 def fake_convert(input_file, output_file, **kwargs):
    with open(input_file) as f:
        lines = f.readlines()
@ -50,12 +21,6 @@ def fake_convert(input_file, output_file, **kwargs):
            f2.write(line.strip())


-def fake_unpaper(pnm):
-    output = pnm + ".unpaper.pnm"
-    shutil.copy(pnm, output)
-    return output
-
-
 class FakeImageFile(ContextManager):
    def __init__(self, fname):
        self.fname = fname
@ -67,142 +32,50 @@ class FakeImageFile(ContextManager):
        return os.path.basename(self.fname)


-fake_image = FakeImageFile


-@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr)
-@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert)
-@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper)
-@mock.patch("paperless_tesseract.parsers.Image.open", open)
-class TestRasterisedDocumentParser(TestCase):
+class TestParser(DirectoriesMixin, TestCase):

-    def setUp(self):
-        self.scratch = tempfile.mkdtemp()
+    def assertContainsStrings(self, content, strings):
+        # Asserts that all strings appear in content, in the given order.
+        # Presence is checked first so that a missing string is reported as
+        # a regular test failure naming that string, instead of surfacing
+        # as the ValueError raised by str.index().
+        for s in strings:
+            self.assertIn(s, content)
+        indices = [content.index(s) for s in strings]
+        self.assertListEqual(indices, sorted(indices))

-        global image_to_string_calls
+    # (input, expected) pairs consumed by test_strip_excess_whitespace.
+    text_cases = [
+        ("simple     string", "simple string"),
+        (
+            "simple    newline\n   testing string",
+            "simple newline\ntesting string"
+        ),
+        (
+            "utf-8   строка с пробелами в конце  ",
+            "utf-8 строка с пробелами в конце"
+        )
+    ]

-        image_to_string_calls = []
-
-        override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable()
-
-    def tearDown(self):
-        shutil.rmtree(self.scratch)
-
-    def get_input_file(self, pages):
-        _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch)
-        with open(fname, "w") as f:
-            f.writelines([f"line {p}\n" for p in range(pages)])
-        return fname
-
-    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
-    def test_parse_text_simple_language_match(self):
-        parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4())
-        text = parser.get_text()
-        self.assertEqual(text, "line 0")
-
-        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"])
-
-    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
-    def test_parse_text_2_pages(self):
-        parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4())
-        text = parser.get_text()
-        self.assertEqual(text, "line 0 line 1")
-
-        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"])
-
-    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en")
-    def test_parse_text_3_pages(self):
-        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
-        text = parser.get_text()
-        self.assertEqual(text, "line 0 line 1 line 2")
-
-        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])
-
-    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None)
-    def test_parse_text_lang_detect_failed(self):
-        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
-        text = parser.get_text()
-        self.assertEqual(text, "line 0 line 1 line 2")
-
-        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"])
-
-    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it")
-    def test_parse_text_lang_not_installed(self):
-        parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4())
-        text = parser.get_text()
-        self.assertEqual(text, "line 0 line 1 line 2 line 3")
-
-        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"])
-
-    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
-    def test_parse_text_lang_mismatch(self):
-        parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4())
-        text = parser.get_text()
-        self.assertEqual(text, "line 0 line 1 line 2")
-
-        self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"])
-
-    @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de")
-    def test_parse_empty_doc(self):
-        parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4())
-        try:
-            parser.get_text()
-        except ParseError as e:
-            self.assertEqual("Empty document, nothing to do.", str(e))
-        else:
-            self.fail("Should raise exception")
-
-
-class TestAuxilliaryFunctions(TestCase):
-
-    def setUp(self):
-        self.scratch = tempfile.mkdtemp()
-
-        override_settings(SCRATCH_DIR=self.scratch).enable()
-
-    def tearDown(self):
-        shutil.rmtree(self.scratch)
+    def test_strip_excess_whitespace(self):
+        # Every entry of text_cases is an (input, expected) pair; the failure
+        # message names the function under test and shows both values.
+        for source, expected in self.text_cases:
+            actual = strip_excess_whitespace(source)
+            self.assertEqual(
+                expected,
+                actual,
+                "strip_excess_whitespace({}) != '{}', but '{}'".format(
+                    source,
+                    expected,
+                    actual
+                )
+            )

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def test_get_text_from_pdf(self):
-        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.pdf'))
+        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'))

-        self.assertEqual(text.strip(), "This is a test document.")
-
-    def test_get_text_from_pdf_error(self):
-        text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png'))
-
-        self.assertEqual(text.strip(), "")
-
-    def test_image_to_string(self):
-        text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng"))
-
-        self.assertEqual(text, "This is a test document.")
-
-    def test_image_to_string_language_unavailable(self):
-        try:
-            image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita"))
-        except OCRError as e:
-            self.assertTrue("Failed loading language" in str(e))
-        else:
-            self.fail("Should raise exception")
-
-    @override_settings(OCR_ALWAYS=False)
-    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf")
-    @mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale")
-    def test_is_ocred(self, m2, m):
-        parser = RasterisedDocumentParser("", uuid.uuid4())
-        m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \
-                         "lots of text lots of text lots of text lots of text lots of text lots of text " \
-                         "lots of text lots of text lots of text lots of text lots of text lots of text "
-        parser.get_text()
-        self.assertEqual(m.call_count, 2)
-        self.assertEqual(m2.call_count, 0)
+        self.assertContainsStrings(text.strip(), ["This is a test document."])

    def test_thumbnail(self):
-        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
-        parser.get_thumbnail()
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
        # dont really know how to test it, just call it and assert that it does not raise anything.

    @mock.patch("paperless_tesseract.parsers.run_convert")
@ -216,6 +89,161 @@ class TestAuxilliaryFunctions(TestCase):

        m.side_effect = call_convert

-        parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4())
-        parser.get_thumbnail()
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf")
        # dont really know how to test it, just call it and assert that it does not raise anything.
+
+    def test_get_dpi(self):
+        # get_dpi is expected to return None for the sample without DPI
+        # data and 72 for simple.png.
+        parser = RasterisedDocumentParser(None)
+
+        dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
+        # assertIsNone checks identity and reports a clearer failure than
+        # comparing against None with assertEqual.
+        self.assertIsNone(dpi)
+
+        dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png"))
+        self.assertEqual(dpi, 72)
+
+    def test_simple_digital(self):
+        # Parsing the digital sample PDF is expected to leave an archive file
+        # on disk and to expose the document text.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "simple-digital.pdf")
+
+        parser.parse(sample, "application/pdf")
+
+        archive = parser.archive_path
+        self.assertTrue(os.path.isfile(archive))
+
+        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
+
+    def test_with_form(self):
+        # The PDF with a form is expected to be archived and to expose both
+        # text fragments in order.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "with-form.pdf")
+
+        parser.parse(sample, "application/pdf")
+
+        self.assertTrue(os.path.isfile(parser.archive_path))
+
+        expected = ["Please enter your name in here:", "This is a PDF document with a form."]
+        self.assertContainsStrings(parser.get_text(), expected)
+
+    @override_settings(OCR_MODE="redo")
+    def test_with_form_error(self):
+        # In "redo" mode the form PDF is expected to end up without an
+        # archive file while its text is still available.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "with-form.pdf")
+
+        parser.parse(sample, "application/pdf")
+
+        self.assertIsNone(parser.archive_path)
+        expected = ["Please enter your name in here:", "This is a PDF document with a form."]
+        self.assertContainsStrings(parser.get_text(), expected)
+
+    @override_settings(OCR_MODE="redo")
+    @mock.patch("paperless_tesseract.parsers.get_text_from_pdf", lambda _: None)
+    def test_with_form_error_notext(self):
+        # With get_text_from_pdf patched to return None, parsing the form
+        # PDF in "redo" mode is expected to raise ParseError.
+        parser = RasterisedDocumentParser(None)
+
+        with self.assertRaises(ParseError):
+            parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf")
+
+    @override_settings(OCR_MODE="force")
+    def test_with_form_force(self):
+        # In "force" mode the text of the form PDF is expected to contain
+        # both fragments in order.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "with-form.pdf")
+
+        parser.parse(sample, "application/pdf")
+
+        expected = ["Please enter your name in here:", "This is a PDF document with a form."]
+        self.assertContainsStrings(parser.get_text(), expected)
+
+    def test_image_simple(self):
+        # Parsing the PNG sample is expected to produce an archive file and
+        # the document text.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "simple.png")
+
+        parser.parse(sample, "image/png")
+
+        archive = parser.archive_path
+        self.assertTrue(os.path.isfile(archive))
+
+        self.assertContainsStrings(parser.get_text(), ["This is a test document."])
+
+    def test_image_simple_alpha_fail(self):
+        # Parsing the simple-alpha.png sample is expected to raise ParseError.
+        parser = RasterisedDocumentParser(None)
+
+        with self.assertRaises(ParseError):
+            parser.parse(os.path.join(self.SAMPLE_FILES, "simple-alpha.png"), "image/png")
+
+
+    def test_image_no_dpi_fail(self):
+        # Parsing the simple-no-dpi.png sample is expected to raise
+        # ParseError here.
+        parser = RasterisedDocumentParser(None)
+
+        with self.assertRaises(ParseError):
+            parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
+
+    @override_settings(OCR_IMAGE_DPI=72)
+    def test_image_no_dpi_default(self):
+        # With OCR_IMAGE_DPI set to 72, the simple-no-dpi.png sample is
+        # expected to parse and produce an archive file.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")
+
+        parser.parse(sample, "image/png")
+
+        self.assertTrue(os.path.isfile(parser.archive_path))
+
+        # The text is compared case-insensitively.
+        lowered = parser.get_text().lower()
+        self.assertContainsStrings(lowered, ["this is a test document."])
+
+    def test_multi_page(self):
+        # All three page markers of the digital multi-page PDF are expected
+        # in the text, in order.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
+        parser.parse(sample, "application/pdf")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        lowered = parser.get_text().lower()
+        self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
+    def test_multi_page_pages_skip(self):
+        # With OCR_PAGES=2 in "skip" mode, the digital PDF is still expected
+        # to yield the text of all three pages.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
+        parser.parse(sample, "application/pdf")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        lowered = parser.get_text().lower()
+        self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
+    def test_multi_page_pages_redo(self):
+        # With OCR_PAGES=2 in "redo" mode, the digital PDF is still expected
+        # to yield the text of all three pages.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
+        parser.parse(sample, "application/pdf")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        lowered = parser.get_text().lower()
+        self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="force")
+    def test_multi_page_pages_force(self):
+        # With OCR_PAGES=2 in "force" mode, the digital PDF is still expected
+        # to yield the text of all three pages.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
+        parser.parse(sample, "application/pdf")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        lowered = parser.get_text().lower()
+        self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
+
+    # The setting name must be OCR_MODE; a misspelled key would only define
+    # an unused setting and leave the OCR mode untouched.
+    @override_settings(OCR_MODE="skip")
+    def test_multi_page_analog_pages_skip(self):
+        # In "skip" mode the image-only multi-page PDF is expected to be
+        # archived and to yield the text of all three pages.
+        parser = RasterisedDocumentParser(None)
+        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
+
+    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
+    def test_multi_page_analog_pages_redo(self):
+        # With OCR_PAGES=2 in "redo" mode, only the first two page markers
+        # of the image-only PDF are expected in the text.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
+        parser.parse(sample, "application/pdf")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
+        self.assertNotIn("page 3", parser.get_text().lower())
+
+    @override_settings(OCR_PAGES=1, OCR_MODE="force")
+    def test_multi_page_analog_pages_force(self):
+        # With OCR_PAGES=1 in "force" mode, only the first page marker of
+        # the image-only PDF is expected in the text.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf")
+        parser.parse(sample, "application/pdf")
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
+        self.assertNotIn("page 2", parser.get_text().lower())
+        self.assertNotIn("page 3", parser.get_text().lower())
+
+    @override_settings(OCR_MODE="skip_noarchive")
+    def test_skip_noarchive_withtext(self):
+        # In "skip_noarchive" mode the digital PDF is expected to yield no
+        # archive file while the text of all three pages is available.
+        parser = RasterisedDocumentParser(None)
+        sample = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
+        parser.parse(sample, "application/pdf")
+        self.assertIsNone(parser.archive_path)
+        lowered = parser.get_text().lower()
+        self.assertContainsStrings(lowered, ["page 1", "page 2", "page 3"])
+
+    @override_settings(OCR_MODE="skip_noarchive")
+    def test_skip_noarchive_notext(self):
+        # In "skip_noarchive" mode the image-only PDF is expected to be
+        # archived and to yield the text of all three pages.
+        parser = RasterisedDocumentParser(None)
+        parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf")
+        # os.path.isfile verifies the archive exists on disk; os.path.join
+        # on a single path just returns it, which is truthy for any
+        # non-empty string.
+        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"])
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@ -11,11 +11,7 @@ class TextDocumentParser(DocumentParser):
    This parser directly parses a text document (.txt, .md, or .csv)
    """

-    def __init__(self, path, logging_group):
-        super().__init__(path, logging_group)
-        self._text = None
-
-    def get_thumbnail(self):
+    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a text file is just a 500px wide image of the text
        rendered onto a letter-sized page.
@ -46,7 +42,7 @@ class TextDocumentParser(DocumentParser):
            )

        def read_text():
-            with open(self.document_path, 'r') as src:
+            with open(document_path, 'r') as src:
                lines = [line.strip() for line in src.readlines()]
                text = "\n".join([line for line in lines[:n_lines]])
                return text.replace('"', "'")
@ -76,15 +72,9 @@ class TextDocumentParser(DocumentParser):

        return out_path

-    def get_text(self):
-
-        if self._text is not None:
-            return self._text
-
-        with open(self.document_path, 'r') as f:
-            self._text = f.read()
-
-        return self._text
+    def parse(self, document_path, mime_type):
+        """
+        Read the file at document_path in text mode and store its whole
+        content in self.text. The mime_type argument is not used here.
+        """
+        with open(document_path) as src:
+            self.text = src.read()


 def run_command(*args):