From 28cd246d48b3c349ad986f0e29b980fefe66640f Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:45:21 +0100 Subject: [PATCH 001/121] added archive directory. --- src/documents/consumer.py | 1 + src/paperless/settings.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 65febc937..b273d331d 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -65,6 +65,7 @@ class Consumer(LoggingMixin): os.makedirs(settings.SCRATCH_DIR, exist_ok=True) os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) + os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) def try_consume_file(self, path, diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 1432dc5ec..66f9fee4b 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") +ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) From 17b62b61fa60cf0dd93b51fe4472f51297d0a53e Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:47:01 +0100 Subject: [PATCH 002/121] add support for archive files. 
--- src/documents/consumer.py | 17 +++++++++++++---- src/documents/models.py | 13 +++++++++++++ src/documents/parsers.py | 3 +++ src/documents/signals/handlers.py | 16 +++++++++++----- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b273d331d..b6a0a5912 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -134,6 +134,7 @@ class Consumer(LoggingMixin): self.log("debug", "Parsing {}...".format(self.filename)) text = document_parser.get_text() date = document_parser.get_date() + archive_path = document_parser.get_archive_path() except ParseError as e: document_parser.cleanup() raise ConsumerError(e) @@ -178,8 +179,16 @@ class Consumer(LoggingMixin): # place. If this fails, we'll also rollback the transaction. create_source_path_directory(document.source_path) - self._write(document, self.path, document.source_path) - self._write(document, thumbnail, document.thumbnail_path) + + self._write(document.storage_type, + self.path, document.source_path) + + self._write(document.storage_type, + thumbnail, document.thumbnail_path) + + if archive_path and os.path.isfile(archive_path): + self._write(Document.STORAGE_TYPE_UNENCRYPTED, + archive_path, document.archive_path) # Delete the file only if it was successfully consumed self.log("debug", "Deleting file {}".format(self.path)) @@ -258,10 +267,10 @@ class Consumer(LoggingMixin): for tag_id in self.override_tag_ids: document.tags.add(Tag.objects.get(pk=tag_id)) - def _write(self, document, source, target): + def _write(self, storage_type, source, target): with open(source, "rb") as read_file: with open(target, "wb") as write_file: - if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: + if storage_type == Document.STORAGE_TYPE_UNENCRYPTED: write_file.write(read_file.read()) return self.log("debug", "Encrypting") diff --git a/src/documents/models.py b/src/documents/models.py index 8e0435647..c1ab9a44d 100755 --- 
a/src/documents/models.py +++ b/src/documents/models.py @@ -224,6 +224,19 @@ class Document(models.Model): def source_file(self): return open(self.source_path, "rb") + @property + def archive_path(self): + fname = "{:07}{}".format(self.pk, ".pdf") + + return os.path.join( + settings.ARCHIVE_DIR, + fname + ) + + @property + def archive_file(self): + return open(self.archive_path, "rb") + @property def file_name(self): return slugify(str(self)) + self.file_type diff --git a/src/documents/parsers.py b/src/documents/parsers.py index eb8ccf45e..3ad60dccd 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -141,6 +141,9 @@ class DocumentParser(LoggingMixin): self.tempdir = tempfile.mkdtemp( prefix="paperless-", dir=settings.SCRATCH_DIR) + def get_archive_path(self): + return None + def get_thumbnail(self): """ Returns the path to a file we can use as a thumbnail for this document. diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index f83f88783..9672b884b 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -168,11 +168,17 @@ def run_post_consume_script(sender, document, **kwargs): @receiver(models.signals.post_delete, sender=Document) def cleanup_document_deletion(sender, instance, using, **kwargs): - for f in (instance.source_path, instance.thumbnail_path): - try: - os.unlink(f) - except FileNotFoundError: - pass # The file's already gone, so we're cool with it. 
+ for f in (instance.source_path, + instance.archive_path, + instance.thumbnail_path): + if os.path.isfile(f): + try: + os.unlink(f) + except OSError as e: + logging.getLogger(__name__).warning( + f"While deleting document {instance.file_name}, the file " + f"{f} could not be deleted: {e}" + ) delete_empty_directories(os.path.dirname(instance.source_path)) From ac6c72a6c92295daa1902f6e612eeac0c082d701 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:48:36 +0100 Subject: [PATCH 003/121] api serves archive files by default. --- src/documents/views.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/documents/views.py b/src/documents/views.py index 14323e933..5c8a0d9b9 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,3 +1,5 @@ +import os + from django.db.models import Count, Max from django.http import HttpResponse, HttpResponseBadRequest, Http404 from django.views.decorators.cache import cache_control @@ -126,15 +128,25 @@ class DocumentViewSet(RetrieveModelMixin, index.remove_document_from_index(self.get_object()) return super(DocumentViewSet, self).destroy(request, *args, **kwargs) - def file_response(self, pk, disposition): - doc = Document.objects.get(id=pk) + @staticmethod + def original_requested(request): + return ( + 'original' in request.query_params and + request.query_params['original'] == 'true' + ) - if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: + def file_response(self, pk, request, disposition): + doc = Document.objects.get(id=pk) + mime_type = doc.mime_type + if not self.original_requested(request) and os.path.isfile(doc.archive_path): + file_handle = doc.archive_file + mime_type = 'application/pdf' + elif doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: file_handle = doc.source_file else: file_handle = GnuPG.decrypted(doc.source_file) - response = HttpResponse(file_handle, content_type=doc.mime_type) + response = HttpResponse(file_handle, 
content_type=mime_type) response["Content-Disposition"] = '{}; filename="{}"'.format( disposition, doc.file_name) return response @@ -152,7 +164,8 @@ class DocumentViewSet(RetrieveModelMixin, @action(methods=['get'], detail=True) def preview(self, request, pk=None): try: - response = self.file_response(pk, "inline") + response = self.file_response( + pk, request, "inline") return response except FileNotFoundError: raise Http404("Document source file does not exist") @@ -169,7 +182,8 @@ class DocumentViewSet(RetrieveModelMixin, @action(methods=['get'], detail=True) def download(self, request, pk=None): try: - return self.file_response(pk, "attachment") + return self.file_response( + pk, request, "attachment") except FileNotFoundError: raise Http404("Document source file does not exist") From 15935ab61f89bb52bc5700e61a818a254f4d87b1 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:50:43 +0100 Subject: [PATCH 004/121] reworked PDF parser that uses OCRmyPDF and produces archive files. 
--- Pipfile | 2 +- Pipfile.lock | 292 +++++++++++++++++++++++++++-- docs/configuration.rst | 34 +++- paperless.conf.example | 3 +- src/documents/parsers.py | 17 -- src/paperless/settings.py | 8 +- src/paperless_tesseract/parsers.py | 204 ++++++-------------- 7 files changed, 374 insertions(+), 186 deletions(-) diff --git a/Pipfile b/Pipfile index ad60e0905..079037f15 100644 --- a/Pipfile +++ b/Pipfile @@ -23,7 +23,6 @@ langdetect = "*" pdftotext = "*" pathvalidate = "*" pillow = "*" -pyocr = "~=0.7.2" python-gnupg = "*" python-dotenv = "*" python-dateutil = "*" @@ -35,6 +34,7 @@ scikit-learn="~=0.23.2" whitenoise = "~=5.2.0" watchdog = "*" whoosh="~=2.7.4" +ocrmypdf = "*" [dev-packages] coveralls = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 6ecca3c34..39c35c2d9 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244" + "sha256": "cf1c008df0080c01273c032aef59bd841e4f27b66beaf3fa459665a7a7a4fcc4" }, "pipfile-spec": 6, "requires": {}, @@ -42,6 +42,94 @@ ], "version": "==1.17.11" }, + "cffi": { + "hashes": [ + "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e", + "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d", + "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a", + "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec", + "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362", + "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668", + "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c", + "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b", + "sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654", + "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06", + "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698", + 
"sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2", + "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c", + "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7", + "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009", + "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03", + "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b", + "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909", + "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53", + "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35", + "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26", + "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b", + "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb", + "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293", + "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd", + "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d", + "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3", + "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5", + "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d", + "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca", + "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d", + "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775", + "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375", + "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b", + "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b", + "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f" + ], + "version": "==1.14.4" + }, + "chardet": { + "hashes": [ + 
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "markers": "python_version >= '3.1'", + "version": "==3.0.4" + }, + "coloredlogs": { + "hashes": [ + "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a", + "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505", + "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==14.0" + }, + "cryptography": { + "hashes": [ + "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538", + "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f", + "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d", + "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77", + "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b", + "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33", + "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e", + "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb", + "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e", + "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b", + "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7", + "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297", + "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d", + "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7", + "sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b", + "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7", + "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4", + 
"sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8", + "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b", + "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851", + "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13", + "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b", + "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3", + "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.2.1" + }, "dateparser": { "hashes": [ "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", @@ -121,6 +209,14 @@ "index": "pypi", "version": "==20.0.4" }, + "humanfriendly": { + "hashes": [ + "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12", + "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==8.2" + }, "imap-tools": { "hashes": [ "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", @@ -129,6 +225,13 @@ "index": "pypi", "version": "==0.32.0" }, + "img2pdf": { + "hashes": [ + "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5", + "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9" + ], + "version": "==0.4.0" + }, "joblib": { "hashes": [ "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72", @@ -146,6 +249,51 @@ "index": "pypi", "version": "==1.0.8" }, + "lxml": { + "hashes": [ + "sha256:098fb713b31050463751dcc694878e1d39f316b86366fb9fe3fbbe5396ac9fab", + "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b", + "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5", + 
"sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301", + "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b", + "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d", + "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b", + "sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9", + "sha256:288ddf94d9d0488187f578fdcc1868af2a6fe6714444c8259b68a83fa27b76d2", + "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b", + "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311", + "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891", + "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a", + "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1", + "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856", + "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810", + "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51", + "sha256:4e006fdb434609956a8f710ffffe650afab414dc43728786ebdbdca48e179b14", + "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360", + "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4", + "sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f", + "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230", + "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a", + "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f", + "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174", + "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf", + "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd", + "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3", + "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a", 
+ "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5", + "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367", + "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c", + "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1", + "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8", + "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f", + "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc", + "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d", + "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9", + "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==4.6.1" + }, "numpy": { "hashes": [ "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", @@ -187,6 +335,14 @@ "markers": "python_version >= '3.6'", "version": "==1.19.4" }, + "ocrmypdf": { + "hashes": [ + "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3", + "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928" + ], + "index": "pypi", + "version": "==11.3.3" + }, "pathtools": { "hashes": [ "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", @@ -202,6 +358,14 @@ "index": "pypi", "version": "==2.3.0" }, + "pdfminer.six": { + "hashes": [ + "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a", + "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509" + ], + "markers": "python_version >= '3.4'", + "version": "==20201018" + }, "pdftotext": { "hashes": [ "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" @@ -209,6 +373,33 @@ "index": "pypi", "version": "==2.1.5" }, + "pikepdf": { + "hashes": [ + 
"sha256:0dd42f791f29e7e2ab120103605b9ddd65937c773a72d21341a56873a89e76c9", + "sha256:12a1d243143cf972ce11def50f0bd1f6e630f5e660cdeddb2c7c49db5adad40a", + "sha256:2e1713af11b71e95c2d218c10d68b6f8e813be19c8596c560f3c84617f6d5437", + "sha256:2f90acad26d9939193946eb6ca8363fd3cf44b46b5c1409468906618bccb8113", + "sha256:3c482fe30fd58ff385795605a9233f37f97fb83427c3e829b1a568a2a3b59f60", + "sha256:3ddabfc33a8a7cecba76c1685ce5125fdf239a38d0854d7c2a703490b5783773", + "sha256:61dd3f13b7416111d19bf493ce4e7281f63a1dd22c532200cbbcd65813ea43e4", + "sha256:6ce42b7780835fb52452ccaff3a3ac1b28ae1f9d80faab59c559045d9fcb211d", + "sha256:6dba75782f108ebbf3947fcb29ea0ba7da0482868e53f6602643adc36245201d", + "sha256:716427a5c0372f3cc7dc282c4b49d49d8d5182a3e937739a4c3632151e74d6a4", + "sha256:730ef4013099da7ea722a9b5659260097af6f47ddfa3c2abab4d4493de2591f3", + "sha256:73e14bba4135adfb89ae2f2163369bd788ecf23839acc8d062d832118f07e288", + "sha256:84df07acc8968051da33891af55a3ab1aa55453d83df4ce9b84d821eedc34583", + "sha256:8f739e9c660d71cd479f11f9aa110857cf0d0d9c2472f40bbcbaf02f980355a1", + "sha256:a20ca7adbb9d3da416cf5f6de0ebca53855f9a3b99acdd6ec864c61482894d71", + "sha256:bc58d9486c0959619a2584e558a54d36468c6d1165cd9fe0bfb1ecc3e6b33c6a", + "sha256:c0627930a17b3a5e1a7c9109099535259afc50fe006a05af9c3634de05abd318", + "sha256:de5f445eaaadd7dae56e1043ab8ca5eef49ece302a4e37e1fc6d21b7dcfcfb1b", + "sha256:de6aae7782db33f2cc71c9ba63b7e2ec0e0529843c065eac4e71fcbe043426e2", + "sha256:e2efd844c09f8ce3103a93bfbd54983542a0a63c88bdc0f0cdbb2997f99a147d", + "sha256:fdb481ad1219e8d667625afd2f01b26f98df079e4f66e7e49816ec20c8d8c401" + ], + "markers": "python_version < '3.9'", + "version": "==2.1.2" + }, "pillow": { "hashes": [ "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", @@ -244,6 +435,14 @@ "index": "pypi", "version": "==8.0.1" }, + "pluggy": { + "hashes": [ + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + 
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.13.1" + }, "psycopg2-binary": { "hashes": [ "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", @@ -287,13 +486,13 @@ "index": "pypi", "version": "==2.8.6" }, - "pyocr": { + "pycparser": { "hashes": [ - "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179", - "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301" + "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", + "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" ], - "index": "pypi", - "version": "==0.7.2" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.20" }, "python-dateutil": { "hashes": [ @@ -401,6 +600,53 @@ ], "version": "==2020.11.13" }, + "reportlab": { + "hashes": [ + "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540", + "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c", + "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030", + "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f", + "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5", + "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e", + "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89", + "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2", + "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f", + "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969", + "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53", + "sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d", + "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d", + 
"sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457", + "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec", + "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f", + "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778", + "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92", + "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6", + "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077", + "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673", + "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b", + "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48", + "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95", + "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94", + "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589", + "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9", + "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8", + "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d", + "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32", + "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b", + "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a", + "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2", + "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967", + "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5", + "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437", + "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8", + "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab", + "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e", 
+ "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a", + "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8", + "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08" + ], + "version": "==3.5.55" + }, "scikit-learn": { "hashes": [ "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", @@ -464,6 +710,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.15.0" }, + "sortedcontainers": { + "hashes": [ + "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f", + "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1" + ], + "version": "==2.3.0" + }, "sqlparse": { "hashes": [ "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", @@ -480,6 +733,14 @@ "markers": "python_version >= '3.5'", "version": "==2.1.0" }, + "tqdm": { + "hashes": [ + "sha256:3d3f1470d26642e88bd3f73353cb6ff4c51ef7d5d7efef763238f4bc1f7e4e81", + "sha256:5ff3f5232b19fa4c5531641e480b7fad4598819f708a32eb815e6ea41c5fa313" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.53.0" + }, "tzlocal": { "hashes": [ "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", @@ -489,11 +750,11 @@ }, "watchdog": { "hashes": [ - "sha256:034c85530b647486e8c8477410fe79476511282658f2ce496f97106d9e5acfb8", - "sha256:4214e1379d128b0588021880ccaf40317ee156d4603ac388b9adcf29165e0c04" + "sha256:3caefdcc8f06a57fdc5ef2d22aa7c0bfda4f55e71a0bee74cbf3176d97536ef3", + "sha256:e38bffc89b15bafe2a131f0e1c74924cf07dcec020c2e0a26cccd208831fcd43" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.10.4" }, "wcwidth": { "hashes": [ @@ -571,6 +832,7 @@ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" ], + "markers": "python_version >= '3.1'", "version": "==3.0.4" }, 
"coverage": { @@ -663,11 +925,11 @@ }, "faker": { "hashes": [ - "sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81", - "sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473" + "sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997", + "sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e" ], "markers": "python_version >= '3.5'", - "version": "==4.17.0" + "version": "==4.17.1" }, "filelock": { "hashes": [ @@ -999,11 +1261,11 @@ }, "virtualenv": { "hashes": [ - "sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2", - "sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380" + "sha256:07cff122e9d343140366055f31be4dcd61fd598c69d11cd33a9d9c8df4546dd7", + "sha256:e0aac7525e880a429764cefd3aaaff54afb5d9f25c82627563603f5d7de5a6e5" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.1.0" + "version": "==20.2.1" } } } diff --git a/docs/configuration.rst b/docs/configuration.rst index c3f01c2ca..ad1c7c117 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -218,11 +218,37 @@ PAPERLESS_OCR_LANGUAGE= Defaults to "eng". -PAPERLESS_OCR_ALWAYS= - By default Paperless does not OCR a document if the text can be retrieved from - the document directly. Set to true to always OCR documents. +PAPERLESS_OCR_MODE= + Tell paperless when and how to perform ocr on your documents. Three modes + are available: - Defaults to false. + * ``skip``: Paperless skips all pages and will perform ocr only on pages + where no text is present. This is the safest and fastest option. + * ``redo``: Paperless will OCR all pages of your documents and attempt to + replace any existing text layers with new text. This will be useful for + documents from scanners that already performed OCR with insufficient + results. It will also perform OCR on purely digital documents. 
+ + This option may fail on some documents that have features that cannot + be removed, such as forms. In this case, the text from the document is + used instead. + * ``force``: Paperless rasterizes your documents, converting any text + into images and puts the OCRed text on top. This works for all documents, + however, the resulting document may be significantly larger and text + won't appear as sharp when zoomed in. + + The default is ``skip``, which only performs OCR when necessary. + +PAPERLESS_OCR_OUTPUT_TYPE= + Specify the type of PDF documents that paperless should produce. + + * ``pdf``: Modify the PDF document as little as possible. + * ``pdfa``: Convert PDF documents into PDF/A documents, which is a + subset of the entire PDF specification and meant for storing + documents long term. + + If not specified, ``pdfa`` is used. Remember that paperless also keeps + the original input file as well as the archived version. PAPERLESS_CONSUMER_POLLING= If paperless won't find documents added to your consume folder, it might diff --git a/paperless.conf.example b/paperless.conf.example index 4962c1567..34e560507 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -38,7 +38,8 @@ #PAPERLESS_TIME_ZONE=UTC #PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_LANGUAGE=eng -#PAPERLESS_OCR_ALWAYS=false +#PAPERLESS_OCR_OUTPUT_TYPE=pdfa +#PAPERLESS_OCR_MODE=skip #PAPERLESS_CONSUMER_POLLING=10 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false #PAPERLESS_CONVERT_MEMORY_LIMIT=0 diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 3ad60dccd..542a5dae9 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -107,23 +107,6 @@ def run_convert(input_file, raise ParseError("Convert failed at {}".format(args)) -def run_unpaper(pnm, logging_group=None): - pnm_out = pnm.replace(".pnm", ".unpaper.pnm") - - command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, - pnm_out) - - logger.debug(f"Execute: {' '.join(command_args)}", -
extra={'group': logging_group}) - - if not subprocess.Popen(command_args, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL).wait() == 0: - raise ParseError(f"Unpaper failed at {command_args}") - - return pnm_out - - class ParseError(Exception): pass diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 66f9fee4b..5cede45c4 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -338,9 +338,13 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") +# OCRmyPDF --output-type options are available. +# TODO: validate this setting. +OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") -# OCR all documents? -OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") +# skip. redo, force +# TODO: validate this. +OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b8320a4f0..8f694ef56 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,23 +1,14 @@ -import itertools import os import re import subprocess -from multiprocessing.pool import ThreadPool import langdetect +import ocrmypdf import pdftotext -import pyocr -from PIL import Image from django.conf import settings -from pyocr import PyocrException +from ocrmypdf import InputFileError -from documents.parsers import DocumentParser, ParseError, run_unpaper, \ - run_convert -from .languages import ISO639 - - -class OCRError(Exception): - pass +from documents.parsers import DocumentParser, ParseError, run_convert class RasterisedDocumentParser(DocumentParser): @@ -29,6 +20,7 @@ class RasterisedDocumentParser(DocumentParser): def __init__(self, path, logging_group): super().__init__(path, logging_group) self._text = None + 
self._archive_path = None def get_thumbnail(self): """ @@ -74,113 +66,67 @@ class RasterisedDocumentParser(DocumentParser): return out_path - def _is_ocred(self): - - # Extract text from PDF using pdftotext - text = get_text_from_pdf(self.document_path) - - # We assume, that a PDF with at least 50 characters contains text - # (so no OCR required) - return len(text) > 50 - def get_text(self): - if self._text is not None: + if self._text: return self._text - if not settings.OCR_ALWAYS and self._is_ocred(): - self.log("debug", "Skipping OCR, using Text from PDF") - self._text = get_text_from_pdf(self.document_path) - return self._text + archive_path = os.path.join(self.tempdir, "archive.pdf") - images = self._get_greyscale() + ocr_args = { + 'input_file': self.document_path, + 'output_file': archive_path, + 'use_threads': True, + 'jobs': settings.THREADS_PER_WORKER, + 'language': settings.OCR_LANGUAGE, + 'output_type': settings.OCR_OUTPUT_TYPE, + 'progress_bar': False, + 'clean': True + } - if not images: - raise ParseError("Empty document, nothing to do.") + if settings.OCR_PAGES > 0: + ocr_args['pages'] = f"1-{settings.OCR_PAGES}" + + if settings.OCR_MODE == 'skip': + ocr_args['skip_text'] = True + elif settings.OCR_MODE == 'redo': + ocr_args['redo_ocr'] = True + elif settings.OCR_MODE == 'force': + ocr_args['force_ocr'] = True try: + ocrmypdf.ocr(**ocr_args) + # success! announce that we have an archive document + self._archive_path = archive_path + self._text = get_text_from_pdf(self._archive_path) - sample_page_index = int(len(images) / 2) - self.log( - "debug", - f"Attempting language detection on page " - f"{sample_page_index + 1} of {len(images)}...") + except InputFileError as e: + # This happens with some PDFs when used with the redo_ocr option. + # This is not the end of the world, we'll just use what we already + # have in the document. + self._text = get_text_from_pdf(self.document_path) + # Also, no archived file. 
+ if not self._text: + # However, if we don't have anything, fail: + raise ParseError(e) - sample_page_text = self._ocr([images[sample_page_index]], - settings.OCR_LANGUAGE)[0] - guessed_language = self._guess_language(sample_page_text) - - if not guessed_language or guessed_language not in ISO639: - self.log("warning", "Language detection failed.") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - elif ISO639[guessed_language] == settings.OCR_LANGUAGE: - self.log( - "debug", - f"Detected language: {guessed_language} " - f"(default language)") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501 - self.log( - "warning", - f"Detected language {guessed_language} is not available " - f"on this system.") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - else: - self.log("debug", f"Detected language: {guessed_language}") - ocr_pages = self._ocr(images, ISO639[guessed_language]) - - self.log("debug", "OCR completed.") - self._text = strip_excess_whitespace(" ".join(ocr_pages)) - return self._text - - except OCRError as e: + except Exception as e: + # Anything else is probably serious. raise ParseError(e) - def _get_greyscale(self): - """ - Greyscale images are easier for Tesseract to OCR - """ + if not self._text: + # This may happen for files that don't have any text. + self.log( + 'warning', + f"Document {self.document_path} does not have any text." 
+ f"This is probably an error or you tried to add an image " + f"without text.") + return "" - # Convert PDF to multiple PNMs - input_file = self.document_path + return self._text - if settings.OCR_PAGES == 1: - input_file += "[0]" - elif settings.OCR_PAGES > 1: - input_file += f"[0-{settings.OCR_PAGES - 1}]" - - self.log( - "debug", - f"Converting document {input_file} into greyscale images") - - output_files = os.path.join(self.tempdir, "convert-%04d.pnm") - - run_convert(density=settings.CONVERT_DENSITY, - depth="8", - type="grayscale", - input_file=input_file, - output_file=output_files, - logging_group=self.logging_group) - - # Get a list of converted images - pnms = [] - for f in os.listdir(self.tempdir): - if f.endswith(".pnm"): - pnms.append(os.path.join(self.tempdir, f)) - - self.log("debug", f"Running unpaper on {len(pnms)} pages...") - - # Run unpaper in parallel on converted images - with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: - pnms = pool.map(run_unpaper, pnms) - - return sorted(filter(lambda __: os.path.isfile(__), pnms)) + def get_archive_path(self): + return self._archive_path def _guess_language(self, text): try: @@ -190,30 +136,11 @@ class RasterisedDocumentParser(DocumentParser): self.log('warning', f"Language detection failed with: {e}") return None - def _ocr(self, imgs, lang): - self.log( - "debug", - f"Performing OCR on {len(imgs)} page(s) with language {lang}") - with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: - r = pool.map(image_to_string, itertools.product(imgs, [lang])) - return r - - def _complete_ocr_default_language(self, - images, - sample_page_index, - sample_page): - images_copy = list(images) - del images_copy[sample_page_index] - if images_copy: - self.log('debug', "Continuing ocr with default language.") - ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) - ocr_pages.insert(sample_page_index, sample_page) - return ocr_pages - else: - return [sample_page] - def 
strip_excess_whitespace(text): + if not text: + return None + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) no_leading_whitespace = re.sub( r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) @@ -222,29 +149,14 @@ def strip_excess_whitespace(text): return no_trailing_whitespace -def image_to_string(args): - img, lang = args - ocr = pyocr.get_available_tools()[0] - with Image.open(img) as f: - if ocr.can_detect_orientation(): - try: - orientation = ocr.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) - except Exception: - # Rotation not possible, ignore - pass - try: - return ocr.image_to_string(f, lang=lang) - except PyocrException as e: - raise OCRError(e) - - def get_text_from_pdf(pdf_file): with open(pdf_file, "rb") as f: try: pdf = pdftotext.PDF(f) except pdftotext.Error: - return "" + return None - return "\n".join(pdf) + text = "\n".join(pdf) + + return strip_excess_whitespace(text) From a10f516fe911427852e73a1096b89b43d9ae7105 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:51:00 +0100 Subject: [PATCH 005/121] todo note. --- src/documents/file_handling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index ee7e9b761..179c97492 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -10,6 +10,7 @@ def create_source_path_directory(source_path): os.makedirs(os.path.dirname(source_path), exist_ok=True) +# TODO: also make this work for archive dir def delete_empty_directories(directory): # Go up in the directory hierarchy and try to delete all directories directory = os.path.normpath(directory) From f5656222e259f6a198ed5648a7fa853730277268 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:51:32 +0100 Subject: [PATCH 006/121] removed obsolete tests. 
--- .../tests/samples/no-text.png | Bin 32595 -> 0 bytes src/paperless_tesseract/tests/test_ocr.py | 44 +----- src/paperless_tesseract/tests/test_parser.py | 149 +----------------- 3 files changed, 3 insertions(+), 190 deletions(-) delete mode 100644 src/paperless_tesseract/tests/samples/no-text.png diff --git a/src/paperless_tesseract/tests/samples/no-text.png b/src/paperless_tesseract/tests/samples/no-text.png deleted file mode 100644 index e78b22bfbe53be5a9046bbfb32fef7e98abb9be9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32595 zcmaI7b9AIZw>O%JZQEAI$pjPIHafPAiEZ2F#I|iuJh46D&AjKF@4J88b9?pbr>fT8 zyEcBctGcS5a0NMW1Xx^H5D*XqNeK}p5Rh*)U*%nB$gkfdRpG0z2adC-hO@Grsk57* zqX~$hv7M0#v80WmnTe8#p|OXtPBs(&W6$O9}T0sjr|ul2ner$ zyS*XM%EX!2$i&RTmXGwRqlc8(!kCX#on4k$)?V1e+(N?B(L~u(P6g;`1>`m+72qf4 zb?5mKU}NHJNbGK7ZR^D2&PVzmxjbLxf6Yv!#Q!02w&ElGFQqhO6^MoH98HMX8QB?t z%q#$6PHsj3I~z9uKu^rV3}9sfFf*}mGXMZQY@9p*4&wj*kbd>%Xl%-(BqH|TeSOvV zNX?y{?Rl7(+}zw4-B=mz9L<;j+}zwu%q&bSEDT>13{D=l&W7#`woYXKBSFN(3Fv5H z?`&abOZ-owp^=@7Gau=frvGJvjs5=-YwPsiZu)W*9zm|E5lXkq)WPV*lISy>)QTPJ5jTcC-g2p{Q}2u2GFV;*5vF;*dV4t5bS zVKD$elvRk6MO2stz{MiO$t@zt$@M?HB6dI*8xvdS|KTx4{pCI z{|EUdwqI~}{KB;{o)kFBh@gu5`nfKIx5~&0pM#ql1!LTg*&uM#O@>x!2L`21 z4ehG0uR;Wdn;Sv`2uu*No7RbnkmmOC7vSy zdCAXO{@0hFp%&t@Uk4^w7WEY&a4#`80a*d?Z@Cb=Et-DB6r_By-dj<~s-PRd zf9l;djY`Hy%K-<;hj!TKF&5NggNu?{dwn_O1o{ul{pO~cf{Ica_(t};Lbx^{wh;t1 z6H*(BG%Js3j2?$2&Ck|^j=QTj&pk4;C4gUM{8N%k&?6x=v4}?7%{(O6btk_w5`g5hD|Y#DjfTQA@@98ABu#{T&`H#4t2A+?7X6kdO|XAWHSy$F?`d|^ss zp0JK@{67SVc~yNkq*xa`%LIi-i01d)jD+1o8@%>>f0?bg%>L;=Bc^bB@CV%@0x`Q9 zaJ3R1%`fhtNaJnycVS9t-nF6YzvQ5Omu#40Q6IS#F!Ni4VI*ja?rijK_lS^Cbg)jX z{}-#HgaXKfRDjYgsz#?1Bzp26fm2>i`K=?HkN!v6f2i?h444ODPeOyw6ae9nJ0x<+ zS>eAzxx|v?WJ0DnMc8Pd|LFPokofYrDwN^r#H0%ELw; z*CmZ-1b6m7o&^?D96G0=_kP_O3VK_}He|fk>h!0g~6g5r33SQNuTLHbkfw86J*~3E9n<5 z|Hh9w{9!`L#j!0|OHQ@G3%M63zVK)RbmOsQEQa57Kvkzqw2v`{3AJQ-z3%kCz#=3U zP#e-Gsl|zu=3E%sn%yfNwK<*>PM^0%#BABzEX15NU8?IzS+Ni&UwB|{(`gL+T^SLY 
zPvTL4k-Fc%o!I981oxxohklJwb3s+nllwMr)NpJ{ym3LV@gEDyaHOIL z)-$;Z;{*k4eKL<+&L%k9{M`CG5;ZK2tL+#2Z*ZL^7nC6_;B#Z(xIa2juvAGD&it}n zcO9LWFnZfGRNz#pblNU@cJ6E&G55f#r6Q6uy#o0Mq(!IDH!6^fZawSoHZ3spYDR+1?nC^^$NJiyEr9g#NJKkMR|be+sQoW& z^rOG%Tg75d5@8@X?@W%34}7?Y1FS9e#YBs!85#Za+ia^xbOdfd@@L3S4iQO3MeB5D z{(H#4PEMXf65NlB`^|b8V@ZU;-O6aD8({9t z_L}3bxk6s$`2Ngk9eCoyCF{W0+qmbu=7lZyUU|oPo~ovF@xFvqQ$72KV0z zakZ@ha%YaavP$gJrMm}~&&ERc*E;?)z4h-%W`q?ta1Rc0X*KJxUSSF#M#qbex4bD!a*q@I}iT6k3qg%*mxp6EZ z7ad~G;BY7MqSN?-rx5n$iC^;+Lu-)=s1qFUBQzmlPovI(<%gE1Bc+toM0|P%%r=yr z!|OemwProkf?AY+PCiG2Uz{Jc;)UG6+HVvzGlbAE=%Xynwlz?2QNas+9gX@H|9{}M zq@Z!20dO_%mf4h#;WX7umRr)Y)@rp^$h>Utvc4GXzx`hAxxvQ#;6SML-r9Q^^TOtH zW2V2Dd@f2=*ZKL#r?t$bFQUMfTf4f^yXOVf(f$N6tLj9Kkv{1>*+@K)Yzcb{hwBnU z#3}Ee(M`+2_yXp}#6O(Xl=uuj6sM=fqvf?Mvm|t(#bn!c_@3Ush43%s1r@Ghz$vH3_}5z&r+T_eux-A!pm@TwcbZWgTz_;zr?bt~=RcbahJk;De|81uDGIrWRo6N^vfGfFTr0;+og z=bBH-h%l)eU2dJ=c0Rd~eL~&-^rYsbNG^6}xV)IB zKLnF;5TClxJ6E&q&aUCc z#-7aH;3j4zgJV%uYC^|A-;K>h+Fw)E=W1>lSf-;hyfB{r^nR@mZ0$V4)aA*NT!(!` z{F>fRKe0%{9g9gVH45~S8yhDKXertdEu#qsc$u3fP$zZx5Z663l1CogM$S6ya0<#{ zGscMbeBfoO5G=b6*TDo>DxDPbe}y4Ni?41_gY){u`f=tCr}xz6eI8tbR_r`d_z7=| zKt*!Q{BTsN01;GdeKZ%9jIi~rb4&UH@HQujVoE-weMZI4nlo*CZ3F^YL1jeNm8gOA z_m!Q!Euo)z)XoTO`5Q8D#T3QtHFtfSRdNXhaY?A+tlGd5m%|i;Qpg(uf%oXo=Il^i zD>`p?f{44<8sO*wf1N9{xVH>xXGr@pS85X5cDZtT*Th#YdFhr2uQ^(%OSUa)%g-=t zt_nbBW5dWRBcE^&k(Le}fSCyq?K+A&*P+UFm&*fS3#Rx(+3)|Qq1 zyuFRk%I2{Gqo}$#@*6Hd!K*V`6f?&yGyDi4BChRKq%l!73C>}qj3R1I9Yg%L4q*2& zDli%%o)2*x3;%mjA;QAKrWD#1&OtxrVqjVTgCHcN>Tvb=T|}%@NJ|nnkkZAB91p`p zRLB5Ts8-Y|ew+IS+X=EJs33$FeF;6Uxb~YD67S*LznLWlMz4Rd0WQS3 zd8qpdyBu{*v+wVO5Z4^~b;8+So|a9`I_Mn($Yqv5$8$|qH_rC=Vb|C&7*^GW1l{Wl zQke;>$iVL@1q|Lj??Pc$(C&{D_;I<^o7H3B|8^j(*2!G)l+8I4%ygYHkeDXQP4h2{ z)4*^23t3OgQfIjYpA)#bGgs+S=o?uQLrbqSdHo>^U8tsXaQFxAcWR1;pT`-()y&;y zcb*~3qCJKm-(P#4?{8m|x2k}@0)YuF*OH1kIfF}>x5#?FJj2i+-xKyc$E=g|gcS08 z-H#qZIR#~P5e*sK`tea=MhXa3ADltdgnEElXq1QvVrk1az-&^osD%f2Kbdl!F@$&* z98#1RRjq52p0=D^NPN3YG#hhKEnKha5DV*_{r=W>Uhg-;o~Uhq;M9!4&S1_l&tP?5 
zkF>NrSle1#@}Nl>qjQfBUd&BBok9sM$-4t~)|*;V^qHJSxwWCmHWcj5No0%ah0}VJ8rfCo?u7IIly)-PWcJTEC^99vtz`1T z64Nw~&X4n>Yvwia@=Zfe7<1~kwV{!XNCRV--kO`&x06!oBEPGK*l%A;uOKuS-Fbb= zQpQMK^<|cHkmK9eU|~)N(;1T@QIqz>8J}`DPLDW5_Px&iu?Y@E2b8~$H7VEv^<;ZW zj&W49h~ijqAHER56z)&7_;Q%*LNj-}E^P2^-Ll_EZt85NepL&!?ZPQXzKn^esGtXn z%0L*ZBqasJ7Y`0WYmWUW5`wJ$t}#$tj8n7e9$h#>1hlP9zRvj#MP~Md#Wi#1Wmji% zX-i3GCh7s_t;|YU6faF9|al;8%%P8TLfDf+1XTkIq~pK-NRX&Ju__ z^6#H*f(k<(Ir;vv<$%1ZQ0jKMVf(9so3(By5|TV$q6owX$)K`a*r`33M68rN*uAU@ z0O*J%F)Be)se{h24qc)eAwMc2sAXCW{^yk?OI`t;#4tRKtqre%;21CJ(YE^Z-2pXW zkDB(dodntO8v^s;UpbT-x6CYW6eO?eM z`B!G8rSopOx|d#W?DVr6x20J%!;HudFiHlT5W11e-Pz&9QJTq-*>agOV%J)}2+pVN z{Ou@PY*ssB!q;pEpM$K@imMK8dPnxM9RNfpf;psV~f%< zi;C1~0Q~mSJwRpCbM2u%ku)*9lUq5iRGebA;J#73IO@Z(6iW$~31u7z`lAU?sX~mUX%2 zu0!|biE?15=yvY`2|VRC|Gu+5eY6F6QJ(@LQvI6@E*& z0~0bNt|J*~gqDC=QE#t#*6K7@j&k#>Kf#NBQr@K?_^{8*ci>G`P8`Yn&F>MxQ7K@S zRe9p=wza$u6uGD=YCHcG+ABYbXQ=%|#h?4;#) zgF|3$qIb@P+++ zt+zVI<>sJW`lgp6f|Nk2@3QOOx6IkZ7vvU^t(SQ8{IL;k(QI8?KH1C`#^N7lwV=-! 
z-o6yTo`8$$E8&EKRtd@D>9vuj%eeG-ZP|L%(C%Nf!w7DHSE1lsT(~P-!?DN;a3tV= z#39E6ip32KO9G=SLxiNe4jVL+)Ge;358{(JLW^K;VbesMA4sC(l7f=3WvpN{ccs-d zgQ_B=PE}sZGKKtvimEp<2aY<4rU^R7KC@m@-`@>bhpVw+J7UyUo325WdD&>x>LeyT zg%kkayj>hhin0bt5tNtuU^5)qc3!aN8Gpt0Q=y?_{Ib}EYk0g&N#h;Y-MMRhW_wAH zT~}>x>1IlJiG`Jpt>vqA35{FXeY*$4`!X>e@RP-w*s#hSe^ZT+zqv5geD4YB1k)O^*5tGH9@<)bxkAJPs-8Ly<8_f{Lgu zKW&&6jY>s94UEpl>ZGKef?Dd(u52mxh#X-V6S~XK_lsls)lYXa@?!E|&XZ!pUb#jC z=eq6=e-YTCROT!sye2nzaT@%x!v^>_98NE6;WIW8FAs-Z{utaifQ#C(xo|WCT9*6Y zEPh+bHD}$=^NU2#ntD|^Ab(DBMh;s!lSU6mE5rw3QnZTO$`~=w>XULGYK`875wdMn ziN0p=3tGV7C@Ai%`cJ<(K`=7N?iv)v<`v;%OR3f5B%_4W_j0OG3li|{2hy$R0q5TM z&26>4=G)YLQD#%h?|(&7ohCJQ=1usgX}ovJsna9CFXC<5VGL1}Q8NUWb55AV(sY^mo|D}u=HYO7(o~EZ++^{`A+w0x-eTLr8g-oFNhxin zaeJbb?FK<2){&EC8`~7TIxz*ag5u-txQDIH>8yNE$$n`XNpxZ&p<;;qvvG6_ zLrYIC^)v68e9R%%prO%sv$d4v0Tx(p`2s47chHwuR&@Zy(U<>g#4y}t>a zjfutTDUFblPLdXKZ6cm$9A1lxYDPiuo;TG8bUlAwYF)Hj#YZ#n!({=v`d>9#wJNMDnM)JQl>+L2L=ZiYqx%0Xr+h*Srl-MsH4nsDoxM zla=N{_k4OmS=^jx{NBL#Az)Wj8FJDQQ^O60aPdkA?TY=`S%5wn#oryFBC5x)_>In~ zD00?OW0Tm>i;j7`INpV6s4*I3?&aILnNiwIti7)U5z00ERErLe3Xc+xMC4$9EUY1#sbMxP zQ$8R>ua6ZjETXm~$Yr7+i{V~qhiekPiOZhKb+=i~c^xKKjdm|{2sZugeTtUizd@pT zUBB`}+bKlFT&Y8F2{B#%$;b1N;;~8~WH61*{KRtSxQ6w3X&_wE$H;bKZ2v%-oSwi# z(p+^yRPNVFtQ9njWbgW)!2A;WmAtFdGooO!(%gdL8eX*r=8y#R5#rjyQIe1_;+WXT z6n~!j`G%XzC8$kuT9V`p%Q%%ET<28+13_bCv^C8%N))-C(n5>QkG@Wy49Z@$Ja6Ga z#a0@UNqh4~LQD=jE&J5dK$VH=^TJu`fO^wKUefv)oQn`sN|y~HCt6u>O6x0jl7u}} z3=D{7R?DGR6%owv{)H(}TE|JSC#<%$b=AjWw}fdqBze1zs2)!I0l!zKAK_)L!Y^&a z9Xdghez&KZ%vx!d&_#cX^M=QAIdjD|G>i(NPnQFKyfvSILVmY2^n}v&M~p|o3hr#= zA{DU5=asF7kCP)WSLeTC_WjEo)J=8J{7N5l)zRynBMD9a;=`LIFi)r&SZZF-!Fx$h zPxa#2qu$F*`~Gsia|@_6@93VXWR1iDNZ8vW0ai**3+v&;gKA+H*3to=OT;RCbcoOh z6%}PDiM3^I=EBaS?tb)i`gX@&v3-%83(fnHi@gE4$UsN7j~v3smk ztttG(l7^Me8?!_!S58GF165HC#H>7~@N;ZJs&843$S&900H4!BP1*AM#ME?PL=Gv( z)HVWv6T)g^y~vXz(zn9DwN9Tx=if~Mb~Nno@gGERa*WOy%-N@{@kQ@XQM$;DZe5=y)_QT*^q z^Z0nN$>ETcZKuGKbfi%_+RPtY8450DXjTkSxce|TERfa@mh#c4D2WjWnM}y#f`pZ+ 
z@Ie9A@AHkZc3dCpesHiN$yzHL@awQmp>5Ap2PA+>WK!R`ut*u>gt0lU?lPpTyki`J zSX0aE6Py!bCQc%59FJM%x&Dcj_)*ERf%t*g{PN1M!>Q=G+E7;b(@{vR+;u3DUDgX- z`s)`_blWV?pL%3GfSjAoEG4$Q=k*&ecjqyv1y9DIJ(~ps>-<<@?U|h$9ABL7V3*ZG z(}h-kA>1CHL#VCn&ML@KYAaP<8!7N1hR>xG>7s#NhhP4I4J0X3R}35fR~1 z`|o3v&9Bg5Dk>`8Cz-OEKYv{^=3d66sNu7!j7C0%#Y_#xbSOq8iBcfHVuoCwn}^2O zFq|ULw0V3IJQB1j0d<+h=YNc zo~!*u3}`>C#hC@!3e8JDFdh-r(kkypWmqQ>%*L5`RuvoOa$9WE=eZ##Er{Cl!L@%7 z#WuQ~@?ojNIp=Eh6Yg)cw8pxHX99YUT2hSXA4?s>U}&BU8j^+|NS`l*g3{_Rk)}@- z>bM203psw5q+Ha5`y;J)%2kUN=yx!~ePl=qY4Nc*mm$aIS}oc_j3h}2oTT|@#t|@1 zp@yH!KzGLzihA{ff(c?y5#KTyCSl+a`Vs~)6VnXS>R4H2MTGs?dWZ!@vB}qN5e%We zpMS7)OkZ&}cS+O*L>$WICo1C1R$t;HQGQ5{QYtlJ!({ zWHhi47BJ*ytllC^eCE6b;_;hFwhif8l~< zIP5RW`dhD9o8(YRDMcmgL`g__s!B)&(e*tvDF@Pe*gx{e40Te30)SHLT(CIsGy0;) zCnpSVIiZCf(rDJ(yd`4X_owdfjF95;e0^S@T#X#6eS`j(3dtz7sd>w~u>opy(os1a zaUQi#3QYJ@T!Ghl*lcz{*~LykK*0|OntKFiOFpWp*^{-6WQ)aC-@mIilNp~#lrc{~ zN)eY4jA?f5l-)w>^2qUjdvp&!p{~{@V+80287^nQ7z-)X8ZF9-agyX5s}euK9EdPH zI=i@KV3YJtFP~!1DWSp51mB*s)&#j5jrpns|DWa8m|tco>(OgjUN@ z3e%`?AuaRbz$oIus`E@GwT!Mm8_K`cffJ<9ABK4$Y2Y@;)VxOfvq22z)J#a><>H*# z+*B(tkAEZrLev;ZGN^3U+VhE<@Su*Y||6+L~@hHUvnq!W(x-;oF(vz z>L$nAAC~@}rxbZdD+?&BDJA`(kWWN9SBNQpa`f~Tk2@5U0pUX~s7MpCER8QWW)Koe zq63e9%|;6)sR@rVdtUKA=G$1|=(X6xt3UVCN(qd2$;+bL!cN7R_Or3l|NMXRm>@Z&p{)2ha?@4Zi(_Z@;BUu+O)I=U0rejuh1 zg8@ycMFP6Ak-EHW9Qi@TUo6Y`^otQWNND+$UhYp7-tIfX;GiONJ<1uzM{NE)gd~jM z+J4cIQnH^CvcBIGp}p}UJI{qyk7J#E-r$xg4y8wk*>3zL`NuisWC%No`RBkC3D$l z23EbTqzA=V22bl`QT{sA@hM3{nq5i+^o*26+hg6!cbD7tQS+cKFm1Q-;15s~a1@#j z^cSYEtn6}@N8*Ow8_)7WHej^#)TGAoDHBMICGG6d2G1=qXNvI|xcg=|!)$!9smZvT z3`zOanhWUF*sT?0CH(S+~MD+pzDvo{g1a3V@)z zQ;w8%u1wbG0Lq)Ywg*@UoM2|=axY5Tt#wS#VV>2d7Z6N(z6e!mNFjOcHpOf@5=uxg zVfG*>s{Id1B6mbb3@u$sN_xTVQTe}lt36G||u(e>&1Pu1TUewapkBn+%bcrCgCfz7;|lgais2#Es% zh}K4Zl(I@uVcKHQ6g_3=^-5XN5zC23nVw#5wk3mX5#Em8B3dY9H^x{=`JDlU1fx zW>hp?*5(&StXWYStz-l~TRiy5+&}<*n#6P$21OD{Xf$zP&b2&2I_%zu6lVhcbhrvD?DV^YZb&>~jg^3%7TyKdd zN*UclI7Ai`mCN6i4QREa3)vB%a=eYazOyqUkuO?a%Nbrh8jz>|6`IMIDT+o>-japG 
zhD0QUHrJm5L}#Lyk8}4JAh4(Twzjz2;L~l)Xm7; z8H-dn*L6exx-57YXJg99NYoZMixsbsm8Ve35K%(8)pJW-dw}6`$t@H1)3Z>?G(VVC z&MC0ZiVOj-20d8O#$~sRtZYx;C%3Pa-yO*&5Cv>wZ*eF!>>h65wnhwEim^8*6kc%d z>%dG@LB}NlbK@x_K2nS>N(-bmAuEtV2ct4?j{-VSDRmmnpU1E7|y8EXUh zQX&-+QsY53lFRDvzcfWno{zZWbNGw;CfC5pN*+Ty3-Alb_6&^dMqB87Xwd&)?N66e zF*h(*xa5Lt6Q66l!6P!jr{VrY2!2}Con27skIE|XrnSW z{)5m>xS{UO7TwMb%kdJ+C8{`BhX(dyQWdZ*+1NL^}P^1H8q(=SRP>4* z{H_@75hBCkC`JTFwc@mvH|m7_hU>d1{>kl*j}9ml1P^P07Omrt2`XzbCHVA#tT_cYdH9I}YXBuHO5pV zn62)&Iu&(9vk1jP^Q^Jb3#{<=uCAeVZ7f+(nfbA_uz`e=t_db_%g;hN`QvTEHI#CV z?>?dS2`^3@U&o1Qtpmy5j{u03NqNTs{w3iV3IPRi$_D2Fj;djSJbLc~D(RO73W0es zC{0w-Y8mnhjh6S(cy`R~>}JJ)+E#eMn>UP}DHmjP3}tgsOhp?Cnn^NL6`Me}l|uKT z)X4r0FKOrdXK#vA-R@O_}}%paIV{*7*>xnp+~)s3QDh! zP0SBbO|#wdbU_f}Fz?=j_x_`F)P39Hglxy&hdN3yvHRc+QqoceIQ8wrg`=!O=N)WHhPnQzwY>k18SU?1>w3P* zCPQjtSA;=G;xazO%N&uQT(lM)me3_!>yS_`-FC~!{04kN35oPSxuz)Mw1}e{{6lfZ zM7K;lRt`@UcEDzM$iK<;3J=DhQU{C^jlQPsS=Gt91ybhxK4`%=Jo$6CQ>g|`!g}z6oXKz-+8GKdUILZ24aE=7p!rdv0i==cZ5()`BW$?2U^*$sOX1 zLHZzryBCrDxg}~S2*9+#sH%}uhR0GLt5HaW;wdb#;SRTsNxBxt7#o=o5yi}&OtO-} z^dcHDT%CQKtx>Z5_!#&>Z?26abkE_>BuY>&98qAjYA}pwQlNE4Yo{RfZ6A!?qxHKm zz0JCY&-{Bn917L6zz93(-}3NqgP<-k1kyti`fB8jE78KLssKvltO2D7Hz_5Z0#8Xu zg6N@w1+Us(E+P7&F;pXfN@8$I0Xdqw=>A{y;vr%(5?BXClq3|zj=WZU&0v$FWYTmA zV7R8J#3W5XYdf#=*OjH^4MILG?R*pQP+VUjSsdMv_PxF#ECp31)N8e?`5A=T9B_lD zzKzv(SCQYRIW@HCNyR9c#MIfRRuu;DidIj#?m=&x-u|9GD1w05<)gHe)4Ly^J47*& zUUsunR7G>Q=Zp6I+Ukn9%GnS3RivxiRACuZ<>g_Q*#ys5bI;O4w~pVo%XbEOw+`2M zd;703abU6Zi_vTD3Sird!FP+3$nl^nt_9_qa3OPzik7!o z4e7q>UgwR)35!8W5GVNXLrZ)e6@i;Zh74Ra1h#BU6|oqb#Y5*-epp`m6)r0jGOQzA z{cv%dnp)Hva{mlZdU!BXtB6|uw*TOmz`Phn)MM9j%QsgElC?Dqqa#us)KJ`A?Rnx2FP0LQSo|em|5{Wr@A_(^{!RbEX zuMykpUE>;;wP5GdmX(Mil=i~LX@VeVRZqKO(G2#DgAQFo-ky#@dpA}T)8>0{KD$;N zJa!lvx62E)O#wy1MJB8mIHToDY_QM6hkJ_VkcTl(2R)@V%g0gl?r)^%es3;d7x3ue z_iI(t-at0&Y{v0T1gYX4l3;0tT1~0HZ^-Y%D@SL{B{7f>sOY)~6AaV9bFR;A=m@A0 zSG72`cp*JKlis(VioXRX+%CWCxRt~nKH@96pXygfAgs8{DARp5&*MFotfy^QQMjio 
z?E_H_H|jN`@8+zF?jWnh!jL|_7a12bP=`C0Ar*E*xKve8Y#Dpx#j)dKhSjAJ-7Z`~ zqQWf8s8!qUI;2#u3>lQZv@l3GaA8vt1Sn!8n?M|?QabKP;-leLl`2|b-T9#CwG5)C z2fVt0Ih>v1@pco8y7yeI`(h;OIy09?yJmyC{PBT2%z_=xeu5WNgo%3*;``Ja#y(j& zEUG5AforJ)r;}WoS7duhA(B~cR-Vv$LO==+$?&J;@o0I?P2;9EWUe6cpdJztC8;Hz z07uk+d!=kgwy11Au!<3J%y-$*>x}sEdRW3df5^o=J^|ehq2LB0jwgKn-5C$ZVvV_p z1>G`Gi`|xW9!P8RosEBbL51MA3##(rc;Bm@UjP&3j8LzA$} zns|&#O85Op2U5DDUKj_)EK8L)tj3!o% z09+buu0IonM6u4EMX*JY;PXgCr{d)|&kc4+NR|vNo8i`-SM9H(9fI>Wn@0QJ>AX>< zQ;o>SM4`$n@63Mir5P|T;?!jCG-?z>EDSKVX4RnrAKlS@wLJz2+fA2?ppYWtw_d1W zJt^;jSqHoy9@);xv&nIZ?NJ7}8H1gqTzJ&Xu4U1jg*$UggzsKlFz0BwN#S)!l4<;T z^M|5fLuf4NW|@pkkjwK3Z1hmBTChSxzNb)b7d9**_YVH8Wjm!wn^TRIDi`rdyQVZ) zB93AljGNxU+}Y8=sOS`>k#-9X!8)N>__9A-wrLLhzkeo=2K)Y^DT)Rym3mfkyzs+w z;MFD2Wg?}WUoWVpX@_?X)UI7#ecRW2&U#vUAG}&h*)ZpbST0&BMw}acklRL+j6SHO zxv=^B!Lulaf{7n~c-efs){XX&y31X}S3~3rg60&{W!}3=`0O+!5H&L$$6p;pxX?cr zL{fg7=FIDk0o#I!Z^2IjKzmu*LrF$At}}LJdWI0$@7?v{dVe+Pb}gf4_=YD7v1&Ow zpeasRzt@kjTrgkYj*=M37Yp27n}8{VTuE{z`U}oV*Q`pwr$2Oip}5=MFE$GH;~gFQ zW?eH=SjqA0J|7iSxFgiW31|82yrLkS^g!&Xeunk#PFv}*Z80QOSB6{{(m=z_)t!6M zpKB*=49DV9mqx88byDh#(WsiB%t+)h@u?p;Nc)(OX|2v& zT5YOQSr*|kAkMyIJxIEUgg>4kOteNnc^@r!__>; z&8sUx5Hxp}oT+;{rN;(>XN|*XCX6lv_(4N%E8OoGaXH!pD)bCMw+S&+C*(^B^GGcZ zUMR=2j(#dFbgX)>;*A1=SuP=pjTZABiGH_m=KA~;62%E&!>US@_>J@7jvY&tf1>Kg zD68L@QS!ZpBgKNS(xE8ZB5XGcFN|TT8lbu2>#U|Byl(XMYD4@o-xxTv8_50&$Al9E z2A;+9>={qZ4Xec+I@_PY&>6g+-}XC}SD9!cG+}&~eqPZ}cM-9v^YNju0%;PxECrp) zx!D6*Nd(}$b|EtE9ATOJJt!?-yJb}qaECT_a3;xwfII6m#bY>wtV;FvMo$z6#(xag z#})N|15|XiA@Wg^XKr>J_?!crlZg zocf#YI^6nPK;rXb**Xh2^P`j%f}mp}BgRl*FFkMUjDLFy>iOj9vRcF_JtHIKaG9TK zyuL~hlzd=aMdnW614%JR$K?nyW1yXBL{T$9h*rLWuXRyM04xi>&{<)!xNT|7_oRvU zc=53zi!N9qu}3~Z-LBp*IL?P=V*#019nDO5QsXomSjCdt3?Lc`UAJ3yL7?#@zWb|$qUQGVa!u6|n zMu`Y*HFFWwKyDYBW1jlLAKxbI5I)TWTkVf2$x2+0F9I~rhW%Kf<~a1G9RB2XFt@Oo z1|)RAUxt0<$z8<_WM^|-C)FCV_RWckvNV;OT0oAfwI*wKzII|a=}Sd0&fn3pzW4IZxi^|7^ZX!}J}VPQnF3oC#U zN`xe(peo^Q0>bOh6-r1gA{(&thojWeV=nl!DL4BaKF=uc{wh#Cj3T5j;oM^2dtx!3 
zQMl-PXoBie7V#0{OOK_jGbE&ZG*1}^!R2&fux~Z9KP9$IgW}90U+OLrzTOc~ zI0+9OSxHmjXA3zhiMvynbNrSX^k>w~-$&ev5;Q;!#S(t22X=r#&K%2R1#CN#T=YT7 zR7-VU?Po>`Y`gM6>u3uFb;h{y$Zeo8vIsE)Kic0Dh&-`jRj9+h{H7&Ns;S2=t-arg z$YoY#c}}&D*FN`pPSkF4@1`|OAlK4N6w+ph~Cn7yIqnN&uqk!g{9z9|=pvwEyr z4DCIrQV`t!Hn*OKc=5^fS~HqvLdyr{`Dd0ADJFob7gW5|DWil)$z+$wn!g#@H9nfh zTudO|Zt_fekPqqN6dtrlDQXkmHOmNU&tyz@s8(1A)QSbMC3shKXCtPZXTdmSwqH2U zTrp>?_};#bS7e1UGR#VVo@P>t_lSq;F=Df|QK4C_YbdZ^wXAh0^pD-hVc|yfU?uFy zBM^dZu=2~xejEAh4EioksE!*nB#QD%^3Ctv8$09WG%h|lBqpDdHPig0!uR1yJg~sx znh9wu@n!F6$m^HQ3<71kw(9I@yUm8?&Tvb);oUQjZ^JA zDPURC{wI{uCpAh;1rq~R;QSuyf$<>;6Dtw7g$FLi9}HYxyv2)Ac=wDlXza_tQE(&U zJV?o#jDe@Och1%;v?xUZg#Nk{pR~QQw(`FcsWoh5od}Vk*ot6oo3TKVaGK2F!V%_a zE}1a75P3Fb$ZI+FfPx@LWf`GG_t8#%?C*XGf3B*Wqa~{A1+v3xB}XGR5VN+>_I>du zw*{yoYfnp@o{|r7M?!Jd&y1EnJY<@z^ApA*k)oK#$!HwuA1?U>wPrYcvg^#x7CmKA z8`-p5_0bV(H(`V01G|z!gN|6NJ@P?MQoV06Y)VziFul* z{^7@-=c|$wSC40wmF3iBF=BSVCZpcHek4%DktwLqYpf?m&Kb~T;sB{z!qr3fn520| zW_svENdCsr2RLi(S7X`VyMJrkzZZz>fhqX4Uo-r8HUtNbJ>;iM`5L%!_Z=#VO3|)b zXh}$N9?V)8Va}?F0`JE}#-kMFNQg;9JRysaH$+wVkc`LC8E9{)f=))tOr5-h+#;bf z``LT1asO#BCa)XGS9QXJVwpt*S((ZOfLlbWiq9y7TpJzi_+u77F(5NJRZ4tPRmLq{ zCA%#pFQGSg+msqAPhu=T6zM}%LQ^wRHeR@6dW}+O+*LJ-7u?k{8n@Wa#?1n25B$i4 zED3|}qqj&7XO2BV9{O@}EOqp{A|fWzZphr;7{72xgYL`ey;o?8=v(9*G2|RHA^%@g zy;jhYL;uO$F?e#XPa-D@TDj=OWhT;z&itQLwfSp5eROqzS?l%kc7>v{e3K!ml-RTy z@7+H25WWxmp-m*H$;(E56A7vY(U0ti+s7Zn$W)KX0rmcMDE>Y25ay;@u(zs=SEHE+ zI=b61($wt_nV8gG9~-Z{ft9&V!wZsrB5vP*3z|x}s;Za31cn#F%StdEOk#WroV^^N zTrG{x%ZC9twfiz|-+qUO(dls3QAT1;Axx@`o$F5n@x(72{ik#lu*JlLb8>d6a`T4W zCXz@*WGXrgY=uSJrq`H&@^bkL47pswp%p9cN-Id|j9ofJs7;0zxgbcQ_B66THI&^n zYObfb=HMcsVJH4seay*F+^mU8bxXg??kb z8gx%+&ikiV7)kLVImcRtKtv#F>uSN|7^|&rV`Fgs&@G(Zb&YODt6GXLkXIFU`=6Yk(kAGtG z_RIKf%`kx+-ebJm?T<}lA_92$&X{U*uyKX>^sIIM@Oup#%q`p^C!O56ASTJb$ zX+#cVn6__ONt86vo}sO74T55<=dSild-eDoqk2E#l-fdBS>^qD9TJN>&pu%MB5zo< z60Cv~AIYiMVxX;p0HWqyy7dM~KuKG{uK&dD4JIgxBiqg*FO6}HT1A?eZZl1E9qhr_JU<6P-MjP*dzT-kk{UHU z_BrVvIdej9G~uL~s5LHGO5XS*s^NOiEnn>hr57?|6oG 
zsx#iyG*wevdvKvpf?^C^=bkX?CpeiM6fIm#(bKzgy_r1z{rwCLVWFF}P{m^{Q+2zE zi-x1;=!RFnrtUO1k8zSHN=S=~bsOb@QL_fYb7%(`nCpDJ-4?lJ`fSr~+Q{w&HRlhIt-Hx8|K~7>8hB;{n8f?=G5t5jT z4qaU;?K7E~XwjkJDSoNJdc(;jP%%)#@$)b62PH%Aj<9vMf`e<#^CU)d>_z9Uzk;oY z4d&EAP*hsUOJ$@CT88bSg1Vf;56i|dE(7g*3Xd(lchQHpX`^qYEi-shFKB4jz8dOt z%a^}xXs3sX^M^Ch6?GvlG6~PF@bA@RJxi@-lsJ_%6&jWPde8c!Or>jWa}CVdFcA)| ztk9}kbH*3%QxQPa=yemfp)k9c8svu)g1Yv(#*^q*ASil`Jv>8XXYUT5MXdbX(5ar& zn@pOvt(rAyNl!sZ7GA?$5f&1~l!mA2YyAD9v3{FivD~>+yFx=(3A^q`Gf4w~@-l?0 znF{{ie-AtM-{Ph3{Vbr4pyy6M$7~Nn$W=@6d(J-7`nn+|G7kEtHI3;>T#_Xtg5+~5;IM@bn8*9i-UF&R=5;*y!Z zt_~`lJ>wpuZm5E(OUJ;zon@`(fB!lRq0b}v`%feL$3-X9yS#sR+ZmMPbEJA*3r&$( zt0y8QDGjZ?nm4%5ZV?IY{)wk>>E0ZRHqL-JkqldOwio)l8jH8S!B{YZqOSer49l8p}!kM)+f|0o%nmVWo(tVUWGIF!w*~gU` zggy^Luqlk`A|K#uT`fW5e zZM}qXZS-KHB{=xsCFSFJR2IA^bKE5u2j1asAAPNZn@>XDQX3hd#wMF$B z*%|hoI8K{%DR%0r(fFRw+S3tM?aa}plQZnytQewNy6zO;MrCTKfyE2@A}+54>u*Fe z(f!@5RPft?W{}A%#Jq)vap=So#<%HXaN+uEY}|AK<63H?b2C9nIlHKg2{q|F#1-b& z#!L}buVJnq{acUhIET=e;h3;s7?ar{GAst#1~u#IT|9IXG73^;zu`zuPJ_5a?Pb-e zKh=Am5b$iKZ9SZsZJNWZnS*eWx_0~yQ<0ha6Y1z{GPN6P1GQcG-Uwj)??%q*4?RO| zBqgNQtLh%zaTY~6#Zb{z#I)sOp{OK}__#!9HL#B0hS|Gt@$Vbt*fD5F#sKPU+JBlUq`$At^8ICsk_-SEZ{JQx9R$hs~LQg~JDN5q! z?#6f$nS(QT{Biqk06Kfv!?U9ebhYZY$u$Ia1x-g2 z^c>O|vC)ZW+q-3*gi#mhD&O(V=P`6fU#M$xmiOg_g#1t&$t9x7dJ_WCA}X7jQQdZz zYbPI~Fsle=PDU6wx;y{r?&QtZLsUe+9kt3Rvu!?*Y_Il z35Ql3V#>|UtMfZiL-+mDPa&%!gDF3af|`a3arPN3`4=T9MzZ}<#X$*4QN5J0kke@C?N`<{2|uI%C7j-O z4F}d7hqRP9rcCUNj%_*ilCY>0Y#>C1+(OLhWCRQKN^fjhmg#nI1_I)9psK9|6B7-X z7-~XIMIOq^vW)vcGI@A{LK6@e7z^pbVsy1uhnHndLQ5u68_h8#MvkApY#bw%wCd_8 z)YYINph4bKyDni;Dvz4RS$v`2$G~1kATT{6H48I)Er*4@8HP^i!$>Ty{ae*}?wNg8 zATcZ%i#AVZq!k`yHD#c0TGRV`R{xbK%`d^6)svXN6H`;+)=T&`_N+g~q`{#!!$D(v zR!Rtp;xM%_sP#M1d{1}(dyL7;M=}{OC_;lH(PMPoSwg69*s2Mep{cEo(K80~^N7*@ zcOTsLlVfBz{kR7=4?Uo{A*e6K(9dZk|7wyHA*k6rf)bMy!<(?6vk~?< z8LtXULG9~wysOAbz(_?JJ=L?@=s_TU%i4tRYOH2akbmAyrH2mj*o? 
z3-*v%m8e&fDoOUO-)tr z?xkml4^G6wjng>NB2g9Fb#)SY?l(R`xUl;=T>G^aASn91JG5z4C(%4O`xFBw_uvy0 zg-siay6u8K@3H=e?XYiU4etq+Su&{I&cIsdlL(6DyI(zJ3*OylYA;5T)`Jk#`!79? z#lHYSQJ36}yuSy{?jr z?!))`8_fB25+tg`@#W0A)7qLi;ONOI_l~6m^mIb?E;be*-;{~09Gc&U5{7$>q97bzTC$#r$ zRgE--lVNCJG!ry@eCs(_IhzX*6hnH^@t?F%K~uY|ow5yy!OUtY^}Fax2rBt&jA8N5 zOi=1vI}{Ve+Omfu##affWbti49Sc|dH5<=|`eX09BiOd>8ZKV-!>~c^FrcS1+O#yo zUkC4C;n@&$x6?!)JJnh%1lIy>lfgm6Qs%TWwvkAI15_7&W8jine$i-sQK>JVr)BCPuFkJT5c6GEd%% z2LZTt><+>SIi<#=qd2$HiT9mPFisu3%$MA#)34Xik6);Gc=ZLg&fClLRR52(v0pQ< zyZC1$C^~N)B#lwBb*(h9bmt6~A)1&}McbvLcX9Eb8(6pg9P~|8;o)k7c~iR~C^!Kp z&OgJIWA9<7rwChJd7{=yK~r7=+6uzew5W_iTnbFbqu_KH*z04)h5?XOkU@4{E;kVRYSmn1p(|ex zPd~%7)ngb}gX)k|uIZmd4dst-`r*HAXE9?%O{Y_8Kc-m^>^@Q1Fo78dWDAv^({B`1VPL0JKQ_xzz_q*0U!m4bn1FNG7Ub)SFne^m4%H>B1e1L&Kxs|2z<3 z0g;FaOkjx9wNGpGA`+dXBquKyV*vj@_O1ggimQtr=~!q2q9CAHi4}W|-Poc=V~f4V zB*vB)^;e_D-i;bHYGRDNH^hplC@4*u^xk`s-u`!IcM(F7vLN6)-*??{S%#T6GiUF+ z@7{CAbjh8TUjNICP1l~o-yw+V?`#gy%P2p?S9^_+j%bv~1V3z-LJp%A{{QSl+T0 z{(s@bEw=p{I!!e(a`GT_9@>GC+WRU3O40ax^97{LPRrNYJ~=rRS5KMZ^@BHp*JaRN z5A6mRBa(TS=9gX|A|wh6SBhl%<6;wY_jT#;O~eKz2nn024eKE#Ayp9Edu`?*1p6n8 zNyppWAID~#L2_&gw2kUx+KR6+WL)pu0QIVgTk-0iy+Be#P=|i|3xV#TSh#IE#?Bg2 zqH$~0u^HNSYh7xBx_IO^&g{M>%NYgw)k-fIM5|(V8!=L&Z z01a(5!Kwl-{&5Sk@-i4Te=xpXKdso$CbMZuT1q88ML+A@h(>@Aje>-e7d~5?? 
z)Wr}}myJXJ(LDv>I$8-Nfw_OTUxDotCrp|*2Cfe7_5LV-7vvMUYyO!b z22bjXs|W7l^)m-NzWoB>evw$bXKpT$l?3EzeY2$GqLTEbv}4&3*xwf%CTiC;myy%e zs&M_wMN(p=&2^Ih51TO%jr6to6V4$u&=k(Y&Ks+a|A0Qjx_t2SzU~2t^p8PYL;|KR z{|a6Ebbyz;Uy*In$cqeTo>4|-7S12Lj@S1^-c!4Qtr>xQTOhx01|y+M`)?sCJO;Lm z&>oz(fRd^b7VVl@prV3^f+UQ6+ye0Qf)yU$v4rWFN61Lc#JC?vVE&rPg&q@mgG5sQ z?7Sj)u#0xgC>24K7LRVcz_InG5f>tOg`}iODh2^n zmyInlWD(#UBE*bN?mmZ5&j?WCgLTK26kCk0)lifud;ORhICQXa{JS_m|x70&?cp|=>KLiW@_ss{sEW(6>UtzyJnqu3TRUb^V zq11-}#}I*_=9f^5-16=#G;CQhd4{i^IO6!0Gw^o^rvH+TJ9UWJ#|SF3s$SRfxIVdQ z#D?wlg#Ke3(g$T9Yisbj$C7|L`%rd$pm(USTu%v_{U;RPZ;+BjhaVY$zQaGPEO2Dg z89_Fs{UBrfdTi+j35s%!&h5V@#FeK1Z$gO(>iInz#Dv6Q`{gx7N)VUL8xO-D8&1Q< z%u!gaWjM`jB_pTMs(xL|LsGd>Ov7T_V~+B{#^Md;FPkcqeUw$+9Ax?&yOzPh2qO`!5P~6)p*B?XoL*L!>slU?%1yIv(vp)@MSzyDW7&g7OWo1oaU~Wy`&b~klXnuOuj~WZa9va_rHeWjvaPpFjNd}gJP z?^Bad+Vk;t@yGlP({ei_R~AU@?%ctfczE$C62gSmfz;D$BeOrNa@?t+NGf;9MN@~z zOezAl7LL$0(5W`Z``yMFSUz?GR*v6-1skSf=!CwX?6s?Gc_NzDk*xCL%a)RkH`MKx zB$Ez&?2Pe`Nm9A;iW&{n!6ALnYSri|L#xg4QX=oxbE~jn-d^na;TSCM+hFO|IaRVd zP)&iad!SIt^74)?QsPo2-9BcAYYI7WlB(jM z%6-E&jdQE*mC7NzTiKxeqm_FXqU(STIPlX69R1}q%+8x*#)_|nf?@Muhow0G+UVQd>OUzz55p6oP;1{|SUX$rrbJ#z267sa@kz+<3V5#Y@ z86&6cntcA{At@4@5mxx(qZ4^yu@v#wj*F02lt;JEMZwqP541k2%IMUm4KmZRFnDxd zm|lJi$ER)C0F=7h&RK}t_Bo!vN*UrPdGM2yLjs$3(*H$`iP}aL2}{dtuN) z7hg>5j}cS)SE^1`Rm7_&Z-lfHhiA?RbpxJ_QNadg$c_;=?Om<()( zk<$jDL-*F7%(UM6kpt{4o#A2YEhsXGktCkZQPTrcMoej+@-tB$k|K4iv9uBDAHms0 z3Lrr>lC=GH!}NTO0px0kxSO z?R$NO{-b-KW6#fO#JN*4uG1@bIJ|Iyn}ZkpTmldi7@zkekyGpvowb8IhnSTT5 zl)4;{_RC#nFRPl4Q&mBk8!-&ko=VFzwEQXP@cd!Ag^y+#LD|ATj9Lo?d3pS@v!FbO zs-8-fuiGmN?4P*`GP?CO3(6EzS719!N4Pq8z{fcN?_!b#H32>2hG^Et5Y5{*MY|rY zK5F?5DGT_z2f^PXNFbsR|8PVFMI$CCUZOOR7c^yi&Flq4>X$DVDMj(SzXC~0OjI<_ z)2gGPjH_;c^3g2gx9;)@o8V&;0KL`?VbEL`va+&JP?Q%MU&<@Up_Yt{kb*$H+Io-Z zQlkpG6XTQd?8Yk?H8&`>3zmnA51iPv(AOgnq23WlPZoJ4q^qxEqzxTCX8KytZ>}ed zA$e?Nl{%rWFCqS62oH`DloVpZW7!u;oY2!QIy?rk;cvmg*({wocE) 
z#sz!v`p%oYH~w2@G0Z5UN_8;Qde?{?hNAYP-Bbhp#`Y2f|I6BpH7qa+&M)0D=s!vWU^|ZK|{?_|Gfa;5P`VdoqXV7q`$wZ5!bqhHVyowF_>RUc$@;qW)%omvYQ$Ui}p_66J1_ z_h-g^Ti~ZX3rkc1;K;nE)VMT6MMM|b@~NyaJ8gkFb!(%~*e+$>2a(mwdv-{QNrs`3 zUYYhmZP_i%tl{$7O=xu`nK3e>?g$Tx9n6KHxai^T)Pb5VexTI+-$VS%Hs*PEzCt>6_0}A&t{3GD- z#05qz3=tg}gAM~eV^c*Xsh#JW7j96NQG~BoAcjtrj;55mgAY94_(NN}KGf99rU>Ed z5dhm4PVjQ^MW|N`K7yC!~|8fq3vQBn^{lftlLV;sG=wOS7d=~53E z)2rCFMrDDcwHvl9Jb>8X1o|%}?u}#*!wYHcpIrw9v3IqISsx0pcOKRrJ^GnIPC-tH z&tzw1Ls?l#NTie?JSqx(Ms|_vxsn&vw7EV!TzxQd_CTrbo4hOQhYpAdiA6)*2B=%F zcA54QAD0LRYZtgVQetJGkm8UBG!MVY>=L_1wwxxaIZ25hkSCz-G3>$Ec5PL)&Jw21vbTk&6bE}N z7kD~(3r#|E-pe)9>K)b2onzPR4X2FChm4f=b5i)1UDpeTZ&m97Y1YO7ZVuj3dx&-} zKZ-wAo{|t0-Q?XXn4rj$>fWHKCX9QvKy*+X4sQ4h3GoGW(Ei$c0or<+QvLfUww;59 zmPn~pPN8DE+fcehx4|7Sa@GKJ8_^M23fb`T@I#-BP%YWjWQTX4!C0KKKF$PWQ zgL+y*o)yg$Rx)#7WBZEP9AM3oqOvl#1DgQC z*rb_My-<~U_1KhBl#?VUU%54htxb$De0qNjoYV{IhH3~54uz+iACi+(OSPW{jWjWO z$`E|FekN2@l+eCyW2mYMF*}+UA7>WA#(F<915P2;grvm9kV48iE%zSX_;0lyl=N~+ zqnMt5f~BK22wCJ}Y-cutjZ19(IO^i>%zF&^R@|uk#ra=Mnsf?UVEAn{RLqbvhd$nY0 zsZCro8hVELv$m(q9|I-k-I!l~ArO^vT_x03uU(>!x+TT2&1cZuxQUQUM2=NU^R$3<1>#G); zkLAO>utwUVIG3!94El6vCYYUj03KVpAuuqO=CeOAGiKLdC@&DM;v_}wn%RXq%r=+N z|4;3_h@L}C%A5uLu|TzSsmY};u_B4r-+wuYNZ;ICoJY)NR^;?fOi+||aG#lOpV3{> zxmUYFKljzNVL~n7{WBI|WQKL8mrCszJYD=?dFwUS>|dOp>`O>eLh|M;%eUH)#!)>c-VU* zCM*uJ%mr+~yeL}vYh6euB$Jl}BH;>ulHq^D)k5+>!W{lh7xYI{-yYC=f)I@%H5 z(b&J{Bv$SJu38Onc43Zt)eI7;siQbbhe*#k6&ZBC>L^TVtEf>GL)bifgRwJ)!MJ_F=bqwivzJUl(3nu%Jof-cSDnVw+g4bzZI00MPg;0z`6*g9 z6Ws^3*z~VQ&lZS@O@gzHAN2C*Uumo7PCFZNI;j{)C7H93hlrLyD4|fu>eMxEfW22Y zKvkn|l^zU=M}5pthY^1*O_b zurYIlnubcDzY`fAgKa+^M5C5EQmqIePjcIwgJ{*QsW3LN5%G8zoh%{l!a(vSR4+#h z2_%u5kmtN;EpW6rWYCu4W^Mbl!nm2kp`oeH_F4E&?OrELagv->??=MoC+o;`R2H;$VLL0vI?nXO}Xwz6b93L_|5 zn6SKS3oZ4c)dKdd{}cYM!PtIn4cc^;p61t_0Y6FV5Q_qhkrhNrl+TzseJD;RR(+Vu zY!gUOQL{Ftt{4Xu)p}^4RUe891>=QD?~-xm$Tc{zX_rxF11#M!uUv|qPfcIp;*N{h zvuR49KM#*gK%`d`YAY!sDlQS54hoC6q$9abr&mDjtx6GE>@!Y@i9J{|2q{M6>lEh6E8?9!NSLvD8Q!2yxs)@a-?@LHMh_!Qe 
z!|k;vPCVU(V83wOIr9iFZoNiQOm5Q-KW4j_oviFm9&8M0t(;mMHETDpH8%XYESI2s z-v%NhzVN&*iV5vpaRllORdL*W7Zjw}b!OqP)$p|qpmAu<#v`}_&%FmkS>DEV>et4M zRTD6J`XIR2dLi5|0*!QaAcOZ=qf|J2_ULuET6(~sgP!13weDb4;=gx=dCQJ>t+8fF z@qCk#ngM^O5Zrn0j3>_pWz}EVIB-F=>PS-J8;v5h)T&36-O27L`et0bc{Un0*81p! zuy@r7n4K^eJUyD)g?UEijjK=<{BDRU;w!zMJR3Wz{=LPPMN>65(k}ZFmLAkrw%En}MC5(Zhc>1@3?w(G=+To{N3xv)w z)br87@-0;B)kD3y@6!aRMb`Dd?;(Sc$%xqlv0&}g4@x7421er7soU|}%CT&JAI|T4 zZ03m5*Q^i~C4>Olv9WOClu}JeN_?YHFo9aiEECZbpsc2Z0TX&-=8Eym z+WZYt<5SV7VFMv;AT~N)@Tw?9LceVz{J4E?kuf<^OI7S}nQ)DV&q-)QvJKbaZL;d289RIbpZqdvL~ z?1&*>_m%ookjgjioOui@GaJMP<;HX=uZh->kCF#p(OKdp9GiX2e$Z*IiM4+&Lo;Lj zg6HDshwP-B0!6%3JiK-JLA;Ai!uBg`q^fd2SozIXI6QEnF>1-i%JUBc6_ zOVx78%QXOwFG)Ja3vRYvi15lcGok6qL1qWoO%_-cE(RM9%82}%+1I31zV676MQX-N z)Gz;w6MN6Y!Q2H?SC7NAC1a&}e3ZnxpzklpO3%8>#%y>M7_WKkh3ZA+jJg=&*W=5e z-7tSDhUvK{xWD59{x`$`O6*JURbV3iei?#@#B|~QN@8R__nXhrzjvz=|J}%#cUZpa zU&KZUjrTgTv2f;8QbS4VKR4V+h}vTnS?W%;U8*|upr)-VRPrm=Q5KqLil%Vc!Z&Bi zyHvzSCn7F1zOc5|63>ZNi#AYEbXBgEyPOU=y28Jh^)bBz?%M6X(q=JIb9NylUr4~2<*J)(-cohRZ)4)y2Nu2C6migO3mP=ikm@l~SKld}zC&U}k{?act5V-NHh$FZi$n#rX(~5;`FPyFWC1n* z5X|nT1BD{IZ_lU{9D5!NpU7nBHPOKQnFG+VjefBNwQlPbxH$>RpOmvUgv+yhVkA|_ z%}2#p{g~05zb7;D$>mG23#$C3qF<^JyTER!kScb5M9LJXA5UJu>cc;LFjLz4fdka* z<(~qkrKMxXPlpg07>l2eEW&`%J*0lD7mwV=iFM~`a}68EEmiPdq2QYt#4*n4J#;a; zP5l!6?hWT?Tz3e^yYwvd>DL;wzv&AN)%=PMp^@>}vim0dJcZ`OG`;InSf50WPmH7r zxeN`b+sr1Cgi#?^hSwj%G`%l=+BLsIj(6U`pAp~`LSwAQ#x$%7f2q-Y)JRud30uB2$~Vm` z4oQrlet#N_*S>M6!@SWihPKDhfo-9xQX6KEoN(o;H4Hu zb4(QZ9fC z9iD`ZW`4pnt~g{+SD%Lg5`v;Y_%vpDSA5AcpThP%j1v?mshUhs6ezM|Hn3&Gx|rQt z57D8~CH{R+N8fy{#@{6-uaLbV4?@l313Sc|7(L{xSk zBC5c+G_1Mp13Q0#d+w5l%YL6aiF6PX7FA`pVDFvdck#!893O&8At)06-X$wrOa7~l zI+)Z&2guBVt3|L-%v=%?IuCn4G|!V=esa$_#Dykcz&E`xd{Vy(JN6r=9w0Sd=$>=B zI;~_8QL%9+qI$%Ls?+9s{#ZZEP{@@cxJRboo?8qoJ!9CV5JDc*Vts;S)snrESp8al z_kMivm{i4*6V)qH!sxuZN^;?SMV6*kQyhy&)d*|s_+E#{TDQLQZpHDv|FB&Q=WCAU_bTqr7D z-dmN@DmeIbuQDDETEo6c&v_$tC2Sqv3SIgbLq?|oauP)Kf=2a~@b0ynkjY&ftnb<9 
z5|ra>SD0UUAxPd-WE@KPTh?w8g0icwS=HjHAJvPVk5A5w@Q+T8X9PvkG8AYdVRl-r z+|3D!Pxn4Gf}*qtS_$gjx=}r>{;CyJni*0@0U$dIIq~m{nkTi@z_AVgVAH9UMPH;^ zqSdLm*aVzAd_(BzH}BVPD!dr`_P@CS^HbINdN2Z_6^F_E&g7mHhlrpkcZ&kB-MchV z!^W?Tq0*wMh@cp<>M5XcL=d50%1a&VHWSrBSc`nAx2d~GGvQheXg zAty5vIZ4R{enDQQ7Us9FkG)F{3))5!B*rI0x3Lbq-Td+3;xj=cu%hEo58dDTjCw6>`{7{jdoN9Gh4|B{eHTUfP?Zpe zB1DG8!tJFOZPLou9eznUI7!t&$jcT|d7n~z{{|Dix@aF8iETd~LR@qrR8{KYU`+?9_xJL$pMHg-!L+_|@e2nBc2>(ueSLr&EGpcK%iEoQ#YQ{_?=) zsz`PX#Kw7hAtxsXPa9uQ@3~=&pxhmOgwH6|R)l5)4b&+?vw*VT)nf}ve@rSPmkg0NB_p0y z*D9IxF{#oz2a8q#yY^~^l|PJtf_&kX{3OI??-T$>7eDN{^bA>wa_Ba=6GqP(S|;JU zhgY5>J5%rgFLFUQ4o*@v6I6nMx73b3CkNT@-a$r1Mf4Ndl4%eaJR35Z=!Y3qy89OtHUv)iD}ptb?IzppM;}r=X6qVwsQQ$qRQ}H?@UU z3tcQ)KNSu1itoM?5gLP;U6#^Hq6hP$TJkH&Q5C>f_bLX|Py*K+qmYxB#4NEyo|Ic; z3NG4)3d)cxmyRkIK@q4$)4t!PpsRrs%q2{|}P)nKrx(mVFDDvJ@$;io~!U<8%r>tF2Z-oa-f2#Zg{fWe>P z^B&TlX^~g9V$m@4YNdx|Uu}SglW&2kp0}+(ZIa@FH@}J;oTNStZ_3m^Dj5f#1_Np7 z$c`vniIcjOkk>_Vs712|R_H;E7}O4PzV3|;Gk3$;)+6_Q)3c8yQ@yMF(s6K-`ZT;L zT9KH}%uoP8Anc|}<){cc#1sRq(SHqmByA79~DkAsucr$bcaI#Df)m-*W${O^u0LK4$+e;%EZfqjny1o_&LqfF4!NTeD1DN;jVgQ3sNIJ<|0wlwsVcE}GU?7St0AG!zTC zxa5KPMy0|dGDS!f%}`cEw{Fcbd~gS7s@Ey=ak+YiV&yNnns%+ZC&j@@ij!4ch~!WD z%E{Kkcl`{|x2a00_Y<3vi6>t1unI^(8O=@KfVD`SiDq_jB;ZH)tX-+ zj;aP8ld5Eh(-XGQwvl$7G7%I(U0Due+t$bS5lzrXF%^I8xPn6`A0a8Fbjm88KH-wh z%lztaaFXJXz?%LEPe>EG=av`k8`i=4AqMD>m52@N&cXb7$!Ba%Z-XS87x`u3;3UN% z@usMN?^Rq<1}?velG=W|G_H&F{dM8?$Oe0l+(SmzheZtDyz!$KcXmup{K{}}lH$mV zpO|G|vkQZDz&ok!l=RZ)bkjj6nG~$scoETYNkzMJcD{&>6n=h*Uk(mVQXK#Frk=8w zU#WEFcXK~Dh9*gM|K0WLVtR{u*s|p^!lDui{mjFsx&1z`^DDx^Ns6N|P&&dNDe2kR zYUYbao^ev$Idw>$-$4_*cHbxvP^VR0r#FHErw=1cTYf1xI7x97=1t`=Tf{b3r)Oqo zAAS;mZ4Uzwm7F29ztKcP5#yVwVBa72^1W~Q$_q&eX?&{3!AXjvL}ay6rrP{0L?mb zsH-T8^vrC8zstabw=uBukEf4QElw-3C?5w0Cn=5!AuDp5`iecP(acJ!f=6?`F^r(@ z@=@gAB*jr#ghipLPRyu_Lq}%R&R9=OT=L?Ng))1@Y%3!uS3YVSoTNB7I5;>-ad2>O zlH%at;NT?1!NI}7Ns5DmgOd~o2L}fyDGm+}PEs5k92`{x{}*5Yoi5uXc|y+l00000 LNkvXXu0mjfD4>h= diff --git a/src/paperless_tesseract/tests/test_ocr.py 
b/src/paperless_tesseract/tests/test_ocr.py index e0d5726ba..7124fbed6 100644 --- a/src/paperless_tesseract/tests/test_ocr.py +++ b/src/paperless_tesseract/tests/test_ocr.py @@ -1,34 +1,9 @@ import os from unittest import mock, skipIf -import pyocr from django.test import TestCase -from pyocr.libtesseract.tesseract_raw import \ - TesseractError as OtherTesseractError -from ..parsers import image_to_string, strip_excess_whitespace - - -class FakeTesseract(object): - - @staticmethod - def can_detect_orientation(): - return True - - @staticmethod - def detect_orientation(file_handle, lang): - raise OtherTesseractError("arbitrary status", "message") - - @staticmethod - def image_to_string(file_handle, lang): - return "This is test text" - - -class FakePyOcr(object): - - @staticmethod - def get_available_tools(): - return [FakeTesseract] +from ..parsers import strip_excess_whitespace class TestOCR(TestCase): @@ -45,9 +20,6 @@ class TestOCR(TestCase): ) ] - SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") - TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) - def test_strip_excess_whitespace(self): for source, result in self.text_cases: actual_result = strip_excess_whitespace(source) @@ -60,17 +32,3 @@ class TestOCR(TestCase): actual_result ) ) - - @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") - @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) - def test_image_to_string_with_text_free_page(self): - """ - This test is sort of silly, since it's really just reproducing an odd - exception thrown by pyocr when it encounters a page with no text. - Actually running this test against an installation of Tesseract results - in a segmentation fault rooted somewhere deep inside pyocr where I - don't care to dig. Regardless, if you run the consumer normally, - text-free pages are now handled correctly so long as we work around - this weird exception. 
- """ - image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 6d4323fc2..bc37b0b84 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -6,41 +6,13 @@ from typing import ContextManager from unittest import mock from django.test import TestCase, override_settings -from pyocr.error import TesseractError from documents.parsers import ParseError, run_convert -from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError +from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf image_to_string_calls = [] -class FakeTesseract(object): - - @staticmethod - def can_detect_orientation(): - return True - - @staticmethod - def detect_orientation(file_handle, lang): - raise TesseractError("arbitrary status", "message") - - @staticmethod - def get_available_languages(): - return ['eng', 'deu'] - - @staticmethod - def image_to_string(file_handle, lang): - image_to_string_calls.append((file_handle.name, lang)) - return file_handle.read() - - -class FakePyOcr(object): - - @staticmethod - def get_available_tools(): - return [FakeTesseract] - - def fake_convert(input_file, output_file, **kwargs): with open(input_file) as f: lines = f.readlines() @@ -50,12 +22,6 @@ def fake_convert(input_file, output_file, **kwargs): f2.write(line.strip()) -def fake_unpaper(pnm): - output = pnm + ".unpaper.pnm" - shutil.copy(pnm, output) - return output - - class FakeImageFile(ContextManager): def __init__(self, fname): self.fname = fname @@ -67,92 +33,6 @@ class FakeImageFile(ContextManager): return os.path.basename(self.fname) -fake_image = FakeImageFile - - -@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) -@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) 
-@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) -@mock.patch("paperless_tesseract.parsers.Image.open", open) -class TestRasterisedDocumentParser(TestCase): - - def setUp(self): - self.scratch = tempfile.mkdtemp() - - global image_to_string_calls - - image_to_string_calls = [] - - override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() - - def tearDown(self): - shutil.rmtree(self.scratch) - - def get_input_file(self, pages): - _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) - with open(fname, "w") as f: - f.writelines([f"line {p}\n" for p in range(pages)]) - return fname - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_simple_language_match(self): - parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_2_pages(self): - parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_3_pages(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) - def test_parse_text_lang_detect_failed(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] 
for args in image_to_string_calls], ["eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") - def test_parse_text_lang_not_installed(self): - parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2 line 3") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") - def test_parse_text_lang_mismatch(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") - def test_parse_empty_doc(self): - parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) - try: - parser.get_text() - except ParseError as e: - self.assertEqual("Empty document, nothing to do.", str(e)) - else: - self.fail("Should raise exception") - - class TestAuxilliaryFunctions(TestCase): def setUp(self): @@ -173,32 +53,7 @@ class TestAuxilliaryFunctions(TestCase): def test_get_text_from_pdf_error(self): text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) - self.assertEqual(text.strip(), "") - - def test_image_to_string(self): - text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) - - self.assertEqual(text, "This is a test document.") - - def test_image_to_string_language_unavailable(self): - try: - image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) - except OCRError as e: - self.assertTrue("Failed loading language" in str(e)) - else: - self.fail("Should raise exception") - - @override_settings(OCR_ALWAYS=False) - @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") - 
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") - def test_is_ocred(self, m2, m): - parser = RasterisedDocumentParser("", uuid.uuid4()) - m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ - "lots of text lots of text lots of text lots of text lots of text lots of text " \ - "lots of text lots of text lots of text lots of text lots of text lots of text " - parser.get_text() - self.assertEqual(m.call_count, 2) - self.assertEqual(m2.call_count, 0) + self.assertIsNone(text) def test_thumbnail(self): parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) From 65be22580ee3d23ceaa65d63e4c460e9b9ff7477 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:18:57 +0100 Subject: [PATCH 007/121] Add metadata field: has archive version --- src/documents/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/documents/views.py b/src/documents/views.py index 4d62ae5c4..381eba3e1 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -169,6 +169,7 @@ class DocumentViewSet(RetrieveModelMixin, "paperless__checksum": doc.checksum, "paperless__mime_type": doc.mime_type, "paperless__filename": doc.filename, + "paperless__has_archive_version": os.path.isfile(doc.archive_path) }) except Document.DoesNotExist: raise Http404() From a5b549a7f9d4830dd0b92e583ae868eaace594ce Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:22:51 +0100 Subject: [PATCH 008/121] update git ignore --- .gitignore | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 4ae903ade..d63794fb3 100644 --- a/.gitignore +++ b/.gitignore @@ -76,16 +76,11 @@ scripts/nuke /static/ # Stored PDFs -/media/documents/originals/* -/media/documents/thumbnails/* - -/data/classification_model.pickle -/data/db.sqlite3 -/data/index - +/media/ +/data/ /paperless.conf -/consume -/export +/consume/ 
+/export/ /src-ui/.vscode # this is where the compiled frontend is moved to. From 5bb7514b8b38d783888310f357c6e0c28f60b659 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:23:57 +0100 Subject: [PATCH 009/121] codestyle --- src/documents/views.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/documents/views.py b/src/documents/views.py index 381eba3e1..87d1d31b1 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -138,7 +138,7 @@ class DocumentViewSet(RetrieveModelMixin, def file_response(self, pk, request, disposition): doc = Document.objects.get(id=pk) mime_type = doc.mime_type - if not self.original_requested(request) and os.path.isfile(doc.archive_path): + if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501 file_handle = doc.archive_file mime_type = 'application/pdf' elif doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: @@ -169,7 +169,8 @@ class DocumentViewSet(RetrieveModelMixin, "paperless__checksum": doc.checksum, "paperless__mime_type": doc.mime_type, "paperless__filename": doc.filename, - "paperless__has_archive_version": os.path.isfile(doc.archive_path) + "paperless__has_archive_version": + os.path.isfile(doc.archive_path) }) except Document.DoesNotExist: raise Http404() From f215f02afb9a4214744457ba5c8779e70c5d9a20 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:28:49 +0100 Subject: [PATCH 010/121] fixed up a test case --- src/documents/tests/test_document_model.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py index 5b27e2643..49e7139c8 100644 --- a/src/documents/tests/test_document_model.py +++ b/src/documents/tests/test_document_model.py @@ -1,12 +1,29 @@ +import os +import shutil +import tempfile +from pathlib import Path from unittest import mock -from django.test import TestCase +from 
django.test import TestCase, override_settings from ..models import Document, Correspondent class TestDocument(TestCase): + def setUp(self) -> None: + self.originals_dir = tempfile.mkdtemp() + self.thumb_dir = tempfile.mkdtemp() + + override_settings( + ORIGINALS_DIR=self.originals_dir, + THUMBNAIL_DIR=self.thumb_dir, + ).enable() + + def tearDown(self) -> None: + shutil.rmtree(self.originals_dir) + shutil.rmtree(self.thumb_dir) + def test_file_deletion(self): document = Document.objects.create( correspondent=Correspondent.objects.create(name="Test0"), @@ -19,6 +36,9 @@ class TestDocument(TestCase): file_path = document.source_path thumb_path = document.thumbnail_path + Path(file_path).touch() + Path(thumb_path).touch() + with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: document.delete() mock_unlink.assert_any_call(file_path) From 1f52981493c6ba118f5f88c87f7262f385732c98 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 18:01:29 +0100 Subject: [PATCH 011/121] proper filenames for originals and archived documents --- src/documents/models.py | 4 ++++ src/documents/views.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/documents/models.py b/src/documents/models.py index c1ab9a44d..2644657a3 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -241,6 +241,10 @@ class Document(models.Model): def file_name(self): return slugify(str(self)) + self.file_type + @property + def archive_file_name(self): + return slugify(str(self)) + ".pdf" + @property def file_type(self): return mimetypes.guess_extension(str(self.mime_type)) diff --git a/src/documents/views.py b/src/documents/views.py index 87d1d31b1..457fa9dc7 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -138,8 +138,10 @@ class DocumentViewSet(RetrieveModelMixin, def file_response(self, pk, request, disposition): doc = Document.objects.get(id=pk) mime_type = doc.mime_type + filename = doc.file_name if not 
self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501 file_handle = doc.archive_file + filename = doc.archive_file_name mime_type = 'application/pdf' elif doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: file_handle = doc.source_file @@ -148,7 +150,7 @@ class DocumentViewSet(RetrieveModelMixin, response = HttpResponse(file_handle, content_type=mime_type) response["Content-Disposition"] = '{}; filename="{}"'.format( - disposition, doc.file_name) + disposition, filename) return response @action(methods=['post'], detail=False) From 0c1606b81ae6100d07daa9301e641ad5d1d152e4 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 18:01:43 +0100 Subject: [PATCH 012/121] frontend support for downloading originals --- .../document-detail.component.html | 26 ++++++++++++++----- .../document-detail.component.ts | 7 +++++ .../app/data/paperless-document-metadata.ts | 11 ++++++++ .../src/app/services/rest/document.service.ts | 21 ++++++++++++--- 4 files changed, 55 insertions(+), 10 deletions(-) create mode 100644 src-ui/src/app/data/paperless-document-metadata.ts diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index 9e1f8ad71..474c1376d 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -5,12 +5,26 @@ Delete - - - - - Download - + + + +     diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index 7c396692e..253833792 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -37,10 +37,10 @@ export class DocumentDetailComponent implements OnInit { title: new FormControl(''), content: new 
FormControl(''), created: new FormControl(), - correspondent_id: new FormControl(), - document_type_id: new FormControl(), + correspondent: new FormControl(), + document_type: new FormControl(), archive_serial_number: new FormControl(), - tags_id: new FormControl([]) + tags: new FormControl([]) }) constructor( @@ -93,7 +93,7 @@ export class DocumentDetailComponent implements OnInit { modal.componentInstance.success.subscribe(newDocumentType => { this.documentTypeService.listAll().subscribe(documentTypes => { this.documentTypes = documentTypes.results - this.documentForm.get('document_type_id').setValue(newDocumentType.id) + this.documentForm.get('document_type').setValue(newDocumentType.id) }) }) } @@ -104,7 +104,7 @@ export class DocumentDetailComponent implements OnInit { modal.componentInstance.success.subscribe(newCorrespondent => { this.correspondentService.listAll().subscribe(correspondents => { this.correspondents = correspondents.results - this.documentForm.get('correspondent_id').setValue(newCorrespondent.id) + this.documentForm.get('correspondent').setValue(newCorrespondent.id) }) }) } diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html index 4e86b6ddc..cf821b643 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html @@ -8,12 +8,12 @@
- - {{document.correspondent.name}} - {{document.correspondent.name}}: + + {{document.correspondent_object.name}} + {{document.correspondent_object.name}}: {{document.title}} - +
#{{document.archive_serial_number}}
diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts index 4a44909ec..ac2fdba27 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.ts @@ -20,10 +20,10 @@ export class DocumentCardLargeComponent implements OnInit { details: any @Output() - clickTag = new EventEmitter() + clickTag = new EventEmitter() @Output() - clickCorrespondent = new EventEmitter() + clickCorrespondent = new EventEmitter() ngOnInit(): void { } diff --git a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html index 4da5cdf9b..4ab48d5e6 100644 --- a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html +++ b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html @@ -1,15 +1,15 @@
-
- +
+

- - {{document.correspondent.name}}: + + {{document.correspondent_object.name}}: {{document.title}}

diff --git a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts index 2c0ca8dfb..08202bfc9 100644 --- a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts +++ b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.ts @@ -16,10 +16,10 @@ export class DocumentCardSmallComponent implements OnInit { document: PaperlessDocument @Output() - clickTag = new EventEmitter() + clickTag = new EventEmitter() @Output() - clickCorrespondent = new EventEmitter() + clickCorrespondent = new EventEmitter() ngOnInit(): void { } diff --git a/src-ui/src/app/components/document-list/document-list.component.html b/src-ui/src/app/components/document-list/document-list.component.html index 48387b3e3..af7a049c7 100644 --- a/src-ui/src/app/components/document-list/document-list.component.html +++ b/src-ui/src/app/components/document-list/document-list.component.html @@ -100,17 +100,17 @@ {{d.archive_serial_number}} - - {{d.correspondent.name}} + + {{d.correspondent_object.name}} {{d.title}} - + - - {{d.document_type.name}} + + {{d.document_type_object.name}} diff --git a/src-ui/src/app/components/document-list/document-list.component.ts b/src-ui/src/app/components/document-list/document-list.component.ts index 153b31d8c..c3550a856 100644 --- a/src-ui/src/app/components/document-list/document-list.component.ts +++ b/src-ui/src/app/components/document-list/document-list.component.ts @@ -95,40 +95,40 @@ export class DocumentListComponent implements OnInit { }) } - filterByTag(t: PaperlessTag) { + filterByTag(tag_id: number) { let filterRules = this.list.filterRules - if (filterRules.find(rule => rule.type.id == FILTER_HAS_TAG && rule.value == t.id)) { + if (filterRules.find(rule => rule.type.id == FILTER_HAS_TAG && rule.value == tag_id)) { return } - filterRules.push({type: 
FILTER_RULE_TYPES.find(t => t.id == FILTER_HAS_TAG), value: t.id}) + filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_HAS_TAG), value: tag_id}) this.filterRules = filterRules this.applyFilterRules() } - filterByCorrespondent(c: PaperlessCorrespondent) { + filterByCorrespondent(correspondent_id: number) { let filterRules = this.list.filterRules let existing_rule = filterRules.find(rule => rule.type.id == FILTER_CORRESPONDENT) - if (existing_rule && existing_rule.value == c.id) { + if (existing_rule && existing_rule.value == correspondent_id) { return } else if (existing_rule) { - existing_rule.value = c.id + existing_rule.value = correspondent_id } else { - filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_CORRESPONDENT), value: c.id}) + filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_CORRESPONDENT), value: correspondent_id}) } this.filterRules = filterRules this.applyFilterRules() } - filterByDocumentType(dt: PaperlessDocumentType) { + filterByDocumentType(document_type_id: number) { let filterRules = this.list.filterRules let existing_rule = filterRules.find(rule => rule.type.id == FILTER_DOCUMENT_TYPE) - if (existing_rule && existing_rule.value == dt.id) { + if (existing_rule && existing_rule.value == document_type_id) { return } else if (existing_rule) { - existing_rule.value = dt.id + existing_rule.value = document_type_id } else { - filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_DOCUMENT_TYPE), value: dt.id}) + filterRules.push({type: FILTER_RULE_TYPES.find(t => t.id == FILTER_DOCUMENT_TYPE), value: document_type_id}) } this.filterRules = filterRules this.applyFilterRules() diff --git a/src-ui/src/app/data/paperless-document.ts b/src-ui/src/app/data/paperless-document.ts index 31a24bcad..b69a35495 100644 --- a/src-ui/src/app/data/paperless-document.ts +++ b/src-ui/src/app/data/paperless-document.ts @@ -5,13 +5,13 @@ import { PaperlessDocumentType } from './paperless-document-type' export 
interface PaperlessDocument extends ObjectWithId { - correspondent?: PaperlessCorrespondent + correspondent_object?: PaperlessCorrespondent - correspondent_id?: number + correspondent?: number - document_type?: PaperlessDocumentType + document_type_object?: PaperlessDocumentType - document_type_id?: number + document_type?: number title?: string @@ -19,9 +19,9 @@ export interface PaperlessDocument extends ObjectWithId { file_type?: string - tags?: PaperlessTag[] + tags_objects?: PaperlessTag[] - tags_id?: number[] + tags?: number[] checksum?: string From 3f7d2b65eab353cf5451971d168fe0be7a7d91ec Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 01:17:55 +0100 Subject: [PATCH 093/121] remove _object from document results, which makes the API about 33% faster. --- src/documents/serialisers.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index e6f278f93..973ed2ae5 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -83,25 +83,16 @@ class DocumentSerializer(serializers.ModelSerializer): tags = TagsField(many=True) document_type = DocumentTypeField(allow_null=True) - correspondent_object = TagSerializer( - read_only=True, source="correspondent") - document_type_object = TagSerializer( - read_only=True, source="document_type") - tags_objects = TagSerializer(many=True, read_only=True, source="tags") - class Meta: model = Document depth = 1 fields = ( "id", "correspondent", - "correspondent_object", "document_type", - "document_type_object", "title", "content", "tags", - "tags_objects", "created", "modified", "added", From ea8c8702a4f54ccd0e85624fe228accca43800e1 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 01:18:06 +0100 Subject: [PATCH 094/121] fix a test case. 
--- src/documents/tests/test_api.py | 11 ++--------- src/documents/tests/test_post_consume_handlers.py | 3 +-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index adfce313f..70b8bb9eb 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -41,15 +41,8 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): returned_doc = response.data['results'][0] self.assertEqual(returned_doc['id'], doc.id) self.assertEqual(returned_doc['title'], doc.title) - self.assertEqual(returned_doc['correspondent_object']['name'], c.name) - self.assertEqual(returned_doc['document_type_object']['name'], dt.name) - self.assertEqual(returned_doc['correspondent_object']['id'], c.id) - self.assertEqual(returned_doc['document_type_object']['id'], dt.id) - self.assertEqual(returned_doc['correspondent_object']['id'], returned_doc['correspondent']) - self.assertEqual(returned_doc['document_type_object']['id'], returned_doc['document_type']) - self.assertEqual(len(returned_doc['tags']), 1) - self.assertEqual(returned_doc['tags_objects'][0]['name'], tag.name) - self.assertEqual(returned_doc['tags_objects'][0]['id'], tag.id) + self.assertEqual(returned_doc['correspondent'], c.id) + self.assertEqual(returned_doc['document_type'], dt.id) self.assertListEqual(returned_doc['tags'], [tag.id]) c2 = Correspondent.objects.create(name="c2") diff --git a/src/documents/tests/test_post_consume_handlers.py b/src/documents/tests/test_post_consume_handlers.py index fb4c9fc12..b4357448c 100644 --- a/src/documents/tests/test_post_consume_handlers.py +++ b/src/documents/tests/test_post_consume_handlers.py @@ -53,5 +53,4 @@ class PostConsumeTestCase(TestCase): self.assertEqual(command[5], f"/api/documents/{doc.pk}/download/") self.assertEqual(command[6], f"/api/documents/{doc.pk}/thumb/") self.assertEqual(command[7], "my_bank") - # TODO: tags are unordered by default. 
- self.assertEqual(command[8], "a,b") + self.assertCountEqual(command[8].split(","), ["a", "b"]) From 1c23e10c396af0d42f79a38fe7426b2bdbf6a081 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 01:22:14 +0100 Subject: [PATCH 095/121] caching for listAll methods --- .../rest/abstract-paperless-service.ts | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src-ui/src/app/services/rest/abstract-paperless-service.ts b/src-ui/src/app/services/rest/abstract-paperless-service.ts index 16064c702..3feed320e 100644 --- a/src-ui/src/app/services/rest/abstract-paperless-service.ts +++ b/src-ui/src/app/services/rest/abstract-paperless-service.ts @@ -1,5 +1,6 @@ import { HttpClient, HttpParams } from '@angular/common/http' -import { Observable } from 'rxjs' +import { Observable, of, Subject } from 'rxjs' +import { map, publishReplay, refCount } from 'rxjs/operators' import { ObjectWithId } from 'src/app/data/object-with-id' import { Results } from 'src/app/data/results' import { environment } from 'src/environments/environment' @@ -51,8 +52,28 @@ export abstract class AbstractPaperlessService { return this.http.get>(this.getResourceUrl(), {params: httpParams}) } + private _listAll: Observable> + listAll(ordering?: string, extraParams?): Observable> { - return this.list(1, 100000, ordering, extraParams) + if (!this._listAll) { + this._listAll = this.list(1, 100000, ordering, extraParams).pipe( + publishReplay(1), + refCount() + ) + } + return this._listAll + } + + getCached(id: number): Observable { + return this.listAll().pipe( + map(list => list.results.find(o => o.id == id)) + ) + } + + getCachedMany(ids: number[]): Observable { + return this.listAll().pipe( + map(list => ids.map(id => list.results.find(o => o.id == id))) + ) } get(id: number): Observable { @@ -60,14 +81,17 @@ export abstract class AbstractPaperlessService { } create(o: T): Observable { + this._listAll = null return this.http.post(this.getResourceUrl(), o) } 
delete(o: T): Observable { + this._listAll = null return this.http.delete(this.getResourceUrl(o.id)) } update(o: T): Observable { + this._listAll = null return this.http.put(this.getResourceUrl(o.id), o) } } \ No newline at end of file From e10b4b02e1fa212f0e018b850cbd6ffb5dae018f Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 01:24:07 +0100 Subject: [PATCH 096/121] document service adds observables for linked data to its results --- src-ui/src/app/data/paperless-document.ts | 7 ++--- .../src/app/services/rest/document.service.ts | 26 +++++++++++++++++-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src-ui/src/app/data/paperless-document.ts b/src-ui/src/app/data/paperless-document.ts index b69a35495..9d0aeda88 100644 --- a/src-ui/src/app/data/paperless-document.ts +++ b/src-ui/src/app/data/paperless-document.ts @@ -2,14 +2,15 @@ import { PaperlessCorrespondent } from './paperless-correspondent' import { ObjectWithId } from './object-with-id' import { PaperlessTag } from './paperless-tag' import { PaperlessDocumentType } from './paperless-document-type' +import { Observable } from 'rxjs' export interface PaperlessDocument extends ObjectWithId { - correspondent_object?: PaperlessCorrespondent + correspondent$?: Observable correspondent?: number - document_type_object?: PaperlessDocumentType + document_type$?: Observable document_type?: number @@ -19,7 +20,7 @@ export interface PaperlessDocument extends ObjectWithId { file_type?: string - tags_objects?: PaperlessTag[] + tags$?: Observable tags?: number[] diff --git a/src-ui/src/app/services/rest/document.service.ts b/src-ui/src/app/services/rest/document.service.ts index e27dbeab3..5bf2308d4 100644 --- a/src-ui/src/app/services/rest/document.service.ts +++ b/src-ui/src/app/services/rest/document.service.ts @@ -6,6 +6,10 @@ import { HttpClient } from '@angular/common/http'; import { Observable } from 'rxjs'; import { Results } from 'src/app/data/results'; import { FilterRule } from 
'src/app/data/filter-rule'; +import { map } from 'rxjs/operators'; +import { CorrespondentService } from './correspondent.service'; +import { DocumentTypeService } from './document-type.service'; +import { TagService } from './tag.service'; export const DOCUMENT_SORT_FIELDS = [ @@ -27,7 +31,7 @@ export const SORT_DIRECTION_DESCENDING = "des" }) export class DocumentService extends AbstractPaperlessService { - constructor(http: HttpClient) { + constructor(http: HttpClient, private correspondentService: CorrespondentService, private documentTypeService: DocumentTypeService, private tagService: TagService) { super(http, 'documents') } @@ -47,8 +51,26 @@ export class DocumentService extends AbstractPaperlessService } } + addObservablesToDocument(doc: PaperlessDocument) { + if (doc.correspondent) { + doc.correspondent$ = this.correspondentService.getCached(doc.correspondent) + } + if (doc.document_type) { + doc.document_type$ = this.documentTypeService.getCached(doc.document_type) + } + if (doc.tags) { + doc.tags$ = this.tagService.getCachedMany(doc.tags) + } + return doc + } + list(page?: number, pageSize?: number, sortField?: string, sortDirection?: string, filterRules?: FilterRule[]): Observable> { - return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)) + return super.list(page, pageSize, sortField, sortDirection, this.filterRulesToQueryParams(filterRules)).pipe( + map(results => { + results.results.forEach(doc => this.addObservablesToDocument(doc)) + return results + }) + ) } getPreviewUrl(id: number, original: boolean = false): string { From e40fdd469d9a5c3aaf7c72129bbd6bc9fd355f63 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 01:25:52 +0100 Subject: [PATCH 097/121] use the observables everywhere in the application. 
--- .../saved-view-widget/saved-view-widget.component.html | 2 +- .../document-card-large.component.html | 8 ++++---- .../document-card-small.component.html | 6 +++--- .../document-list/document-list.component.html | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.html b/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.html index a444474ea..e63ecc47b 100644 --- a/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.html +++ b/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.html @@ -13,7 +13,7 @@ {{doc.created | date}} - {{doc.title}} + {{doc.title}} diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html index cf821b643..bfc59b526 100644 --- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html +++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html @@ -8,12 +8,12 @@
- - {{document.correspondent_object.name}} - {{document.correspondent_object.name}}: + + {{(document.correspondent$ | async)?.name}} + {{(document.correspondent$ | async)?.name}}: {{document.title}} - +
#{{document.archive_serial_number}}
diff --git a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html index 4ab48d5e6..71a7fb01a 100644 --- a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html +++ b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html @@ -1,15 +1,15 @@
-
+

- - {{document.correspondent_object.name}}: + + {{(document.correspondent$ | async)?.name}}: {{document.title}}

diff --git a/src-ui/src/app/components/document-list/document-list.component.html b/src-ui/src/app/components/document-list/document-list.component.html index af7a049c7..cebe7c544 100644 --- a/src-ui/src/app/components/document-list/document-list.component.html +++ b/src-ui/src/app/components/document-list/document-list.component.html @@ -100,17 +100,17 @@ {{d.archive_serial_number}} - - {{d.correspondent_object.name}} + + {{(d.correspondent$ | async)?.name}} {{d.title}} - + - - {{d.document_type_object.name}} + + {{(d.document_type$ | async)?.name}} From 5765b84f3ec6d42cb31045ee5ea61d96c865ca9e Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 01:26:12 +0100 Subject: [PATCH 098/121] bugfix --- .../src/app/components/document-list/document-list.component.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src-ui/src/app/components/document-list/document-list.component.ts b/src-ui/src/app/components/document-list/document-list.component.ts index c3550a856..3a4f17196 100644 --- a/src-ui/src/app/components/document-list/document-list.component.ts +++ b/src-ui/src/app/components/document-list/document-list.component.ts @@ -49,13 +49,13 @@ export class DocumentListComponent implements OnInit { this.displayMode = localStorage.getItem('document-list:displayMode') } this.route.paramMap.subscribe(params => { + this.filterRules = this.list.filterRules if (params.has('id')) { this.list.savedView = this.savedViewConfigService.getConfig(params.get('id')) } else { this.list.savedView = null this.showFilter = this.filterRules.length > 0 } - this.filterRules = this.list.filterRules this.list.clear() this.list.reload() }) From 834ccb60e8c93e4068aa6408e7436a83e5d0981a Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 01:26:27 +0100 Subject: [PATCH 099/121] add observables to search results --- src-ui/src/app/services/rest/search.service.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git 
a/src-ui/src/app/services/rest/search.service.ts b/src-ui/src/app/services/rest/search.service.ts index 2da5f9a08..b19a55769 100644 --- a/src-ui/src/app/services/rest/search.service.ts +++ b/src-ui/src/app/services/rest/search.service.ts @@ -1,9 +1,11 @@ import { HttpClient, HttpParams } from '@angular/common/http'; import { Injectable } from '@angular/core'; import { Observable } from 'rxjs'; +import { map } from 'rxjs/operators'; import { PaperlessDocument } from 'src/app/data/paperless-document'; import { SearchResult } from 'src/app/data/search-result'; import { environment } from 'src/environments/environment'; +import { DocumentService } from './document.service'; @Injectable({ @@ -11,14 +13,19 @@ import { environment } from 'src/environments/environment'; }) export class SearchService { - constructor(private http: HttpClient) { } + constructor(private http: HttpClient, private documentService: DocumentService) { } search(query: string, page?: number): Observable { let httpParams = new HttpParams().set('query', query) if (page) { httpParams = httpParams.set('page', page.toString()) } - return this.http.get(`${environment.apiBaseUrl}search/`, {params: httpParams}) + return this.http.get(`${environment.apiBaseUrl}search/`, {params: httpParams}).pipe( + map(result => { + result.results.forEach(hit => this.documentService.addObservablesToDocument(hit.document)) + return result + }) + ) } autocomplete(term: string): Observable { From 75bdeefa18ebf0dcb635e74f6bec7233fd3e02d6 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 11:12:59 +0100 Subject: [PATCH 100/121] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 36c365f17..45427ef66 100644 --- a/README.md +++ b/README.md @@ -55,10 +55,12 @@ For a complete list of changes from paperless, check out the [changelog](https:/ ## Roadmap for versions beyond 1.0 +These are things that I want to add to paperless eventually. 
They are sorted by priority. + +- **Bulk editing**. Add/remove metadata from multiple documents at once. - **More search.** The search backend is incredibly versatile and customizable. Searching is the most important feature of this project and thus, I want to implement things like: - Group and limit search results by correspondent, show “more from this” links in the results. - Ability to search for “Similar documents” in the search results -- **Bulk editing**. Add/remove metadata from multiple documents at once. - **Nested tags**. Organize tags in a hierarchical structure. This will combine the benefits of folders and tags in one coherent system. - **An interactive consumer** that shows its progress for documents it processes on the web page. - With live updates ans websockets. This already works on a dev branch, but requires a lot of new dependencies, which I'm not particular happy about. From d888e1b224c6d5e3e69963254b4458df3c628b98 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 12:09:21 +0100 Subject: [PATCH 101/121] API now supports setting metadata when POSTing documents. --- docs/api.rst | 4 +-- docs/changelog.rst | 14 +++++++--- src/documents/serialisers.py | 27 ++++++++++--------- src/documents/tests/test_api.py | 46 +++++++++++++++++---------------- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 523ca1b45..4c9ae0b13 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -216,9 +216,7 @@ be instructed to consume the document from there. The endpoint supports the following optional form fields: * ``title``: Specify a title that the consumer should use for the document. -* ``correspondent``: Specify a correspondent that the consumer should use for the document. - Case sensitive. If the specified correspondent does not exist, it will be created with this - name and default settings. +* ``correspondent``: Specify the ID of a correspondent that the consumer should use for the document. 
* ``document_type``: Similar to correspondent. * ``tags``: Similar to correspondent. Specify this multiple times to have multiple tags added to the document. diff --git a/docs/changelog.rst b/docs/changelog.rst index 9ccc7bd6a..40b45d1b1 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,6 +8,9 @@ Changelog paperless-ng 0.9.5 ################## +Apart from the API, this finalizes the changes I wanted to get into paperless before 1.0. The next releases will +focus on fixing bugs, minor changes to the UI, and possibly some changes to the API. + * OCR * Paperless now uses `OCRmyPDF `_ to perform OCR on documents. @@ -33,10 +36,15 @@ paperless-ng 0.9.5 * The endpoint for uploading documents now supports specifying custom titles, correspondents, tags and types. This can be used by clients to override the default behavior of paperless. * The document endpoint of API now serves document in this form: - * correspondents, document types and tags are referenced by their ID in the fields ``correspondent``, ``document_type`` and ``tags``. The ``*_id`` versions are gone. These fields are read/write. - * in addition to that, ``*_object`` fields serve nested objects. Read only. Don't rely on these, they will probably get removed once I figure out how to better handle asynchronous data in the front end. -* Some minor improvements to the front end, such as document count in the document list, better visibility of the current view, and improvements to the filter behavior. + * correspondents, document types and tags are referenced by their ID in the fields ``correspondent``, ``document_type`` and ``tags``. The ``*_id`` versions are gone. These fields are read/write. + * paperless does not serve nested tags, correspondents or types anymore. + +* Front end + + * Paperless does some basic caching of correspondents, tags and types and will only request them from the server when necessary or when entirely reloading the page. 
+ * Document lists should be somewhat faster now, especially when lots of tags/correspondents where present. + * Some minor improvements to the front end, such as document count in the document list, better highlighting of the current page, and improvements to the filter behavior. * Fixes: diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 973ed2ae5..c988b2137 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -126,22 +126,26 @@ class PostDocumentSerializer(serializers.Serializer): required=False, ) - correspondent = serializers.CharField( + correspondent = serializers.PrimaryKeyRelatedField( + queryset=Correspondent.objects.all(), label="Correspondent", + allow_null=True, write_only=True, required=False, ) - document_type = serializers.CharField( + document_type = serializers.PrimaryKeyRelatedField( + queryset=DocumentType.objects.all(), label="Document type", + allow_null=True, write_only=True, required=False, ) - tags = serializers.ListField( - child=serializers.CharField(), + tags = serializers.PrimaryKeyRelatedField( + many=True, + queryset=Tag.objects.all(), label="Tags", - source="tag", write_only=True, required=False, ) @@ -170,24 +174,19 @@ class PostDocumentSerializer(serializers.Serializer): correspondent = attrs.get('correspondent') if correspondent: - c, _ = Correspondent.objects.get_or_create(name=correspondent) - attrs['correspondent_id'] = c.id + attrs['correspondent_id'] = correspondent.id else: attrs['correspondent_id'] = None document_type = attrs.get('document_type') if document_type: - dt, _ = DocumentType.objects.get_or_create(name=document_type) - attrs['document_type_id'] = dt.id + attrs['document_type_id'] = document_type.id else: attrs['document_type_id'] = None - tags = attrs.get('tag') + tags = attrs.get('tags') if tags: - tag_ids = [] - for tag in tags: - tag, _ = Tag.objects.get_or_create(name=tag) - tag_ids.append(tag.id) + tag_ids = [tag.id for tag in tags] attrs['tag_ids'] = 
tag_ids else: attrs['tag_ids'] = None diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index 70b8bb9eb..b900ee653 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -410,7 +410,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): def test_upload_with_correspondent(self, async_task): c = Correspondent.objects.create(name="test-corres") with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: - response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": "test-corres"}) + response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": c.id}) self.assertEqual(response.status_code, 200) async_task.assert_called_once() @@ -420,23 +420,18 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(kwargs['override_correspondent_id'], c.id) @mock.patch("documents.views.async_task") - def test_upload_with_new_correspondent(self, async_task): + def test_upload_with_invalid_correspondent(self, async_task): with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: - response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": "test-corres2"}) - self.assertEqual(response.status_code, 200) + response = self.client.post("/api/documents/post_document/", {"document": f, "correspondent": 3456}) + self.assertEqual(response.status_code, 400) - async_task.assert_called_once() - - args, kwargs = async_task.call_args - - c = Correspondent.objects.get(name="test-corres2") - self.assertEqual(kwargs['override_correspondent_id'], c.id) + async_task.assert_not_called() @mock.patch("documents.views.async_task") def test_upload_with_document_type(self, async_task): dt = DocumentType.objects.create(name="invoice") with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: - response = 
self.client.post("/api/documents/post_document/", {"document": f, "document_type": "invoice"}) + response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": dt.id}) self.assertEqual(response.status_code, 200) async_task.assert_called_once() @@ -446,30 +441,37 @@ class TestDocumentApi(DirectoriesMixin, APITestCase): self.assertEqual(kwargs['override_document_type_id'], dt.id) @mock.patch("documents.views.async_task") - def test_upload_with_new_document_type(self, async_task): + def test_upload_with_invalid_document_type(self, async_task): with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: - response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": "invoice2"}) - self.assertEqual(response.status_code, 200) + response = self.client.post("/api/documents/post_document/", {"document": f, "document_type": 34578}) + self.assertEqual(response.status_code, 400) - async_task.assert_called_once() - - args, kwargs = async_task.call_args - - dt = DocumentType.objects.get(name="invoice2") - self.assertEqual(kwargs['override_document_type_id'], dt.id) + async_task.assert_not_called() @mock.patch("documents.views.async_task") def test_upload_with_tags(self, async_task): t1 = Tag.objects.create(name="tag1") + t2 = Tag.objects.create(name="tag2") with open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: response = self.client.post( "/api/documents/post_document/", - {"document": f, "tags": ["tag1", "tag2"]}) + {"document": f, "tags": [t2.id, t1.id]}) self.assertEqual(response.status_code, 200) async_task.assert_called_once() args, kwargs = async_task.call_args - t2 = Tag.objects.get(name="tag2") self.assertCountEqual(kwargs['override_tag_ids'], [t1.id, t2.id]) + + @mock.patch("documents.views.async_task") + def test_upload_with_invalid_tags(self, async_task): + t1 = Tag.objects.create(name="tag1") + t2 = Tag.objects.create(name="tag2") + with 
open(os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"), "rb") as f: + response = self.client.post( + "/api/documents/post_document/", + {"document": f, "tags": [t2.id, t1.id, 734563]}) + self.assertEqual(response.status_code, 400) + + async_task.assert_not_called() From 884eec9b611c15771991c0f26c3978f7e0c7d3c9 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 12:44:02 +0100 Subject: [PATCH 102/121] disabled thumbnail trimming. --- src/paperless_tesseract/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 66001286c..454617728 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -31,7 +31,7 @@ class RasterisedDocumentParser(DocumentParser): scale="500x5000>", alpha="remove", strip=True, - trim=True, + trim=False, input_file="{}[0]".format(document_path), output_file=out_path, logging_group=self.logging_group) @@ -55,7 +55,7 @@ class RasterisedDocumentParser(DocumentParser): scale="500x5000>", alpha="remove", strip=True, - trim=True, + trim=False, input_file=gs_out_path, output_file=out_path, logging_group=self.logging_group) From 249422570ebc88b9d76a981839a26757c8f2040c Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 15:42:05 +0100 Subject: [PATCH 103/121] mail handling: When exceptions occur during account/rule/message handling, paperless will continue with the next account/rule/message. 
mail handling: When paperless encounters a very long fixes #82 --- src/documents/loggers.py | 4 +- src/paperless/settings.py | 2 +- src/paperless_mail/mail.py | 230 ++++++++++++++------------ src/paperless_mail/models.py | 2 +- src/paperless_mail/tasks.py | 14 +- src/paperless_mail/tests/test_mail.py | 85 +++++++--- 6 files changed, 197 insertions(+), 140 deletions(-) diff --git a/src/documents/loggers.py b/src/documents/loggers.py index 76dbe0163..863bc0c34 100644 --- a/src/documents/loggers.py +++ b/src/documents/loggers.py @@ -28,10 +28,10 @@ class LoggingMixin: def renew_logging_group(self): self.logging_group = uuid.uuid4() - def log(self, level, message): + def log(self, level, message, **kwargs): target = ".".join([self.__class__.__module__, self.__class__.__name__]) logger = logging.getLogger(target) getattr(logger, level)(message, extra={ "group": self.logging_group - }) + }, **kwargs) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 88915c7c5..c7ecf7645 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -274,7 +274,7 @@ LOGGING = { "class": "documents.loggers.PaperlessHandler", }, "console": { - "level": "WARNING", + "level": "INFO", "class": "logging.StreamHandler", "formatter": "verbose", } diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index 1ce4fe825..08f7365da 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -4,6 +4,7 @@ from datetime import timedelta, date import magic from django.conf import settings +from django.db import DatabaseError from django.utils.text import slugify from django_q.tasks import async_task from imap_tools import MailBox, MailBoxUnencrypted, AND, MailMessageFlags, \ @@ -86,46 +87,6 @@ def make_criterias(rule): return {**criterias, **get_rule_action(rule).get_criteria()} -def get_title(message, att, rule): - if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: - title = message.subject - elif rule.assign_title_from == 
MailRule.TITLE_FROM_FILENAME: - title = os.path.splitext(os.path.basename(att.filename))[0] - else: - raise ValueError("Unknown title selector.") - - return title - - -def get_correspondent(message, rule): - if rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NOTHING: - correspondent = None - elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_EMAIL: - correspondent_name = message.from_ - correspondent = Correspondent.objects.get_or_create( - name=correspondent_name, defaults={ - "slug": slugify(correspondent_name) - })[0] - elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_NAME: - if message.from_values and \ - 'name' in message.from_values \ - and message.from_values['name']: - correspondent_name = message.from_values['name'] - else: - correspondent_name = message.from_ - - correspondent = Correspondent.objects.get_or_create( - name=correspondent_name, defaults={ - "slug": slugify(correspondent_name) - })[0] - elif rule.assign_correspondent_from == MailRule.CORRESPONDENT_FROM_CUSTOM: - correspondent = rule.assign_correspondent - else: - raise ValueError("Unknwown correspondent selector") - - return correspondent - - def get_mailbox(server, port, security): if security == MailAccount.IMAP_SECURITY_NONE: mailbox = MailBoxUnencrypted(server, port) @@ -140,6 +101,51 @@ def get_mailbox(server, port, security): class MailAccountHandler(LoggingMixin): + def _correspondent_from_name(self, name): + try: + return Correspondent.objects.get_or_create( + name=name, defaults={ + "slug": slugify(name) + })[0] + except DatabaseError as e: + self.log( + "error", + f"Error while retrieving correspondent {name}: {e}" + ) + return None + + def get_title(self, message, att, rule): + if rule.assign_title_from == MailRule.TITLE_FROM_SUBJECT: + return message.subject + + elif rule.assign_title_from == MailRule.TITLE_FROM_FILENAME: + return os.path.splitext(os.path.basename(att.filename))[0] + + else: + raise ValueError("Unknown title 
selector.") + + def get_correspondent(self, message, rule): + c_from = rule.assign_correspondent_from + + if c_from == MailRule.CORRESPONDENT_FROM_NOTHING: + return None + + elif c_from == MailRule.CORRESPONDENT_FROM_EMAIL: + return self._correspondent_from_name(message.from_) + + elif c_from == MailRule.CORRESPONDENT_FROM_NAME: + if message.from_values and 'name' in message.from_values and message.from_values['name']: # NOQA: E501 + return self._correspondent_from_name( + message.from_values['name']) + else: + return self._correspondent_from_name(message.from_) + + elif c_from == MailRule.CORRESPONDENT_FROM_CUSTOM: + return rule.assign_correspondent + + else: + raise ValueError("Unknwown correspondent selector") + def handle_mail_account(self, account): self.renew_logging_group() @@ -156,79 +162,89 @@ class MailAccountHandler(LoggingMixin): M.login(account.username, account.password) except Exception: raise MailError( - f"Error while authenticating account {account.name}") + f"Error while authenticating account {account}") self.log('debug', f"Account {account}: Processing " f"{account.rules.count()} rule(s)") for rule in account.rules.order_by('order'): - self.log( - 'debug', - f"Account {account}: Processing rule {rule.name}") - - self.log( - 'debug', - f"Rule {account}.{rule}: Selecting folder {rule.folder}") - try: - M.folder.set(rule.folder) - except MailboxFolderSelectError: - raise MailError( - f"Rule {rule.name}: Folder {rule.folder} " - f"does not exist in account {account.name}") + total_processed_files += self.handle_mail_rule(M, rule) + except Exception as e: + self.log( + "error", + f"Rule {rule}: Error while processing rule: {e}", + exc_info=True + ) - criterias = make_criterias(rule) + return total_processed_files + def handle_mail_rule(self, M, rule): + + self.log( + 'debug', + f"Rule {rule}: Selecting folder {rule.folder}") + + try: + M.folder.set(rule.folder) + except MailboxFolderSelectError: + raise MailError( + f"Rule {rule}: Folder 
{rule.folder} " + f"does not exist in account {rule.account}") + + criterias = make_criterias(rule) + + self.log( + 'debug', + f"Rule {rule}: Searching folder with criteria " + f"{str(AND(**criterias))}") + + try: + messages = M.fetch(criteria=AND(**criterias), + mark_seen=False) + except Exception: + raise MailError( + f"Rule {rule}: Error while fetching folder {rule.folder}") + + post_consume_messages = [] + + mails_processed = 0 + total_processed_files = 0 + + for message in messages: + try: + processed_files = self.handle_message(message, rule) + if processed_files > 0: + post_consume_messages.append(message.uid) + + total_processed_files += processed_files + mails_processed += 1 + except Exception as e: self.log( - 'debug', - f"Rule {account}.{rule}: Searching folder with criteria " - f"{str(AND(**criterias))}") + "error", + f"Rule {rule}: Error while processing mail " + f"{message.uid}: {e}", + exc_info=True) - try: - messages = M.fetch(criteria=AND(**criterias), - mark_seen=False) - except Exception: - raise MailError( - f"Rule {rule.name}: Error while fetching folder " - f"{rule.folder} of account {account.name}") + self.log( + 'debug', + f"Rule {rule}: Processed {mails_processed} matching mail(s)") - post_consume_messages = [] + self.log( + 'debug', + f"Rule {rule}: Running mail actions on " + f"{len(post_consume_messages)} mails") - mails_processed = 0 + try: + get_rule_action(rule).post_consume( + M, + post_consume_messages, + rule.action_parameter) - for message in messages: - try: - processed_files = self.handle_message(message, rule) - except Exception: - raise MailError( - f"Rule {rule.name}: Error while processing mail " - f"{message.uid} of account {account.name}") - if processed_files > 0: - post_consume_messages.append(message.uid) - - total_processed_files += processed_files - mails_processed += 1 - - self.log( - 'debug', - f"Rule {account}.{rule}: Processed {mails_processed} " - f"matching mail(s)") - - self.log( - 'debug', - f"Rule 
{account}.{rule}: Running mail actions on " - f"{len(post_consume_messages)} mails") - - try: - get_rule_action(rule).post_consume( - M, - post_consume_messages, - rule.action_parameter) - - except Exception: - raise MailError( - f"Rule {rule.name}: Error while processing " - f"post-consume actions for account {account.name}") + except Exception as e: + raise MailError( + f"Rule {rule}: Error while processing post-consume actions: " + f"{e}") return total_processed_files @@ -238,11 +254,11 @@ class MailAccountHandler(LoggingMixin): self.log( 'debug', - f"Rule {rule.account}.{rule}: " + f"Rule {rule}: " f"Processing mail {message.subject} from {message.from_} with " f"{len(message.attachments)} attachment(s)") - correspondent = get_correspondent(message, rule) + correspondent = self.get_correspondent(message, rule) tag = rule.assign_tag doc_type = rule.assign_document_type @@ -253,12 +269,12 @@ class MailAccountHandler(LoggingMixin): if not att.content_disposition == "attachment": self.log( 'debug', - f"Rule {rule.account}.{rule}: " + f"Rule {rule}: " f"Skipping attachment {att.filename} " - f"with content disposition inline") + f"with content disposition {att.content_disposition}") continue - title = get_title(message, att, rule) + title = self.get_title(message, att, rule) # don't trust the content type of the attachment. Could be # generic application/octet-stream. 
@@ -274,7 +290,7 @@ class MailAccountHandler(LoggingMixin): self.log( 'info', - f"Rule {rule.account}.{rule}: " + f"Rule {rule}: " f"Consuming attachment {att.filename} from mail " f"{message.subject} from {message.from_}") @@ -293,7 +309,7 @@ class MailAccountHandler(LoggingMixin): else: self.log( 'debug', - f"Rule {rule.account}.{rule}: " + f"Rule {rule}: " f"Skipping attachment {att.filename} " f"since guessed mime type {mime_type} is not supported " f"by paperless") diff --git a/src/paperless_mail/models.py b/src/paperless_mail/models.py index fbcfaf980..aa1ac5684 100644 --- a/src/paperless_mail/models.py +++ b/src/paperless_mail/models.py @@ -139,4 +139,4 @@ class MailRule(models.Model): ) def __str__(self): - return self.name + return f"{self.account.name}.{self.name}" diff --git a/src/paperless_mail/tasks.py b/src/paperless_mail/tasks.py index 2eb4cbf74..68fb859a4 100644 --- a/src/paperless_mail/tasks.py +++ b/src/paperless_mail/tasks.py @@ -1,14 +1,20 @@ import logging -from paperless_mail.mail import MailAccountHandler +from paperless_mail.mail import MailAccountHandler, MailError from paperless_mail.models import MailAccount def process_mail_accounts(): total_new_documents = 0 for account in MailAccount.objects.all(): - total_new_documents += MailAccountHandler().handle_mail_account( - account) + try: + total_new_documents += MailAccountHandler().handle_mail_account( + account) + except MailError as e: + logging.getLogger(__name__).error( + f"Error while processing mail account {account}: {e}", + exc_info=True + ) if total_new_documents > 0: return f"Added {total_new_documents} document(s)." 
@@ -21,4 +27,4 @@ def process_mail_account(name): account = MailAccount.objects.get(name=name) MailAccountHandler().handle_mail_account(account) except MailAccount.DoesNotExist: - logging.error("Unknown mail acccount: {}".format(name)) + logging.getLogger(__name__).error(f"Unknown mail acccount: {name}") diff --git a/src/paperless_mail/tests/test_mail.py b/src/paperless_mail/tests/test_mail.py index 6a737cfa5..3cd3e8499 100644 --- a/src/paperless_mail/tests/test_mail.py +++ b/src/paperless_mail/tests/test_mail.py @@ -4,12 +4,13 @@ from typing import ContextManager from unittest import mock from django.core.management import call_command +from django.db import DatabaseError from django.test import TestCase from imap_tools import MailMessageFlags, MailboxFolderSelectError from documents.models import Correspondent from paperless_mail import tasks -from paperless_mail.mail import MailError, MailAccountHandler, get_correspondent, get_title +from paperless_mail.mail import MailError, MailAccountHandler from paperless_mail.models import MailRule, MailAccount @@ -165,28 +166,30 @@ class TestMail(TestCase): me_localhost = Correspondent.objects.create(name=message2.from_) someone_else = Correspondent.objects.create(name="someone else") + handler = MailAccountHandler() + rule = MailRule(name="a", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NOTHING) - self.assertIsNone(get_correspondent(message, rule)) + self.assertIsNone(handler.get_correspondent(message, rule)) rule = MailRule(name="b", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL) - c = get_correspondent(message, rule) + c = handler.get_correspondent(message, rule) self.assertIsNotNone(c) self.assertEqual(c.name, "someone@somewhere.com") - c = get_correspondent(message2, rule) + c = handler.get_correspondent(message2, rule) self.assertIsNotNone(c) self.assertEqual(c.name, "me@localhost.com") self.assertEqual(c.id, me_localhost.id) rule = MailRule(name="c", 
assign_correspondent_from=MailRule.CORRESPONDENT_FROM_NAME) - c = get_correspondent(message, rule) + c = handler.get_correspondent(message, rule) self.assertIsNotNone(c) self.assertEqual(c.name, "Someone!") - c = get_correspondent(message2, rule) + c = handler.get_correspondent(message2, rule) self.assertIsNotNone(c) self.assertEqual(c.id, me_localhost.id) rule = MailRule(name="d", assign_correspondent_from=MailRule.CORRESPONDENT_FROM_CUSTOM, assign_correspondent=someone_else) - c = get_correspondent(message, rule) + c = handler.get_correspondent(message, rule) self.assertEqual(c, someone_else) def test_get_title(self): @@ -194,10 +197,13 @@ class TestMail(TestCase): message.subject = "the message title" att = namedtuple('Attachment', []) att.filename = "this_is_the_file.pdf" + + handler = MailAccountHandler() + rule = MailRule(name="a", assign_title_from=MailRule.TITLE_FROM_FILENAME) - self.assertEqual(get_title(message, att, rule), "this_is_the_file") + self.assertEqual(handler.get_title(message, att, rule), "this_is_the_file") rule = MailRule(name="b", assign_title_from=MailRule.TITLE_FROM_SUBJECT) - self.assertEqual(get_title(message, att, rule), "the message title") + self.assertEqual(handler.get_title(message, att, rule), "the message title") def test_handle_message(self): message = create_message(subject="the message title", from_="Myself", num_attachments=2) @@ -319,7 +325,7 @@ class TestMail(TestCase): self.assertEqual(len(self.bogus_mailbox.messages), 2) self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) - def test_errors(self): + def test_error_login(self): account = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wrong") try: @@ -329,26 +335,55 @@ class TestMail(TestCase): else: self.fail("Should raise exception") + def test_error_skip_account(self): + account_faulty = MailAccount.objects.create(name="test", imap_server="", username="admin", password="wroasdng") + account = 
MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") - rule = MailRule.objects.create(name="testrule", account=account, folder="uuuh") + rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, + action_parameter="spam", filter_subject="Claim") - try: - self.mail_account_handler.handle_mail_account(account) - except MailError as e: - self.assertTrue("uuuh does not exist" in str(e)) - else: - self.fail("Should raise exception") + tasks.process_mail_accounts() + self.assertEqual(self.async_task.call_count, 1) + self.assertEqual(len(self.bogus_mailbox.messages), 2) + self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) - account = MailAccount.objects.create(name="test3", imap_server="", username="admin", password="secret") + def test_error_skip_rule(self): - rule = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, action_parameter="doesnotexist", filter_subject="Claim") + account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") + rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, + action_parameter="spam", filter_subject="Claim", order=1, folder="uuuhhhh") + rule2 = MailRule.objects.create(name="testrule2", account=account, action=MailRule.ACTION_MOVE, + action_parameter="spam", filter_subject="Claim", order=2) + + self.mail_account_handler.handle_mail_account(account) + self.assertEqual(self.async_task.call_count, 1) + self.assertEqual(len(self.bogus_mailbox.messages), 2) + self.assertEqual(len(self.bogus_mailbox.messages_spam), 1) + + + @mock.patch("paperless_mail.mail.MailAccountHandler.get_correspondent") + def test_error_skip_mail(self, m): + + def get_correspondent_fake(message, rule): + if message.from_ == 'amazon@amazon.de': + raise ValueError("Does not compute.") + else: + return None + + m.side_effect = get_correspondent_fake + + account = 
MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") + rule = MailRule.objects.create(name="testrule", account=account, action=MailRule.ACTION_MOVE, action_parameter="spam") + + self.mail_account_handler.handle_mail_account(account) + + # test that we still consume mail even if some mails throw errors. + self.assertEqual(self.async_task.call_count, 2) + + # faulty mail still in inbox, untouched + self.assertEqual(len(self.bogus_mailbox.messages), 1) + self.assertEqual(self.bogus_mailbox.messages[0].from_, 'amazon@amazon.de') - try: - self.mail_account_handler.handle_mail_account(account) - except MailError as e: - self.assertTrue("Error while processing post-consume actions" in str(e)) - else: - self.fail("Should raise exception") def test_filters(self): From 5a978134ff0f85bd63fed574f7772935f81a8e8b Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 15:56:26 +0100 Subject: [PATCH 104/121] more tests --- src/paperless_mail/tests/test_mail.py | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/paperless_mail/tests/test_mail.py b/src/paperless_mail/tests/test_mail.py index 3cd3e8499..2a391a268 100644 --- a/src/paperless_mail/tests/test_mail.py +++ b/src/paperless_mail/tests/test_mail.py @@ -384,6 +384,35 @@ class TestMail(TestCase): self.assertEqual(len(self.bogus_mailbox.messages), 1) self.assertEqual(self.bogus_mailbox.messages[0].from_, 'amazon@amazon.de') + def test_error_create_correspondent(self): + + account = MailAccount.objects.create(name="test2", imap_server="", username="admin", password="secret") + rule = MailRule.objects.create( + name="testrule", filter_from="amazon@amazon.de", + account=account, action=MailRule.ACTION_MOVE, action_parameter="spam", + assign_correspondent_from=MailRule.CORRESPONDENT_FROM_EMAIL) + + self.mail_account_handler.handle_mail_account(account) + + self.async_task.assert_called_once() + args, kwargs = self.async_task.call_args + + c = 
Correspondent.objects.get(name="amazon@amazon.de") + # should work + self.assertEquals(kwargs['override_correspondent_id'], c.id) + + self.async_task.reset_mock() + self.reset_bogus_mailbox() + + with mock.patch("paperless_mail.mail.Correspondent.objects.get_or_create") as m: + m.side_effect = DatabaseError() + + self.mail_account_handler.handle_mail_account(account) + + args, kwargs = self.async_task.call_args + self.async_task.assert_called_once() + self.assertEquals(kwargs['override_correspondent_id'], None) + def test_filters(self): From 3850149ebbe3e7d8664b6c5c18a7524621a9de3a Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 16:07:31 +0100 Subject: [PATCH 105/121] documentation --- docs/administration.rst | 10 ++++++++-- docs/changelog.rst | 15 ++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/docs/administration.rst b/docs/administration.rst index 2acae86f0..001d608e1 100644 --- a/docs/administration.rst +++ b/docs/administration.rst @@ -349,10 +349,11 @@ This command creates PDF/A documents for your documents. .. code:: - document_archiver --overwrite + document_archiver --overwrite --document This command will only attempt to create archived documents when no archived -document exists yet, unless ``--overwrite`` is specified. +document exists yet, unless ``--overwrite`` is specified. If ``--document `` +is specified, the archiver will only process that document. .. note:: @@ -362,6 +363,11 @@ document exists yet, unless ``--overwrite`` is specified. at any time, since this command will skip already archived versions the next time it is run. +.. note:: + + Some documents will cause errors and cannot be converted into PDF/A documents, + such as encrypted PDF documents. The archiver will skip over these Documents + each time it sees them. .. 
_utilities-encyption: diff --git a/docs/changelog.rst b/docs/changelog.rst index 40b45d1b1..d5c48b2dc 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,15 +8,14 @@ Changelog paperless-ng 0.9.5 ################## -Apart from the API, this finalizes the changes I wanted to get into paperless before 1.0. The next releases will -focus on fixing bugs, minor changes to the UI, and possibly some changes to the API. - * OCR * Paperless now uses `OCRmyPDF `_ to perform OCR on documents. + It still uses tesseract under the hood, but the PDF parser of Paperless has changed considerably and + will behave different for some douments. * OCRmyPDF creates archived PDF/A documents with embedded text that can be selected in the front end. * Paperless stores archived versions of documents alongside with the originals. The originals can be - accessed on the document edit page, if available. + accessed on the document edit page. If available, a dropdown menu will appear next to the download button. * Many of the configuration options regarding OCR have changed. See :ref:`configuration-ocr` for details. * Paperless no longer guesses the language of your documents. It always uses the language that you specified with ``PAPERLESS_OCR_LANGUAGE``. Be sure to set this to the language the majority of your @@ -34,8 +33,8 @@ focus on fixing bugs, minor changes to the UI, and possibly some changes to the * The API now offers token authentication. * The endpoint for uploading documents now supports specifying custom titles, correspondents, tags and types. - This can be used by clients to override the default behavior of paperless. - * The document endpoint of API now serves document in this form: + This can be used by clients to override the default behavior of paperless. See :ref:`api-file_uploads`. 
+ * The document endpoint of API now serves documents in this form: * correspondents, document types and tags are referenced by their ID in the fields ``correspondent``, ``document_type`` and ``tags``. The ``*_id`` versions are gone. These fields are read/write. * paperless does not serve nested tags, correspondents or types anymore. @@ -43,13 +42,15 @@ focus on fixing bugs, minor changes to the UI, and possibly some changes to the * Front end * Paperless does some basic caching of correspondents, tags and types and will only request them from the server when necessary or when entirely reloading the page. - * Document lists should be somewhat faster now, especially when lots of tags/correspondents where present. + * Document list fetching is about 10%-30% faster now, especially when lots of tags/correspondents are present. * Some minor improvements to the front end, such as document count in the document list, better highlighting of the current page, and improvements to the filter behavior. * Fixes: * A bug with the generation of filenames for files with unsupported types caused the exporter and document saving to crash. + * Mail handling no longer exits entirely when encountering errors. It will skip the account/rule/message on which the error occured. + * Assigning correspondents from mail sender names failed for very long names. Paperless no longer assigns correspondents in these cases. paperless-ng 0.9.4 ################## From 905c0909085303f30a26ca35e5641a6ed6a9f376 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 16:44:34 +0100 Subject: [PATCH 106/121] fixes for the parser. 
--- src/paperless_tesseract/parsers.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 454617728..ebd706cdd 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -83,13 +83,27 @@ class RasterisedDocumentParser(DocumentParser): return None def parse(self, document_path, mime_type): + mode = settings.OCR_MODE + text_original = get_text_from_pdf(document_path) has_text = text_original and len(text_original) > 50 - if settings.OCR_MODE == "skip_noarchive" and has_text: + if mode == "skip_noarchive" and has_text: + self.log("debug", + "Document has text, skipping OCRmyPDF entirely.") self.text = text_original return + if mode in ['skip', 'skip_noarchive'] and not has_text: + # upgrade to redo, since there appears to be no text in the + # document. This happens to some weird encrypted documents or + # documents with failed OCR attempts for which OCRmyPDF will + # still report that there actually is text in them. + self.log("debug", + "No text was found in the document and skip is " + "specified. Upgrading OCR mode to redo.") + mode = "redo" + archive_path = os.path.join(self.tempdir, "archive.pdf") ocr_args = { @@ -108,12 +122,15 @@ class RasterisedDocumentParser(DocumentParser): # Mode selection. 
- if settings.OCR_MODE in ['skip', 'skip_noarchive']: + if mode in ['skip', 'skip_noarchive']: ocr_args['skip_text'] = True - elif settings.OCR_MODE == 'redo': + elif mode == 'redo': ocr_args['redo_ocr'] = True - elif settings.OCR_MODE == 'force': + elif mode == 'force': ocr_args['force_ocr'] = True + else: + raise ParseError( + f"Invalid ocr mode: {mode}") if self.is_image(mime_type): dpi = self.get_dpi(document_path) @@ -153,6 +170,10 @@ class RasterisedDocumentParser(DocumentParser): self.text = get_text_from_pdf(archive_path) except (InputFileError, EncryptedPdfError) as e: + + self.log("debug", + f"Encountered an error: {e}. Trying to use text from " + f"original.") # This happens with some PDFs when used with the redo_ocr option. # This is not the end of the world, we'll just use what we already # have in the document. From 34b0e7462298730d8656694463b640bae9b2eef5 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 23:07:11 +0100 Subject: [PATCH 107/121] bugfix --- .../app/components/document-list/document-list.component.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src-ui/src/app/components/document-list/document-list.component.ts b/src-ui/src/app/components/document-list/document-list.component.ts index 3a4f17196..5637bff97 100644 --- a/src-ui/src/app/components/document-list/document-list.component.ts +++ b/src-ui/src/app/components/document-list/document-list.component.ts @@ -3,9 +3,6 @@ import { ActivatedRoute } from '@angular/router'; import { NgbModal } from '@ng-bootstrap/ng-bootstrap'; import { cloneFilterRules, FilterRule } from 'src/app/data/filter-rule'; import { FILTER_CORRESPONDENT, FILTER_DOCUMENT_TYPE, FILTER_HAS_TAG, FILTER_RULE_TYPES } from 'src/app/data/filter-rule-type'; -import { PaperlessCorrespondent } from 'src/app/data/paperless-correspondent'; -import { PaperlessDocumentType } from 'src/app/data/paperless-document-type'; -import { PaperlessTag } from 'src/app/data/paperless-tag'; import { 
SavedViewConfig } from 'src/app/data/saved-view-config'; import { DocumentListViewService } from 'src/app/services/document-list-view.service'; import { DOCUMENT_SORT_FIELDS } from 'src/app/services/rest/document.service'; @@ -49,11 +46,12 @@ export class DocumentListComponent implements OnInit { this.displayMode = localStorage.getItem('document-list:displayMode') } this.route.paramMap.subscribe(params => { - this.filterRules = this.list.filterRules if (params.has('id')) { this.list.savedView = this.savedViewConfigService.getConfig(params.get('id')) + this.filterRules = this.list.filterRules } else { this.list.savedView = null + this.filterRules = this.list.filterRules this.showFilter = this.filterRules.length > 0 } this.list.clear() From cd5587b51182526c8d26f632e2cc839b38393628 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Fri, 4 Dec 2020 23:16:04 +0100 Subject: [PATCH 108/121] bugfix --- .../src/app/components/document-list/document-list.component.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src-ui/src/app/components/document-list/document-list.component.ts b/src-ui/src/app/components/document-list/document-list.component.ts index 5637bff97..fe6c8a894 100644 --- a/src-ui/src/app/components/document-list/document-list.component.ts +++ b/src-ui/src/app/components/document-list/document-list.component.ts @@ -49,6 +49,7 @@ export class DocumentListComponent implements OnInit { if (params.has('id')) { this.list.savedView = this.savedViewConfigService.getConfig(params.get('id')) this.filterRules = this.list.filterRules + this.showFilter = false } else { this.list.savedView = null this.filterRules = this.list.filterRules From 17af5814259751955fd37c9e7ae7282fab4c5f72 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 00:37:05 +0100 Subject: [PATCH 109/121] bugfix --- src/documents/index.py | 5 +++++ .../management/commands/document_archiver.py | 17 +++++++++++------ src/documents/tests/test_management_archiver.py | 2 +- 3 files changed, 17 
insertions(+), 7 deletions(-) diff --git a/src/documents/index.py b/src/documents/index.py index b4d6e1c51..53bf34542 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -82,6 +82,10 @@ def open_index(recreate=False): def update_document(writer, doc): + # TODO: this line caused many issues all around, since: + # We need to make sure that this method does not get called with + # deserialized documents (i.e, document objects that don't come from + # Django's ORM interfaces directly. logger.debug("Indexing {}...".format(doc)) tags = ",".join([t.name for t in doc.tags.all()]) writer.update_document( @@ -98,6 +102,7 @@ def update_document(writer, doc): def remove_document(writer, doc): + # TODO: see above. logger.debug("Removing {} from index...".format(doc)) writer.delete_by_term('id', doc.pk) diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index aba2ea693..2e7e7b34d 100644 --- a/src/documents/management/commands/document_archiver.py +++ b/src/documents/management/commands/document_archiver.py @@ -23,7 +23,9 @@ from ...parsers import get_parser_class_for_mime_type logger = logging.getLogger(__name__) -def handle_document(document): +def handle_document(document_id): + document = Document.objects.get(id=document_id) + mime_type = document.mime_type parser_class = get_parser_class_for_mime_type(mime_type) @@ -98,9 +100,12 @@ class Command(Renderable, BaseCommand): else: documents = Document.objects.all() - documents_to_process = list(filter( - lambda d: overwrite or not d.archive_checksum, - documents + document_ids = list(map( + lambda doc: doc.id, + filter( + lambda d: overwrite or not d.archive_checksum, + documents + ) )) logging.getLogger().handlers[0].level = logging.ERROR @@ -108,7 +113,7 @@ class Command(Renderable, BaseCommand): list(tqdm.tqdm( pool.imap_unordered( handle_document, - documents_to_process + document_ids ), - total=len(documents_to_process) + 
total=len(document_ids) )) diff --git a/src/documents/tests/test_management_archiver.py b/src/documents/tests/test_management_archiver.py index ec4fc5ac4..fdb588acf 100644 --- a/src/documents/tests/test_management_archiver.py +++ b/src/documents/tests/test_management_archiver.py @@ -32,7 +32,7 @@ class TestArchiver(DirectoriesMixin, TestCase): shutil.copy(sample_file, os.path.join(self.dirs.originals_dir, "0000001.pdf")) self.make_models() - handle_document(self.d1) + handle_document(self.d1.pk) doc = Document.objects.get(id=self.d1.id) From d9a06d67ccd31cb334dce0f1aa52689e9576aa8c Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 01:21:16 +0100 Subject: [PATCH 110/121] bugfix --- .../management/commands/document_archiver.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index 2e7e7b34d..7b9a123d9 100644 --- a/src/documents/management/commands/document_archiver.py +++ b/src/documents/management/commands/document_archiver.py @@ -5,9 +5,9 @@ import logging import os import shutil import uuid -from time import sleep import tqdm +from django import db from django.conf import settings from django.core.management.base import BaseCommand from django.db import transaction @@ -108,12 +108,21 @@ class Command(Renderable, BaseCommand): ) )) - logging.getLogger().handlers[0].level = logging.ERROR - with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: - list(tqdm.tqdm( - pool.imap_unordered( - handle_document, - document_ids - ), - total=len(document_ids) - )) + # Note to future self: this prevents django from reusing database + # conncetions between processes, which is bad and does not work + # with postgres. 
+ db.connections.close_all() + + try: + + logging.getLogger().handlers[0].level = logging.ERROR + with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool: + list(tqdm.tqdm( + pool.imap_unordered( + handle_document, + document_ids + ), + total=len(document_ids) + )) + except KeyboardInterrupt: + print("Aborting...") From a23af4b1e962d18aae29bfa6737589deb6a1fa8d Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 01:23:17 +0100 Subject: [PATCH 111/121] removed obsolete option --- docker/docker-compose.env | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker/docker-compose.env b/docker/docker-compose.env index 9c13e8448..4271bce6e 100644 --- a/docker/docker-compose.env +++ b/docker/docker-compose.env @@ -32,8 +32,3 @@ # The default language to use for OCR. Set this to the language most of your # documents are written in. #PAPERLESS_OCR_LANGUAGE=eng - -# By default Paperless does not OCR a document if the text can be retrieved from -# the document directly. Set to true to always OCR documents. (i.e., if you -# know that some of your documents have faulty/bad OCR data) -#PAPERLESS_OCR_ALWAYS=true From 2e885db3e90ad9fe594ef43f04cee9778beef49c Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 12:52:49 +0100 Subject: [PATCH 112/121] documentation --- docs/api.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/api.rst b/docs/api.rst index 4c9ae0b13..81334b9ec 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -109,6 +109,7 @@ Result list object returned by the endpoint: "count": 1, "page": 1, "page_count": 1, + "corrected_query": "", "results": [ ] @@ -119,6 +120,8 @@ Result list object returned by the endpoint: the page you requested, if you requested a page that is behind the last page. In that case, the last page is returned. * ``page_count``: The total number of pages. +* ``corrected_query``: Corrected version of the query string. Can be null. + If not null, can be used verbatim to start a new query. 
* ``results``: A list of result objects on the current page. Result object: From 7a737327be0766e3df5d31e20dc0a8e5207019ef Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 13:19:14 +0100 Subject: [PATCH 113/121] testing the importer --- .../tests/test_management_exporter.py | 20 ++++++++++++++----- src/documents/tests/utils.py | 18 +++++++++++++++-- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index dca2114c2..284d6108d 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -9,10 +9,11 @@ from django.test import TestCase, override_settings from documents.management.commands import document_exporter from documents.models import Document, Tag, DocumentType, Correspondent -from documents.tests.utils import DirectoriesMixin +from documents.sanity_checker import check_sanity +from documents.tests.utils import DirectoriesMixin, paperless_environment -class TestExporter(DirectoriesMixin, TestCase): +class TestExportImport(DirectoriesMixin, TestCase): @override_settings( PASSPHRASE="test" @@ -23,8 +24,8 @@ class TestExporter(DirectoriesMixin, TestCase): file = os.path.join(self.dirs.originals_dir, "0000001.pdf") - Document.objects.create(checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") - Document.objects.create(checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) + Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", id=1, mime_type="application/pdf") + Document.objects.create(content="Content", 
checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", id=2, mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) Tag.objects.create(name="t") DocumentType.objects.create(name="dt") Correspondent.objects.create(name="c") @@ -56,6 +57,15 @@ class TestExporter(DirectoriesMixin, TestCase): checksum = hashlib.md5(f.read()).hexdigest() self.assertEqual(checksum, element['fields']['archive_checksum']) - Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") + with paperless_environment() as dirs: + call_command('document_importer', target) + messages = check_sanity() + # everything is alright after the test + self.assertEqual(len(messages), 0, str([str(m) for m in messages])) + def test_export_missing_files(self): + + target = tempfile.mkdtemp() + call_command('document_exporter', target) + Document.objects.create(checksum="AAAAAAAAAAAAAAAAA", title="wow", filename="0000004.pdf", id=3, mime_type="application/pdf") self.assertRaises(FileNotFoundError, call_command, 'document_exporter', target) diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index 38788f6d6..7f9d50ed5 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -2,6 +2,7 @@ import os import shutil import tempfile from collections import namedtuple +from contextlib import contextmanager from django.test import override_settings @@ -24,7 +25,7 @@ def setup_directories(): os.makedirs(dirs.thumbnail_dir, exist_ok=True) os.makedirs(dirs.archive_dir, exist_ok=True) - override_settings( + dirs.settings_override = override_settings( DATA_DIR=dirs.data_dir, SCRATCH_DIR=dirs.scratch_dir, MEDIA_ROOT=dirs.media_dir, @@ -35,7 +36,8 @@ def setup_directories(): INDEX_DIR=dirs.index_dir, MODEL_FILE=os.path.join(dirs.data_dir, "classification_model.pickle") - ).enable() + ) + dirs.settings_override.enable() return dirs @@ -45,6 +47,18 @@ def remove_dirs(dirs): 
shutil.rmtree(dirs.data_dir, ignore_errors=True) shutil.rmtree(dirs.scratch_dir, ignore_errors=True) shutil.rmtree(dirs.consumption_dir, ignore_errors=True) + dirs.settings_override.disable() + + +@contextmanager +def paperless_environment(): + dirs = None + try: + dirs = setup_directories() + yield dirs + finally: + if dirs: + remove_dirs(dirs) class DirectoriesMixin: From 55980e79ba88dbee0018582077c399bdf2e5bed7 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 13:22:08 +0100 Subject: [PATCH 114/121] versions --- docker/hub/docker-compose.postgres.yml | 2 +- docker/hub/docker-compose.sqlite.yml | 2 +- src/paperless/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/hub/docker-compose.postgres.yml b/docker/hub/docker-compose.postgres.yml index 9848b3e05..295d981e1 100644 --- a/docker/hub/docker-compose.postgres.yml +++ b/docker/hub/docker-compose.postgres.yml @@ -15,7 +15,7 @@ services: POSTGRES_PASSWORD: paperless webserver: - image: jonaswinkler/paperless-ng:0.9.4 + image: jonaswinkler/paperless-ng:0.9.5 restart: always depends_on: - db diff --git a/docker/hub/docker-compose.sqlite.yml b/docker/hub/docker-compose.sqlite.yml index 7331b64ba..80df40596 100644 --- a/docker/hub/docker-compose.sqlite.yml +++ b/docker/hub/docker-compose.sqlite.yml @@ -5,7 +5,7 @@ services: restart: always webserver: - image: jonaswinkler/paperless-ng:0.9.4 + image: jonaswinkler/paperless-ng:0.9.5 restart: always depends_on: - broker diff --git a/src/paperless/version.py b/src/paperless/version.py index 23bd5f157..26e46fea8 100644 --- a/src/paperless/version.py +++ b/src/paperless/version.py @@ -1 +1 @@ -__version__ = (0, 9, 4) +__version__ = (0, 9, 5) From fd11417fa2c11fec1d1182d3c8792799c45c7689 Mon Sep 17 00:00:00 2001 From: Johann Bauer Date: Sat, 5 Dec 2020 13:38:09 +0100 Subject: [PATCH 115/121] Add missing step to migration guide --- docs/setup.rst | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git 
a/docs/setup.rst b/docs/setup.rst index 746c0aa0d..d3a063302 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -404,7 +404,14 @@ Migration to paperless-ng is then performed in a few simple steps: ``docker-compose.env`` to your needs. See `docker route`_ for details on which edits are advised. -6. In order to find your existing documents with the new search feature, you need +6. Since ``docker-compose`` would just use the old paperless image, we need to + manually build a new image: + + .. code:: shell-session + + $ docker-compose build + +7. In order to find your existing documents with the new search feature, you need to invoke a one-time operation that will create the search index: .. code:: shell-session @@ -414,7 +421,7 @@ Migration to paperless-ng is then performed in a few simple steps: This will migrate your database and create the search index. After that, paperless will take care of maintaining the index by itself. -7. Start paperless-ng. +8. Start paperless-ng. .. code:: bash @@ -422,11 +429,11 @@ Migration to paperless-ng is then performed in a few simple steps: This will run paperless in the background and automatically start it on system boot. -8. Paperless installed a permanent redirect to ``admin/`` in your browser. This +9. Paperless installed a permanent redirect to ``admin/`` in your browser. This redirect is still in place and prevents access to the new UI. Clear browsing cache in order to fix this. -9. Optionally, follow the instructions below to migrate your existing data to PostgreSQL. +10. Optionally, follow the instructions below to migrate your existing data to PostgreSQL. ..
_setup-sqlite_to_psql: From a2ea397587df08f0b54cd4e3e67d69632042dd45 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 13:53:03 +0100 Subject: [PATCH 116/121] docs --- docs/changelog.rst | 3 ++ docs/faq.rst | 12 +++++++ docs/troubleshooting.rst | 74 ++++++---------------------------------- 3 files changed, 26 insertions(+), 63 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index d5c48b2dc..116c2e07c 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,6 +8,9 @@ Changelog paperless-ng 0.9.5 ################## +This release concludes the big changes I wanted to get rolled into paperless. The next releases before 1.0 will +focus on fixing issues, primarily. + * OCR * Paperless now uses `OCRmyPDF `_ to perform OCR on documents. diff --git a/docs/faq.rst b/docs/faq.rst index 9a5e73ea5..887946074 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -86,3 +86,15 @@ the documentation has instructions for bare metal installs. I'm running paperless on an i3 processor from 2015 or so. This is also what I use to test new releases with. Apart from that, I also have a Raspberry Pi, which I occasionally build the image on and see if it works. + +**Q:** *How do I proxy this with NGINX?* + +.. code:: + + location / { + proxy_pass http://localhost:8000/; + } + +And that's about it. Paperless serves everything, including static files by itself +when running the docker image. If you want to do anything fancy, you have to +install paperless bare metal.
diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst index 9e1c42f4a..dc5bf7f5d 100644 --- a/docs/troubleshooting.rst +++ b/docs/troubleshooting.rst @@ -29,75 +29,23 @@ Check for the following issues: Consumer fails to pickup any new files ###################################### -If you notice, that the consumer will only pickup files in the consumption +If you notice that the consumer will only pickup files in the consumption directory at startup, but won't find any other files added later, check out the configuration file and enable filesystem polling with the setting ``PAPERLESS_CONSUMER_POLLING``. +Operation not permitted +####################### -Consumer warns ``OCR for XX failed`` -#################################### +You might see errors such as: -If you find the OCR accuracy to be too low, and/or the document consumer warns -that ``OCR for XX failed, but we're going to stick with what we've got since -FORGIVING_OCR is enabled``, then you might need to install the -`Tesseract language files `_ -marching your document's languages. +.. code:: -As an example, if you are running Paperless from any Ubuntu or Debian -box, and your documents are written in Spanish you may need to run:: + chown: changing ownership of '../export': Operation not permitted - apt-get install -y tesseract-ocr-spa +The container tries to set file ownership on the listed directories. This is +required so that the user running paperless inside docker has write permissions +to these folders. This happens when pointing these directories to NFS shares, +for example. - - -Consumer dies with ``convert: unable to extent pixel cache`` -############################################################ - -During the consumption process, Paperless invokes ImageMagick's ``convert`` -program to translate the source document into something that the OCR engine can -understand and this can burn a Very Large amount of memory if the original -document is rather long. 
Similarly, if your system doesn't have a lot of -memory to begin with (ie. a Raspberry Pi), then this can happen for even -medium-sized documents. - -The solution is to tell ImageMagick *not* to Use All The RAM, as is its -default, and instead tell it to used a fixed amount. ``convert`` will then -break up the job into hundreds of individual files and use them to slowly -compile the finished image. Simply set ``PAPERLESS_CONVERT_MEMORY_LIMIT`` in -``/etc/paperless.conf`` to something like ``32000000`` and you'll limit -``convert`` to 32MB. Fiddle with this value as you like. - -**HOWEVER**: Simply setting this value may not be enough on system where -``/tmp`` is mounted as tmpfs, as this is where ``convert`` will write its -temporary files. In these cases (most Systemd machines), you need to tell -ImageMagick to use a different space for its scratch work. You do this by -setting ``PAPERLESS_CONVERT_TMPDIR`` in ``/etc/paperless.conf`` to somewhere -that's actually on a physical disk (and writable by the user running -Paperless), like ``/var/tmp/paperless`` or ``/home/my_user/tmp`` in a pinch. - - -DecompressionBombWarning and/or no text in the OCR output -######################################################### - -Some users have had issues using Paperless to consume PDFs that were created -by merging Very Large Scanned Images into one PDF. If this happens to you, -it's likely because the PDF you've created contains some very large pages -(millions of pixels) and the process of converting the PDF to a OCR-friendly -image is exploding. - -Typically, this happens because the scanned images are created with a high -DPI and then rolled into the PDF with an assumed DPI of 72 (the default). -The best solution then is to specify the DPI used in the scan in the -conversion-to-PDF step. So for example, if you scanned the original image -with a DPI of 300, then merging the images into the single PDF with -``convert`` should look like this: - -.. 
code:: bash - - $ convert -density 300 *.jpg finished.pdf - -For more information on this and situations like it, you should take a look -at `Issue #118`_ as that's where this tip originated. - -.. _Issue #118: https://github.com/the-paperless-project/paperless/issues/118 +Ensure that `chown` is possible on these directories. From 3f6da69c8f1d793c879881473696a265003c691a Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 13:53:11 +0100 Subject: [PATCH 117/121] docs config --- docs/conf.py | 51 +++++---------------------------------------------- 1 file changed, 5 insertions(+), 46 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 7ebc82ea7..b2442ddc9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,48 +1,21 @@ -# -*- coding: utf-8 -*- -# -# Paperless documentation build configuration file, created by -# sphinx-quickstart on Mon Oct 26 18:36:52 2015. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. +import sphinx_rtd_theme + __version__ = None exec(open("../src/paperless/version.py").read()) -# Believe it or not, this is the officially sanctioned way to add custom CSS. -def setup(app): - app.add_stylesheet("custom.css") - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. 
They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.imgmath', 'sphinx.ext.viewcode', + 'sphinx_rtd_theme', ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +# templates_path = ['_templates'] # The suffix of source filenames. source_suffix = '.rst' @@ -115,7 +88,7 @@ pygments_style = 'sphinx' # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -195,20 +168,6 @@ html_static_path = ['_static'] # Output file base name for HTML help builder. htmlhelp_basename = 'paperless' - -# -# Attempt to use the ReadTheDocs theme. If it's not installed, fallback to -# the default. -# - -try: - import sphinx_rtd_theme - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] -except ImportError as e: - print("error " + str(e)) - pass - # -- Options for LaTeX output --------------------------------------------- latex_elements = { From c88c34661e1b5fce8247fa1162c9105afb11330c Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 14:00:02 +0100 Subject: [PATCH 118/121] docs --- docs/setup.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/setup.rst b/docs/setup.rst index 3cd1cf60a..c11cf9e95 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -231,6 +231,7 @@ writing. Windows is not and will never be supported. 
* ``unpaper`` * ``ghostscript`` * ``icc-profiles-free`` + * ``qpdf`` * ``liblept5`` * ``libxml2`` * ``pngquant`` From 45a56c426dd060dc115357bc867c74f32745bff1 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sat, 5 Dec 2020 14:00:27 +0100 Subject: [PATCH 119/121] dependencies --- Pipfile | 2 +- Pipfile.lock | 20 ++++++++++---------- docker/local/Dockerfile | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Pipfile b/Pipfile index c0728fddf..2e86f2a42 100644 --- a/Pipfile +++ b/Pipfile @@ -37,7 +37,7 @@ scikit-learn="~=0.23.2" whitenoise = "~=5.2.0" watchdog = "*" whoosh="~=2.7.4" -inotifyrecursive = ">=0.3.4" +inotifyrecursive = "~=0.3.4" ocrmypdf = "*" tqdm = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 71b6c0811..6158a70e0 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "bb0b90c2ee89521c6dcd24375b67b52be5a4a786297923519a5abaafe0fe5d0e" + "sha256": "b10db53eb22d917723aa6107ff0970dc4e2aa886ee03d3ae08a994a856d57986" }, "pipfile-spec": 6, "requires": { @@ -365,11 +365,11 @@ }, "ocrmypdf": { "hashes": [ - "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3", - "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928" + "sha256:91e7394172cedb3be801a229dbd3d308fb5ae80cbc3a77879fa7954beea407b1", + "sha256:e550b8e884150accab7ea41f4a576b5844594cb5cbd6ed514fbf1206720343ad" ], "index": "pypi", - "version": "==11.3.3" + "version": "==11.3.4" }, "pathtools": { "hashes": [ @@ -763,11 +763,11 @@ }, "tqdm": { "hashes": [ - "sha256:5c0d04e06ccc0da1bd3fa5ae4550effcce42fcad947b4a6cafa77bdc9b09ff22", - "sha256:9e7b8ab0ecbdbf0595adadd5f0ebbb9e69010e0bd48bbb0c15e550bf2a5292df" + "sha256:38b658a3e4ecf9b4f6f8ff75ca16221ae3378b2e175d846b6b33ea3a20852cf5", + "sha256:d4f413aecb61c9779888c64ddf0c62910ad56dcbe857d8922bb505d4dbff0df1" ], "index": "pypi", - "version": "==4.54.0" + "version": "==4.54.1" }, "tzlocal": { "hashes": [ @@ -961,11 +961,11 @@ }, "faker": { "hashes": [ - 
"sha256:2ba20a4438429cb08d729175d7bb0435ef3c2c4cedc7b1ceb703ee6da8dad906", - "sha256:6279746aed175a693108238e6d1ab8d7e26d0ec7ff8474f61025b9fdaae15d65" + "sha256:7bca5b074299ac6532be2f72979e6793f1a2403ca8105cb4cf0b385a964469c4", + "sha256:fb21a76064847561033d8cab1cfd11af436ddf2c6fe72eb51b3cda51dff86bdc" ], "markers": "python_version >= '3.5'", - "version": "==4.18.0" + "version": "==5.0.0" }, "filelock": { "hashes": [ diff --git a/docker/local/Dockerfile b/docker/local/Dockerfile index 4df37bbea..9b110c622 100644 --- a/docker/local/Dockerfile +++ b/docker/local/Dockerfile @@ -21,6 +21,7 @@ RUN apt-get update \ libxml2 \ optipng \ pngquant \ + qpdf \ sudo \ tesseract-ocr \ tesseract-ocr-eng \ From dd48ef07b6c558838961d195e3a30748f249c7d6 Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sun, 6 Dec 2020 01:25:12 +0100 Subject: [PATCH 120/121] added a welcome widget --- src-ui/src/app/app.module.ts | 4 ++- .../dashboard/dashboard.component.html | 8 ++---- .../welcome-widget.component.html | 16 +++++++++++ .../welcome-widget.component.scss | 0 .../welcome-widget.component.spec.ts | 25 ++++++++++++++++++ .../welcome-widget.component.ts | 15 +++++++++++ src-ui/src/assets/save-filter.png | Bin 0 -> 8267 bytes 7 files changed, 61 insertions(+), 7 deletions(-) create mode 100644 src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.html create mode 100644 src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.scss create mode 100644 src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.spec.ts create mode 100644 src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.ts create mode 100644 src-ui/src/assets/save-filter.png diff --git a/src-ui/src/app/app.module.ts b/src-ui/src/app/app.module.ts index 7f2e8414e..1a2a76908 100644 --- a/src-ui/src/app/app.module.ts +++ b/src-ui/src/app/app.module.ts @@ -45,6 +45,7 @@ import { SavedViewWidgetComponent } from 
'./components/dashboard/widgets/saved-v import { StatisticsWidgetComponent } from './components/dashboard/widgets/statistics-widget/statistics-widget.component'; import { UploadFileWidgetComponent } from './components/dashboard/widgets/upload-file-widget/upload-file-widget.component'; import { WidgetFrameComponent } from './components/dashboard/widgets/widget-frame/widget-frame.component'; +import { WelcomeWidgetComponent } from './components/dashboard/widgets/welcome-widget/welcome-widget.component'; @NgModule({ declarations: [ @@ -82,7 +83,8 @@ import { WidgetFrameComponent } from './components/dashboard/widgets/widget-fram SavedViewWidgetComponent, StatisticsWidgetComponent, UploadFileWidgetComponent, - WidgetFrameComponent + WidgetFrameComponent, + WelcomeWidgetComponent ], imports: [ BrowserModule, diff --git a/src-ui/src/app/components/dashboard/dashboard.component.html b/src-ui/src/app/components/dashboard/dashboard.component.html index 3e6438181..627e7ff22 100644 --- a/src-ui/src/app/components/dashboard/dashboard.component.html +++ b/src-ui/src/app/components/dashboard/dashboard.component.html @@ -4,11 +4,7 @@
- -

This space is reserved to display your saved views. Go to your documents and save a view - to have it displayed - here!

-
+ @@ -22,4 +18,4 @@
-
\ No newline at end of file +
diff --git a/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.html b/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.html new file mode 100644 index 000000000..0caf55f11 --- /dev/null +++ b/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.html @@ -0,0 +1,16 @@ + + + + +

Paperless is running! :)

+

You can start uploading documents by dropping them in the file upload box to the right or by dropping them in the configured consumption folder and they'll start showing up in the documents list. + After you've added some metadata to your documents, use the filtering mechanisms of paperless to create custom views (such as 'Recently added', 'Tagged TODO') and have them displayed on the dashboard instead of this message.

+

Paperless offers some more features that try to make your life easier, such as:

+
    +
  • Once you've got a couple documents in paperless and added metadata to them, paperless can assign that metadata to new documents automatically.
  • +
  • You can configure paperless to read your mails and add documents from attached files.
  • +
+

Consult the documentation on how to use these features. The section on basic usage also has some information on how to use paperless in general.

+
+ +
\ No newline at end of file diff --git a/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.scss b/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.scss new file mode 100644 index 000000000..e69de29bb diff --git a/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.spec.ts b/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.spec.ts new file mode 100644 index 000000000..5e8c2494b --- /dev/null +++ b/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.spec.ts @@ -0,0 +1,25 @@ +import { ComponentFixture, TestBed } from '@angular/core/testing'; + +import { WelcomeWidgetComponent } from './welcome-widget.component'; + +describe('WelcomeWidgetComponent', () => { + let component: WelcomeWidgetComponent; + let fixture: ComponentFixture; + + beforeEach(async () => { + await TestBed.configureTestingModule({ + declarations: [ WelcomeWidgetComponent ] + }) + .compileComponents(); + }); + + beforeEach(() => { + fixture = TestBed.createComponent(WelcomeWidgetComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.ts b/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.ts new file mode 100644 index 000000000..71a87189c --- /dev/null +++ b/src-ui/src/app/components/dashboard/widgets/welcome-widget/welcome-widget.component.ts @@ -0,0 +1,15 @@ +import { Component, OnInit } from '@angular/core'; + +@Component({ + selector: 'app-welcome-widget', + templateUrl: './welcome-widget.component.html', + styleUrls: ['./welcome-widget.component.scss'] +}) +export class WelcomeWidgetComponent implements OnInit { + + constructor() { } + + ngOnInit(): void { + } + +} diff --git 
a/src-ui/src/assets/save-filter.png b/src-ui/src/assets/save-filter.png new file mode 100644 index 0000000000000000000000000000000000000000..dcaa41714c78a70b8568e162a0e8afac86fe8168 GIT binary patch literal 8267 zcmch6RZv`Aux_y65JCtV^e4dw2@FAl6D$OGcXyZI8VJGNZD4SBCpZCyA-FTR4=~uB z|2~|$x9XmA?$deLySjRJ?Y-7s-Tn2~9i^-!gO5Xj^Wwz|{4cVSDlcB30Fm+>7Ao@Y zv3L21R5)5oNGMyIn!b3!l;EBqD32!gCahaBq<{z)LyApBvR3vVrpV% z84*8+>)OSF!A7QZT9JHRIL|>%!Qfz8rr^l1?jB7B452e@7nS3QNr2Tt8W)q&o}eqzah5T%oSt`i8A7HW%NKI=<3zG_Y`rH*P+EB75ZM{ zKm}bdIsQ6p%4p9p-)lEN{aZx9W?9_4}0+krdI*D%j(1myW;g{bJi9qPkJT{|3LTPW0CoK=;q9-?z`dLKuEAV5apjJctx~d7pTu;*Q+p?ER8A$x~+H$D#{%{)_WU zd<9Q6`qe&XF;)V>|F3XshkSAFtbW0f045>X7m}tr_e~-*c)Afp{~2?Ck#qfW!-4dv(o`IQv1lmGApfC zQ&L)S8dG+ciuojzQXAhiEmx&bt~T^eEPY5EOZp+lP5!9+=OfB99}x7U{yUR0nRI_B zWq)_j{H%IWZat8P2baH+V!`nYQ(ko1<;K73zTWq*=cU|*D-L&KlJILUA&%vMCKeCW zpxW}7$#cIhCJAvOZpRKdw#x(F75>D%%b=s-(Y>KJ&}r z0LG$=(%dS*idcN*xY zmb`Il)e9?__=b*7#vs=+n!z&U(PC+w7efRoP*RckS`qiWSW%%m`t8_lDT6V&@$Uq^ zxLR|9$~0P>)HTjfryb*X8(lG0K0$91y`+Ewvv^Vee+a3<)F__@Kh5vXX9vh&RM4&& zk-KaB63(;9zP>85oTF5AomDar-rd>| zMe(65uR7ii`;=W@IlfV9Czmo)f+x1Dr?m^eEd&h3CFa zT$kIES>`%I7cuA3=6qVd7O%&@ZFksl4V|zDAZpkq*kVR;Dr{{V=6dwXg5-u;D@ehL z3z-U!l6X|u?6#!UxYvaC>P_eE_z?hV3!UCVxm15<{7umGwPq^>I4AM6e?QeXC+_Rd zs*>K2Uz-31b^fYPJT+&u&9Ntx7)B96TSz%ci|zQ>E%L}V0am$G!9)<4u|h2dXm`F8 zaed*c>5SuxE9@UKjQk`&u~@Ucg0^>)s-9$}-z%XlWb5n#UkN)EYPOwqV08_r5|=b~ z(0VpjF8`ETc%;(ufH~#=j>jw3LR&h(N48rm<2ZKP1XT(du(HkvUyx17}RtK)V~ z_mkA8@=we^e*9W!=So-D8%STy*_0A;L+%{;5!Xse!+)P`)2JL7iL>$)8~)`0s0-kEtC_&hm{5rR90lfUD3 zm9AZ_NYiBn3--(jF~x~HI5ri#e26m;5rB&pBIBBQHu|p^1zq9+yH7NeOfz%ws%Eht zwKyGggs4zJmO0jCj72_r6nEoAvO{%?p@F{c@%|C{Itgrd{iSEDGFy=a1OiQ`$Rs!b zw#2*U2IM@2hX?12^&{G@A$r~p&Q;YBr?6?RF6sc!8Dc4SdaYz-HMA`=5N-EAgyDY= zV$gjs?|d<5NJ9_@A_cVUKwcB0?RwRmOyI^Bn3_8`402U%jhkin4p@!2N`#77R`_eZ z64%6R<~0OcL%Mb=fT$g4f^WV_r+OIRV`HtH&6F&xIkXD2^z|5T#ktb})?6O<(wvQn zH5~Mn;xAs$5>Fnt+#WnkcE`vHmmy-1J7zGr_N>}R!}FwbpZ+3FXl-j?a!9@X*B$8Y2TgW^_ 
zT#t9{71{PXbWd*=3zfXD&+A>i3akNIu9>+94cyH>u73oM2ebJCkBP7D<^S5auOAOm z<}HQhIcV30*#t%Q5e+0v#akPm9d<4S@E1|xl96Li2Ks*S_N(s1_eHk5=j`V72yhtlwYvJHp ze}``z59*G~KkzTGbErhp?W2@@JV)N&b{ajR0Z{h|(cSxMzVOoTTy~yO4)w%NeP}eX zFd+ze*95{uo?l;YU$8aq9OXt!QlU?UJ|!-ZOe!S^s%c_0uFYCiEb>`W4GtxJ4FGz3T~?Qvi5 z6eJdezr+vOFB&frB#*jk#XnmRAoYbi`{%^)aQUe?dndG(-)$4Adb*9|X;#0T(0LeY zg2o#lY}a-}co-}-)KUR!7tzR27Q71MNjKc0%Q8>_#cP_ZznYuqRjhwkg(WSPfksDG z-52i@QyoDQ@x{xD{{cQ1$su$7&01O-b%Drga)zD8GdDIkwxYTE_fDNW5YR*WBgY-~ z(3=n!^Y@nI%L*RC{#jn|51E- zrR4%NBe);c{`K+H?aKPt7Dn||(=kEoYCdo+Z<7294r>zwP+9q%&E5y<^TT#wlNw7E z`*i@w4J(&EHuY!Dd3R?g(<9SsGo6734R&KOz5I2q$3ZzhE*+EmE(0;IW-$LP zRu03HY`l5Xr+gc7T@Uap+3XiHc1GOdJ{Dl35B*!(2o$@tF8Ab3ug68j0Sd zLDYGG5oeYx~_O`OG{RDR)WLcyqGP_%K07iL2DGFgjHqe;xZ zqxwa-?2|!Irl;-Dk-kv9ajE1VSj0)Y#%An$StV&*&%M5mYpNnABg;bRp)D z4E?vmuRV4j6qvP*tU}OMsH29r+rB^wRh;H-8DI*)Z zLz*~oaE^m!nW2@wWUl9^_s?DFZ(zd8TJQm;qe6wfo%Z+ggpKzNs{xG_pmLJGOFVvj zq6a7!F>?6oh7|KktJI1gSoRrlTRJMuK&2)9l)WihKIoRI$U zpHL_zK=lBvIMbWxz)AfUw>y__-)a@!*Vi}ewjPlX#uEgd5F8k!&Ey=#citBinC?#a zkI&Akvb`-WDdCelCmx{@PMe{|6_}Avtsc+3zlD8EP?ILFzp-Pnu(Gasxvta@C5WND zoJzkh+6|T!L{E`{Tz?y6{ygVWh**HQJgK(rNrT?ibpO3TO?1`Z{sPK7(=GC%{@}dVM&RB_py|? 
z$Jp4O4yzrFj?ZH02TM75xn><430i*Z>-m&Ghn?-<4g>-b9{w`ubB)d5MdRUoX=&;6 z;orMLdKn6`bwLh+mogolr~{5$OOD7q6A@1p7DLKm;A1&^FOAwiY%#HXHTg2I(K6XW zA=70y)@9mvb6}0fQlU_UYGL|k_f;`t1mTnS_SuuE48fEC1#CZYSVuohl0N-@V0Jhz zmx|YBTaa?~2^OMwRr#sjGDpyrrDyHA5n`LC>BzcW#=ejW*TR8hYV611B0gG@mfox1 z7$Y7A<%x&liwTLw#>V!GOB8Z$uYqf;(cS+0qmbuE1Oo)?ot=#iLS9ES^@1H- zVb#JSOLw@}bCAS=tG-^AVW9iYLuoFBu)}8YnCAOf8t9PCE40ocwO;r;$#*t(mgJ_U zf@5Tq?z%$@Wq;}4;J9CNIDL`mPkmky;im6fSyALj?Jc`jGMO1A88b?0Om_EBatJ93 za{T>ldfEj7vG_2vvQ|V!60P~&`ztHc#-@}HF#_0py#LAp08`pLma@e^NC5`n#+OIh zJdo?pe3~t#qH;7&6Xj0&4c#PXj|8-kJ!R z{ayze1cD(DZ(rZe`t#!-r(Kw$1_lPt23JClRE_}wPkmB=o}UX)sONoPHx|JO;ViVU zz|Y^G_9r=cIw!mJ&oghKwuD*YZ)=56>|1*!v!LG6Dujw;#v!yio{TFP9E~F48>&ni zM1-!3&4UCw{i4||jsKgS{|8O~Zv|~UJUk?g(DKKzs`e)Z?#!o}9swn@hV2@_{`ez>@} zEvn^}bpdLx{Sz|0Zexzi#oi%df`^AkQxm$mY4X1{SOWffYyJ;$#3RKh1im)fiVY-B2%&|unD1W!B*)zk!Y*xEQ~ zvqMlbq0ox2&!3<9nk%)BCT@$i)e>acBEa7az`tB{u5+{66dIK$)U;}eY?vL5uunX6a@b1x_Kq4S-2V6r_?9Ly7^kGX)fsL3>@Bs^%kW`&MXy10 zny%nXGh5KNtuq^(Qdb3?Jce=z3NDR}#UQ;f&Wjw`Um`XYlaL63*Zr!yaEM~!1oGgM zPm40J%J^!#$oUMMoy0SJpA&=UCC(YV#6Xu$nmsv&5-TwCV1csilm09$EU*(5e^zFE z9R<&5Yx9whLE2+b!z#D5m}of*lHhjTIYx3s^zTaE(9IbCCxY6k!?N6W8g>k8^7N=L zBN@l$J0zARQbYTdrmY+Qh`d^9t@fwRuM2OF`P|*zE7lg> zuC}e4#O+Tsp_P87Xu2fKZh_RN4Ebd(1AlsRhPVE(bC*|^6%DLRsE?^+)4qj%p`h4y zX_eKpw6N*ZSS_}ZDp(1ma`SLWyI;W5wY9C)%1W4U;U>o3+$3J_iiwKJrrEWoIgHi3;@GBCDc94} zHJ!&5lo-;PPq#KY%Kg#o#gh1O0W`7S)WlIS{iBkRnHzvN1l$w>`{7?6(&-O*JP>f! 
zm(ywM&IL3q%PSH`4VILYa40F+$$P#K(u$EE3l2fu#<|OGXZG(D!pFla)Bd>{TS*mF z)H~BRKCqBxP$bQy(3Xj6s>BeKkXg?GEFo!$9?x1X2C&T;XlNP+Zr|?DT`4SFL~i77 z$;1Z_WpxC1$7LgzzU6{19Y-c$KjRg5Fad-XI(oIV4=qO;az01V28tfro@D)+xu}aC zUH|<2>(>lyfl>@b5n5-tMn_D%)z*fnyO1O6#-ns269zsGmhM&mPOdnN=_siwz(R;WD9|(Rw!n_|M5ex>i<-b4C2(B*_mT@<6 zQahUX9?;dlqPG!AB!Hj7>V;Ljv$MNvG1p{WdmoR+*QDVHHbB|hsP29qoSf(QhhO%e z*HdYHf7;_QsaEWt{Cu5^UW5`yU*g7uM-bk5&3`1zv@*0W6^j&?2;Qg`aQmG{*zAWL z^wC}(dr+;uF5tw3GpL*0c=tjd#SiQ@Wu}_aYEpJXIPW%v3&IQ){y4ZHBB;lQ$LiapwY1&W7H8_pijqJPA~squ+O+9A!(XToH?=?q z2@&JP`Ju(JSq7zJ>sB$eVC45uvTX*=YL>7)%RK}sDQzm&s|6hFwvPwe4-P*9f%xRF z(DjjA_I>HaDCp-?5LMsRP*)PFSl9*!B=zO(-Tp%mW3n783rKEkl&9XMoPpOX>9eD? zm6evJ<{r#QMw5b^d>+cWwt{+(3tO98BA>5d(9n3xi5iLmMtHR+M}t_(6%glpOXW9V z{l*>b6+AMZS`_&`E*g@HF_xplhooDI)r60n7j=@AdV-jaD7qH95*>nwhG7XCa}5Z} zD}`9a^?9yF%Kn#VaTy8y5yy9zuVQ~9qmF=v0(2?U*MHc_yi*oTF+CNZVN`L!=*awF$P6svZAZsofAyJfO%qq=n)RpmcFKo{Z1~(=0{x{^*dmIf|)KaCv|wl zeK>CgtlxTidiv}>7t>I`X<)V=pCh35(bp%%CtYMLNtBo~)jU`AB<7v`qKoN+5UvFNtr2lQL1v>U0Z~iPzjq|b= zB549mGBGok(lEXG@Bz~+sVB6sP{Kfru2F_`J~~={tQ^%+CZg^lI)j2v#yvJ8IlH1( zBRfb^G5aHY_gIj$O0R+lCqFloVY{p)^MLmZ*38Kx?0-_w@C|3h+QUJ_E=M8kAgJE; z(&f_qq0&p&=*6!zH%Dbz&J`23SLm3C{)B(r&xgR0{ODm(u}4|9f1DE-G7ws1lX`6w z_)CPqvxN~J?}(B;6+;UCe5 zI++<-E*{=%-!s{?u^o;FsH#j{u8-pw-$H7yljq6Nz7w9Pm(KU*nb&vGER9;=pY1DA(tP>Q{99F5%X=@Ny_O+hDtdI( zJ(G`;oPq!!KV8T;c6YSD|K+3m!}iq(F|Q?&SkR)fx_)8jy3aWNF$My@*UPYo+jYMm zxQN@f5dEzC{1d}INbELh_Svebzv^tvF<{!xkx`h(cMDV3*s#2_ayCsp+g+&;G1k8VE8)9Uz{`?;U#qobz`o{_s5)u-fIn+h^U|BY!`*8?QU-yB*~N;xK76LbwLX@ z%M#PuNUw!i86xV1zM^3$OB*RKwLbb3+&jAVo$A@HQawyIzCC`PtfPL&w#|Ox`>;~_ zhHjv2mU$4_=lDNnPXBE&`#)iUs>tKwe|J;X0ruFkFY z&yDhMJDDWaXkXPpxM$_%h{Od#84Ui*;`rt{Pg<=-o~9K8Iih~?MM_DsO57;;e*we= BL(BjG literal 0 HcmV?d00001 From 4d15831e30292d6f914e59b8c4db3e6956b439ad Mon Sep 17 00:00:00 2001 From: jonaswinkler Date: Sun, 6 Dec 2020 01:37:44 +0100 Subject: [PATCH 121/121] fixes #87 --- docker/docker-entrypoint.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh index dfa7cfc65..e2338842b 100644 --- a/docker/docker-entrypoint.sh +++ b/docker/docker-entrypoint.sh @@ -23,8 +23,9 @@ wait_for_postgres() { echo "Waiting for PostgreSQL to start..." host="${PAPERLESS_DBHOST}" + port="${PAPERLESS_DBPORT}" - while !