From 9a33f191a7dc9fa796cf498650eebe1b744c0217 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:45:21 +0100 Subject: [PATCH 01/42] added archive directory. --- src/documents/consumer.py | 1 + src/paperless/settings.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 65febc937..b273d331d 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -65,6 +65,7 @@ class Consumer(LoggingMixin): os.makedirs(settings.SCRATCH_DIR, exist_ok=True) os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) + os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) def try_consume_file(self, path, diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 1432dc5ec..66f9fee4b 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -49,6 +49,7 @@ STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "sta MEDIA_ROOT = os.getenv('PAPERLESS_MEDIA_ROOT', os.path.join(BASE_DIR, "..", "media")) ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals") +ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive") THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails") DATA_DIR = os.getenv('PAPERLESS_DATA_DIR', os.path.join(BASE_DIR, "..", "data")) From 8069c2eb6a74de095e1d25de287bacb5bfa60107 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:47:01 +0100 Subject: [PATCH 02/42] add support for archive files. 
--- src/documents/consumer.py | 17 +++++++++++++---- src/documents/models.py | 13 +++++++++++++ src/documents/parsers.py | 3 +++ src/documents/signals/handlers.py | 16 +++++++++++----- 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b273d331d..b6a0a5912 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -134,6 +134,7 @@ class Consumer(LoggingMixin): self.log("debug", "Parsing {}...".format(self.filename)) text = document_parser.get_text() date = document_parser.get_date() + archive_path = document_parser.get_archive_path() except ParseError as e: document_parser.cleanup() raise ConsumerError(e) @@ -178,8 +179,16 @@ class Consumer(LoggingMixin): # place. If this fails, we'll also rollback the transaction. create_source_path_directory(document.source_path) - self._write(document, self.path, document.source_path) - self._write(document, thumbnail, document.thumbnail_path) + + self._write(document.storage_type, + self.path, document.source_path) + + self._write(document.storage_type, + thumbnail, document.thumbnail_path) + + if archive_path and os.path.isfile(archive_path): + self._write(Document.STORAGE_TYPE_UNENCRYPTED, + archive_path, document.archive_path) # Delete the file only if it was successfully consumed self.log("debug", "Deleting file {}".format(self.path)) @@ -258,10 +267,10 @@ class Consumer(LoggingMixin): for tag_id in self.override_tag_ids: document.tags.add(Tag.objects.get(pk=tag_id)) - def _write(self, document, source, target): + def _write(self, storage_type, source, target): with open(source, "rb") as read_file: with open(target, "wb") as write_file: - if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: + if storage_type == Document.STORAGE_TYPE_UNENCRYPTED: write_file.write(read_file.read()) return self.log("debug", "Encrypting") diff --git a/src/documents/models.py b/src/documents/models.py index 8e0435647..c1ab9a44d 100755 --- 
a/src/documents/models.py +++ b/src/documents/models.py @@ -224,6 +224,19 @@ class Document(models.Model): def source_file(self): return open(self.source_path, "rb") + @property + def archive_path(self): + fname = "{:07}{}".format(self.pk, ".pdf") + + return os.path.join( + settings.ARCHIVE_DIR, + fname + ) + + @property + def archive_file(self): + return open(self.archive_path, "rb") + @property def file_name(self): return slugify(str(self)) + self.file_type diff --git a/src/documents/parsers.py b/src/documents/parsers.py index eb8ccf45e..3ad60dccd 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -141,6 +141,9 @@ class DocumentParser(LoggingMixin): self.tempdir = tempfile.mkdtemp( prefix="paperless-", dir=settings.SCRATCH_DIR) + def get_archive_path(self): + return None + def get_thumbnail(self): """ Returns the path to a file we can use as a thumbnail for this document. diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index f83f88783..9672b884b 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -168,11 +168,17 @@ def run_post_consume_script(sender, document, **kwargs): @receiver(models.signals.post_delete, sender=Document) def cleanup_document_deletion(sender, instance, using, **kwargs): - for f in (instance.source_path, instance.thumbnail_path): - try: - os.unlink(f) - except FileNotFoundError: - pass # The file's already gone, so we're cool with it. 
+ for f in (instance.source_path, + instance.archive_path, + instance.thumbnail_path): + if os.path.isfile(f): + try: + os.unlink(f) + except OSError as e: + logging.getLogger(__name__).warning( + f"While deleting document {instance.file_name}, the file " + f"{f} could not be deleted: {e}" + ) delete_empty_directories(os.path.dirname(instance.source_path)) From 95ec520f137cfc468fd976b5be40fcc452a13ccb Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:48:36 +0100 Subject: [PATCH 03/42] api serves archive files by default. --- src/documents/views.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/documents/views.py b/src/documents/views.py index 14323e933..5c8a0d9b9 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,3 +1,5 @@ +import os + from django.db.models import Count, Max from django.http import HttpResponse, HttpResponseBadRequest, Http404 from django.views.decorators.cache import cache_control @@ -126,15 +128,25 @@ class DocumentViewSet(RetrieveModelMixin, index.remove_document_from_index(self.get_object()) return super(DocumentViewSet, self).destroy(request, *args, **kwargs) - def file_response(self, pk, disposition): - doc = Document.objects.get(id=pk) + @staticmethod + def original_requested(request): + return ( + 'original' in request.query_params and + request.query_params['original'] == 'true' + ) - if doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: + def file_response(self, pk, request, disposition): + doc = Document.objects.get(id=pk) + mime_type = doc.mime_type + if not self.original_requested(request) and os.path.isfile(doc.archive_path): + file_handle = doc.archive_file + mime_type = 'application/pdf' + elif doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: file_handle = doc.source_file else: file_handle = GnuPG.decrypted(doc.source_file) - response = HttpResponse(file_handle, content_type=doc.mime_type) + response = HttpResponse(file_handle, 
content_type=mime_type) response["Content-Disposition"] = '{}; filename="{}"'.format( disposition, doc.file_name) return response @@ -152,7 +164,8 @@ class DocumentViewSet(RetrieveModelMixin, @action(methods=['get'], detail=True) def preview(self, request, pk=None): try: - response = self.file_response(pk, "inline") + response = self.file_response( + pk, request, "inline") return response except FileNotFoundError: raise Http404("Document source file does not exist") @@ -169,7 +182,8 @@ class DocumentViewSet(RetrieveModelMixin, @action(methods=['get'], detail=True) def download(self, request, pk=None): try: - return self.file_response(pk, "attachment") + return self.file_response( + pk, request, "attachment") except FileNotFoundError: raise Http404("Document source file does not exist") From 2d559d330d72800d5859d5eb8c6eff15c1ea4d0a Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:50:43 +0100 Subject: [PATCH 04/42] reworked PDF parser that uses OCRmyPDF and produces archive files. 
--- Pipfile | 2 +- Pipfile.lock | 292 +++++++++++++++++++++++++++-- docs/configuration.rst | 34 +++- paperless.conf.example | 3 +- src/documents/parsers.py | 17 -- src/paperless/settings.py | 8 +- src/paperless_tesseract/parsers.py | 204 ++++++-------------- 7 files changed, 374 insertions(+), 186 deletions(-) diff --git a/Pipfile b/Pipfile index ad60e0905..079037f15 100644 --- a/Pipfile +++ b/Pipfile @@ -23,7 +23,6 @@ langdetect = "*" pdftotext = "*" pathvalidate = "*" pillow = "*" -pyocr = "~=0.7.2" python-gnupg = "*" python-dotenv = "*" python-dateutil = "*" @@ -35,6 +34,7 @@ scikit-learn="~=0.23.2" whitenoise = "~=5.2.0" watchdog = "*" whoosh="~=2.7.4" +ocrmypdf = "*" [dev-packages] coveralls = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 6ecca3c34..39c35c2d9 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "ae2643b9cf0cf5741ae149fb6bc0c480de41329ce48e773eb4b5d760bc5e2244" + "sha256": "cf1c008df0080c01273c032aef59bd841e4f27b66beaf3fa459665a7a7a4fcc4" }, "pipfile-spec": 6, "requires": {}, @@ -42,6 +42,94 @@ ], "version": "==1.17.11" }, + "cffi": { + "hashes": [ + "sha256:00a1ba5e2e95684448de9b89888ccd02c98d512064b4cb987d48f4b40aa0421e", + "sha256:00e28066507bfc3fe865a31f325c8391a1ac2916219340f87dfad602c3e48e5d", + "sha256:045d792900a75e8b1e1b0ab6787dd733a8190ffcf80e8c8ceb2fb10a29ff238a", + "sha256:0638c3ae1a0edfb77c6765d487fee624d2b1ee1bdfeffc1f0b58c64d149e7eec", + "sha256:105abaf8a6075dc96c1fe5ae7aae073f4696f2905fde6aeada4c9d2926752362", + "sha256:155136b51fd733fa94e1c2ea5211dcd4c8879869008fc811648f16541bf99668", + "sha256:1a465cbe98a7fd391d47dce4b8f7e5b921e6cd805ef421d04f5f66ba8f06086c", + "sha256:1d2c4994f515e5b485fd6d3a73d05526aa0fcf248eb135996b088d25dfa1865b", + "sha256:23f318bf74b170c6e9adb390e8bd282457f6de46c19d03b52f3fd042b5e19654", + "sha256:2c24d61263f511551f740d1a065eb0212db1dbbbbd241db758f5244281590c06", + "sha256:51a8b381b16ddd370178a65360ebe15fbc1c71cf6f584613a7ea08bfad946698", + 
"sha256:594234691ac0e9b770aee9fcdb8fa02c22e43e5c619456efd0d6c2bf276f3eb2", + "sha256:5cf4be6c304ad0b6602f5c4e90e2f59b47653ac1ed9c662ed379fe48a8f26b0c", + "sha256:64081b3f8f6f3c3de6191ec89d7dc6c86a8a43911f7ecb422c60e90c70be41c7", + "sha256:6bc25fc545a6b3d57b5f8618e59fc13d3a3a68431e8ca5fd4c13241cd70d0009", + "sha256:798caa2a2384b1cbe8a2a139d80734c9db54f9cc155c99d7cc92441a23871c03", + "sha256:7c6b1dece89874d9541fc974917b631406233ea0440d0bdfbb8e03bf39a49b3b", + "sha256:840793c68105fe031f34d6a086eaea153a0cd5c491cde82a74b420edd0a2b909", + "sha256:8d6603078baf4e11edc4168a514c5ce5b3ba6e3e9c374298cb88437957960a53", + "sha256:9cc46bc107224ff5b6d04369e7c595acb700c3613ad7bcf2e2012f62ece80c35", + "sha256:9f7a31251289b2ab6d4012f6e83e58bc3b96bd151f5b5262467f4bb6b34a7c26", + "sha256:9ffb888f19d54a4d4dfd4b3f29bc2c16aa4972f1c2ab9c4ab09b8ab8685b9c2b", + "sha256:a7711edca4dcef1a75257b50a2fbfe92a65187c47dab5a0f1b9b332c5919a3fb", + "sha256:af5c59122a011049aad5dd87424b8e65a80e4a6477419c0c1015f73fb5ea0293", + "sha256:b18e0a9ef57d2b41f5c68beefa32317d286c3d6ac0484efd10d6e07491bb95dd", + "sha256:b4e248d1087abf9f4c10f3c398896c87ce82a9856494a7155823eb45a892395d", + "sha256:ba4e9e0ae13fc41c6b23299545e5ef73055213e466bd107953e4a013a5ddd7e3", + "sha256:be8661bcee1bc2fc4b033a6ab65bd1f87ce5008492601695d0b9a4e820c3bde5", + "sha256:c6332685306b6417a91b1ff9fae889b3ba65c2292d64bd9245c093b1b284809d", + "sha256:d9efd8b7a3ef378dd61a1e77367f1924375befc2eba06168b6ebfa903a5e59ca", + "sha256:df5169c4396adc04f9b0a05f13c074df878b6052430e03f50e68adf3a57aa28d", + "sha256:ebb253464a5d0482b191274f1c8bf00e33f7e0b9c66405fbffc61ed2c839c775", + "sha256:ec80dc47f54e6e9a78181ce05feb71a0353854cc26999db963695f950b5fb375", + "sha256:f032b34669220030f905152045dfa27741ce1a6db3324a5bc0b96b6c7420c87b", + "sha256:f60567825f791c6f8a592f3c6e3bd93dd2934e3f9dac189308426bd76b00ef3b", + "sha256:f803eaa94c2fcda012c047e62bc7a51b0bdabda1cad7a92a522694ea2d76e49f" + ], + "version": "==1.14.4" + }, + "chardet": { + "hashes": [ + 
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "markers": "python_version >= '3.1'", + "version": "==3.0.4" + }, + "coloredlogs": { + "hashes": [ + "sha256:346f58aad6afd48444c2468618623638dadab76e4e70d5e10822676f2d32226a", + "sha256:a1fab193d2053aa6c0a97608c4342d031f1f93a3d1218432c59322441d31a505", + "sha256:b0c2124367d4f72bd739f48e1f61491b4baf145d6bda33b606b4a53cb3f96a97" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==14.0" + }, + "cryptography": { + "hashes": [ + "sha256:07ca431b788249af92764e3be9a488aa1d39a0bc3be313d826bbec690417e538", + "sha256:13b88a0bd044b4eae1ef40e265d006e34dbcde0c2f1e15eb9896501b2d8f6c6f", + "sha256:257dab4f368fae15f378ea9a4d2799bf3696668062de0e9fa0ebb7a738a6917d", + "sha256:32434673d8505b42c0de4de86da8c1620651abd24afe91ae0335597683ed1b77", + "sha256:3cd75a683b15576cfc822c7c5742b3276e50b21a06672dc3a800a2d5da4ecd1b", + "sha256:4e7268a0ca14536fecfdf2b00297d4e407da904718658c1ff1961c713f90fd33", + "sha256:545a8550782dda68f8cdc75a6e3bf252017aa8f75f19f5a9ca940772fc0cb56e", + "sha256:55d0b896631412b6f0c7de56e12eb3e261ac347fbaa5d5e705291a9016e5f8cb", + "sha256:5849d59358547bf789ee7e0d7a9036b2d29e9a4ddf1ce5e06bb45634f995c53e", + "sha256:59f7d4cfea9ef12eb9b14b83d79b432162a0a24a91ddc15c2c9bf76a68d96f2b", + "sha256:6dc59630ecce8c1f558277ceb212c751d6730bd12c80ea96b4ac65637c4f55e7", + "sha256:7117319b44ed1842c617d0a452383a5a052ec6aa726dfbaffa8b94c910444297", + "sha256:75e8e6684cf0034f6bf2a97095cb95f81537b12b36a8fedf06e73050bb171c2d", + "sha256:7b8d9d8d3a9bd240f453342981f765346c87ade811519f98664519696f8e6ab7", + "sha256:a035a10686532b0587d58a606004aa20ad895c60c4d029afa245802347fab57b", + "sha256:a4e27ed0b2504195f855b52052eadcc9795c59909c9d84314c5408687f933fc7", + "sha256:a733671100cd26d816eed39507e585c156e4498293a907029969234e5e634bc4", + 
"sha256:a75f306a16d9f9afebfbedc41c8c2351d8e61e818ba6b4c40815e2b5740bb6b8", + "sha256:bd717aa029217b8ef94a7d21632a3bb5a4e7218a4513d2521c2a2fd63011e98b", + "sha256:d25cecbac20713a7c3bc544372d42d8eafa89799f492a43b79e1dfd650484851", + "sha256:d26a2557d8f9122f9bf445fc7034242f4375bd4e95ecda007667540270965b13", + "sha256:d3545829ab42a66b84a9aaabf216a4dce7f16dbc76eb69be5c302ed6b8f4a29b", + "sha256:d3d5e10be0cf2a12214ddee45c6bd203dab435e3d83b4560c03066eda600bfe3", + "sha256:efe15aca4f64f3a7ea0c09c87826490e50ed166ce67368a68f315ea0807a20df" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.2.1" + }, "dateparser": { "hashes": [ "sha256:7552c994f893b5cb8fcf103b4cd2ff7f57aab9bfd2619fdf0cf571c0740fd90b", @@ -121,6 +209,14 @@ "index": "pypi", "version": "==20.0.4" }, + "humanfriendly": { + "hashes": [ + "sha256:bf52ec91244819c780341a3438d5d7b09f431d3f113a475147ac9b7b167a3d12", + "sha256:e78960b31198511f45fd455534ae7645a6207d33e512d2e842c766d15d9c8080" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==8.2" + }, "imap-tools": { "hashes": [ "sha256:96e9a4ff6483462635737730a1df28e739faa71967b12a84f4363fb386542246", @@ -129,6 +225,13 @@ "index": "pypi", "version": "==0.32.0" }, + "img2pdf": { + "hashes": [ + "sha256:57905015579b1026acf1605aa95859cd79b051fa1c35485573d165526fc9dbb5", + "sha256:eaee690ab8403dd1a9cb4db10afee41dd3e6c7ed63bdace02a0121f9feadb0c9" + ], + "version": "==0.4.0" + }, "joblib": { "hashes": [ "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72", @@ -146,6 +249,51 @@ "index": "pypi", "version": "==1.0.8" }, + "lxml": { + "hashes": [ + "sha256:098fb713b31050463751dcc694878e1d39f316b86366fb9fe3fbbe5396ac9fab", + "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b", + "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5", + 
"sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301", + "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b", + "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d", + "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b", + "sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9", + "sha256:288ddf94d9d0488187f578fdcc1868af2a6fe6714444c8259b68a83fa27b76d2", + "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b", + "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311", + "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891", + "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a", + "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1", + "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856", + "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810", + "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51", + "sha256:4e006fdb434609956a8f710ffffe650afab414dc43728786ebdbdca48e179b14", + "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360", + "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4", + "sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f", + "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230", + "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a", + "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f", + "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174", + "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf", + "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd", + "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3", + "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a", 
+ "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5", + "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367", + "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c", + "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1", + "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8", + "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f", + "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc", + "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d", + "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9", + "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==4.6.1" + }, "numpy": { "hashes": [ "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db", @@ -187,6 +335,14 @@ "markers": "python_version >= '3.6'", "version": "==1.19.4" }, + "ocrmypdf": { + "hashes": [ + "sha256:20722d89d2f0deeb5b3ffa8622ead59d54af46d44f21848ec0f15ef79ce1a4a3", + "sha256:c592e1bb37abafd24f067043bbf98d25405521cbe1e992de30d8b870dbe86928" + ], + "index": "pypi", + "version": "==11.3.3" + }, "pathtools": { "hashes": [ "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0", @@ -202,6 +358,14 @@ "index": "pypi", "version": "==2.3.0" }, + "pdfminer.six": { + "hashes": [ + "sha256:b9aac0ebeafb21c08bf65f2039f4b2c5f78a3449d0a41df711d72445649e952a", + "sha256:d78877ba8d8bf957f3bb636c4f73f4f6f30f56c461993877ac22c39c20837509" + ], + "markers": "python_version >= '3.4'", + "version": "==20201018" + }, "pdftotext": { "hashes": [ "sha256:98aeb8b07a4127e1a30223bd933ef080bbd29aa88f801717ca6c5618380b8aa6" @@ -209,6 +373,33 @@ "index": "pypi", "version": "==2.1.5" }, + "pikepdf": { + "hashes": [ + 
"sha256:0dd42f791f29e7e2ab120103605b9ddd65937c773a72d21341a56873a89e76c9", + "sha256:12a1d243143cf972ce11def50f0bd1f6e630f5e660cdeddb2c7c49db5adad40a", + "sha256:2e1713af11b71e95c2d218c10d68b6f8e813be19c8596c560f3c84617f6d5437", + "sha256:2f90acad26d9939193946eb6ca8363fd3cf44b46b5c1409468906618bccb8113", + "sha256:3c482fe30fd58ff385795605a9233f37f97fb83427c3e829b1a568a2a3b59f60", + "sha256:3ddabfc33a8a7cecba76c1685ce5125fdf239a38d0854d7c2a703490b5783773", + "sha256:61dd3f13b7416111d19bf493ce4e7281f63a1dd22c532200cbbcd65813ea43e4", + "sha256:6ce42b7780835fb52452ccaff3a3ac1b28ae1f9d80faab59c559045d9fcb211d", + "sha256:6dba75782f108ebbf3947fcb29ea0ba7da0482868e53f6602643adc36245201d", + "sha256:716427a5c0372f3cc7dc282c4b49d49d8d5182a3e937739a4c3632151e74d6a4", + "sha256:730ef4013099da7ea722a9b5659260097af6f47ddfa3c2abab4d4493de2591f3", + "sha256:73e14bba4135adfb89ae2f2163369bd788ecf23839acc8d062d832118f07e288", + "sha256:84df07acc8968051da33891af55a3ab1aa55453d83df4ce9b84d821eedc34583", + "sha256:8f739e9c660d71cd479f11f9aa110857cf0d0d9c2472f40bbcbaf02f980355a1", + "sha256:a20ca7adbb9d3da416cf5f6de0ebca53855f9a3b99acdd6ec864c61482894d71", + "sha256:bc58d9486c0959619a2584e558a54d36468c6d1165cd9fe0bfb1ecc3e6b33c6a", + "sha256:c0627930a17b3a5e1a7c9109099535259afc50fe006a05af9c3634de05abd318", + "sha256:de5f445eaaadd7dae56e1043ab8ca5eef49ece302a4e37e1fc6d21b7dcfcfb1b", + "sha256:de6aae7782db33f2cc71c9ba63b7e2ec0e0529843c065eac4e71fcbe043426e2", + "sha256:e2efd844c09f8ce3103a93bfbd54983542a0a63c88bdc0f0cdbb2997f99a147d", + "sha256:fdb481ad1219e8d667625afd2f01b26f98df079e4f66e7e49816ec20c8d8c401" + ], + "markers": "python_version < '3.9'", + "version": "==2.1.2" + }, "pillow": { "hashes": [ "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a", @@ -244,6 +435,14 @@ "index": "pypi", "version": "==8.0.1" }, + "pluggy": { + "hashes": [ + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + 
"sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.13.1" + }, "psycopg2-binary": { "hashes": [ "sha256:0deac2af1a587ae12836aa07970f5cb91964f05a7c6cdb69d8425ff4c15d4e2c", @@ -287,13 +486,13 @@ "index": "pypi", "version": "==2.8.6" }, - "pyocr": { + "pycparser": { "hashes": [ - "sha256:fa15adc7e1cf0d345a2990495fe125a947c6e09a60ddba0256a1c14b2e603179", - "sha256:fd602af17b6e21985669aadc058a95f343ff921e962ed4aa6520ded32e4d1301" + "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0", + "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705" ], - "index": "pypi", - "version": "==0.7.2" + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.20" }, "python-dateutil": { "hashes": [ @@ -401,6 +600,53 @@ ], "version": "==2020.11.13" }, + "reportlab": { + "hashes": [ + "sha256:06be7f04a631f02cd0202f7dee0d3e61dc265223f4ff861525ed7784b5552540", + "sha256:0a788a537c48915eda083485b59ac40ac012fa7c43070069bde6eb5ea588313c", + "sha256:1a7a38810e79653d0ea8e61db4f0517ac2a0e76edd2497cf6d4969dd3be30030", + "sha256:22301773db730545b44d4c77d8f29baf5683ccabec9883d978e8b8eda6d2175f", + "sha256:2906321b3d2779faafe47e2c13f9c69e1fb4ddb907f5a49cab3f9b0ea95df1f5", + "sha256:2d65f9cc5c0d3f63b5d024e6cf92234f1ab1f267cc9e5a847ab5d3efe1c3cf3e", + "sha256:2e012f7b845ef9f1f5bd63461d5201fa624b019a65ff5a93d0002b4f915bbc89", + "sha256:31ccfdbf5bb5ec85f0397661085ce4c9e52537ca0d2bf4220259666a4dcc55c2", + "sha256:3e10bd20c8ada9f7e1113157aa73b8e0048f2624e74794b73799c3deb13d7a3f", + "sha256:440d5f86c2b822abdb7981d691a78bdcf56f4710174830283034235ab2af2969", + "sha256:4f307accda32c9f17015ed77c7424f904514e349dff063f78d2462d715963e53", + "sha256:59659ee8897950fd1acd41a9cc61f4afdfda52dc2bb69a1924ce68089491849d", + "sha256:6216b11313467989ac9d9578ea3756d0af46e97184ee4e11a6b7ef652458f70d", + 
"sha256:6268a9a3d75e714b22beeb7687270956b06b232ccfdf37b1c6462961eab04457", + "sha256:6b226830f80df066d5986a3fdb3eb4d1b6320048f3d9ade539a6c03a5bc8b3ec", + "sha256:6e10eba6a0e330096f4200b18824b3194c399329b7830e34baee1c04ea07f99f", + "sha256:6e224c16c3d6fafdb2fb67b33c4b84d984ec34869834b3a137809f2fe5b84778", + "sha256:7da162fa677b90bd14f19b20ff80fec18c24a31ac44e5342ba49e198b13c4f92", + "sha256:8406e960a974a65b765c9ff74b269aa64718b4af1e8c511ebdbd9a5b44b0c7e6", + "sha256:8999bb075102d1b8ca4aada6ca14653d52bf02e37fd064e477eb180741f75077", + "sha256:8ae21aa94e405bf5171718f11ebc702a0edf18c91d88b14c5c5724cabd664673", + "sha256:8f6163729612e815b89649aed2e237505362a78014199f819fd92f9e5c96769b", + "sha256:9699fa8f0911ad56b46cc60bbaebe1557fd1c9e8da98185a7a1c0c40193eba48", + "sha256:9a53d76eec33abda11617aad1c9f5f4a2d906dd2f92a03a3f1ea370efbb52c95", + "sha256:9ed4d761b726ff411565eddb10cb37a6bca0ec873d9a18a83cf078f4502a2d94", + "sha256:a020d308e7c2de284d5407e3c6c13e3977a62b314f7bfe19bcc69677931da589", + "sha256:a2e6c15aecbe631245aab639751a58671312cced7e17de1ed9c45fb37036f6c9", + "sha256:b10cb48606d97b70edb094576e3d493d40467395e4fc267655135a2c92defbe8", + "sha256:b8d6e9df5181ed07b7ae145258eb69e686133afc97930af51a3c0c9d784d834d", + "sha256:bbb297754f5cf25eb8fcb817752984252a7feb0ca83e383718e4eec2fb67ea32", + "sha256:be90599e5e78c1ddfcfee8c752108def58b4c672ebcc4d3d9aa7fe65e7d3f16b", + "sha256:bfdfad9b8ae00bd0752b77f954c7405327fd99b2cc6d5e4273e65be61429d56a", + "sha256:c1e5ef5089e16b249388f65d8c8f8b74989e72eb8332060dc580a2ecb967cfc2", + "sha256:c5ed342e29a5fd7eeb0f2ccf7e5b946b5f750f05633b2d6a94b1c02094a77967", + "sha256:c7087a26b26aa82a3ba27e13e66f507cc697f9ceb4c046c0f758876b55f040a5", + "sha256:cf589e980d92b0bf343fa512b9d3ae9ed0469cbffd99cb270b6c83da143cb437", + "sha256:e6fb762e524a4fb118be9f44dbd9456cf80e42253ee8f1bdb0ea5c1f882d4ba8", + "sha256:e961d3a84c65ca030963ca934a4faad2ac9fee75af36ba2f98733da7d3f7efab", + "sha256:f2fde5abb6f21c1eff5430f380cdbbee7fdeda6af935a83730ddce9f0c4e504e", 
+ "sha256:f585b3bf7062c228306acd7f40b2ad915b32603228c19bb225952cc98fd2015a", + "sha256:f955a6366cf8e6729776c96e281bede468acd74f6eb49a5bbb048646adaa43d8", + "sha256:fe882fd348d8429debbdac4518d6a42888a7f4ad613dc596ce94788169caeb08" + ], + "version": "==3.5.55" + }, "scikit-learn": { "hashes": [ "sha256:090bbf144fd5823c1f2efa3e1a9bf180295b24294ca8f478e75b40ed54f8036e", @@ -464,6 +710,13 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==1.15.0" }, + "sortedcontainers": { + "hashes": [ + "sha256:37257a32add0a3ee490bb170b599e93095eed89a55da91fa9f48753ea12fd73f", + "sha256:59cc937650cf60d677c16775597c89a960658a09cf7c1a668f86e1e4464b10a1" + ], + "version": "==2.3.0" + }, "sqlparse": { "hashes": [ "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0", @@ -480,6 +733,14 @@ "markers": "python_version >= '3.5'", "version": "==2.1.0" }, + "tqdm": { + "hashes": [ + "sha256:3d3f1470d26642e88bd3f73353cb6ff4c51ef7d5d7efef763238f4bc1f7e4e81", + "sha256:5ff3f5232b19fa4c5531641e480b7fad4598819f708a32eb815e6ea41c5fa313" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==4.53.0" + }, "tzlocal": { "hashes": [ "sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44", @@ -489,11 +750,11 @@ }, "watchdog": { "hashes": [ - "sha256:034c85530b647486e8c8477410fe79476511282658f2ce496f97106d9e5acfb8", - "sha256:4214e1379d128b0588021880ccaf40317ee156d4603ac388b9adcf29165e0c04" + "sha256:3caefdcc8f06a57fdc5ef2d22aa7c0bfda4f55e71a0bee74cbf3176d97536ef3", + "sha256:e38bffc89b15bafe2a131f0e1c74924cf07dcec020c2e0a26cccd208831fcd43" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.10.4" }, "wcwidth": { "hashes": [ @@ -571,6 +832,7 @@ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" ], + "markers": "python_version >= '3.1'", "version": "==3.0.4" }, 
"coverage": { @@ -663,11 +925,11 @@ }, "faker": { "hashes": [ - "sha256:3f5d379e4b5ce92a8afe3c2ce59d7c43886370dd3bf9495a936b91888debfc81", - "sha256:8c0e8a06acef4b9312902e2ce18becabe62badd3a6632180bd0680c6ee111473" + "sha256:5398268e1d751ffdb3ed36b8a790ed98659200599b368eec38a02eed15bce997", + "sha256:d4183b8f57316de3be27cd6c3b40e9f9343d27c95c96179f027316c58c2c239e" ], "markers": "python_version >= '3.5'", - "version": "==4.17.0" + "version": "==4.17.1" }, "filelock": { "hashes": [ @@ -999,11 +1261,11 @@ }, "virtualenv": { "hashes": [ - "sha256:b0011228208944ce71052987437d3843e05690b2f23d1c7da4263fde104c97a2", - "sha256:b8d6110f493af256a40d65e29846c69340a947669eec8ce784fcf3dd3af28380" + "sha256:07cff122e9d343140366055f31be4dcd61fd598c69d11cd33a9d9c8df4546dd7", + "sha256:e0aac7525e880a429764cefd3aaaff54afb5d9f25c82627563603f5d7de5a6e5" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", - "version": "==20.1.0" + "version": "==20.2.1" } } } diff --git a/docs/configuration.rst b/docs/configuration.rst index c3f01c2ca..ad1c7c117 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -218,11 +218,37 @@ PAPERLESS_OCR_LANGUAGE= Defaults to "eng". -PAPERLESS_OCR_ALWAYS= - By default Paperless does not OCR a document if the text can be retrieved from - the document directly. Set to true to always OCR documents. +PAPERLESS_OCR_MODE= + Tell paperless when and how to perform ocr on your documents. Three modes + are available: - Defaults to false. + * ``skip``: Paperless skips all pages and will perform ocr only on pages + where no text is present. This is the safest and fastest option. + * ``redo``: Paperless will OCR all pages of your documents and attempt to + replace any existing text layers with new text. This will be useful for + documents from scanners that already performed OCR with insufficient + results. It will also perform OCR on purely digital documents. 
+ + This option may fail on some documents that have features that cannot + be removed, such as forms. In this case, the text from the document is + used instead. + * ``force``: Paperless rasterizes your documents, converting any text + into images and puts the OCRed text on top. This works for all documents, + however, the resulting document may be significantly larger and text + won't appear as sharp when zoomed in. + + The default is ``skip``, which only performs OCR when necessary. + +PAPERLESS_OCR_OUTPUT_TYPE= + Specify the type of PDF documents that paperless should produce. + + * ``pdf``: Modify the PDF document as little as possible. + * ``pdfa``: Convert PDF documents into PDF/A documents, which is a + subset of the entire PDF specification and meant for storing + documents long term. + + If not specified, ``pdfa`` is used. Remember that paperless also keeps + the original input file as well as the archived version. PAPERLESS_CONSUMER_POLLING= If paperless won't find documents added to your consume folder, it might diff --git a/paperless.conf.example b/paperless.conf.example index 4962c1567..34e560507 100644 --- a/paperless.conf.example +++ b/paperless.conf.example @@ -38,7 +38,8 @@ #PAPERLESS_TIME_ZONE=UTC #PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_LANGUAGE=eng -#PAPERLESS_OCR_ALWAYS=false +#PAPERLESS_OCR_OUTPUT_TYPE=pdfa +#PAPERLESS_OCR_MODE=skip #PAPERLESS_CONSUMER_POLLING=10 #PAPERLESS_CONSUMER_DELETE_DUPLICATES=false #PAPERLESS_CONVERT_MEMORY_LIMIT=0 diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 3ad60dccd..542a5dae9 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -107,23 +107,6 @@ def run_convert(input_file, raise ParseError("Convert failed at {}".format(args)) -def run_unpaper(pnm, logging_group=None): - pnm_out = pnm.replace(".pnm", ".unpaper.pnm") - - command_args = (settings.UNPAPER_BINARY, "--overwrite", "--quiet", pnm, - pnm_out) - - logger.debug(f"Execute: {' '.join(command_args)}", - 
extra={'group': logging_group}) - - if not subprocess.Popen(command_args, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL).wait() == 0: - raise ParseError(f"Unpaper failed at {command_args}") - - return pnm_out - - class ParseError(Exception): pass diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 66f9fee4b..5cede45c4 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -338,9 +338,13 @@ OCR_PAGES = int(os.getenv('PAPERLESS_OCR_PAGES', 0)) # documents. It should be a 3-letter language code consistent with ISO 639. OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") +# OCRmyPDF --output-type options are available. +# TODO: validate this setting. +OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa") -# OCR all documents? -OCR_ALWAYS = __get_boolean("PAPERLESS_OCR_ALWAYS", "false") +# OCR mode, one of: skip, redo, force +# TODO: validate this. +OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b8320a4f0..8f694ef56 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,23 +1,14 @@ -import itertools import os import re import subprocess -from multiprocessing.pool import ThreadPool import langdetect +import ocrmypdf import pdftotext -import pyocr -from PIL import Image from django.conf import settings -from pyocr import PyocrException +from ocrmypdf import InputFileError -from documents.parsers import DocumentParser, ParseError, run_unpaper, \ - run_convert -from .languages import ISO639 - - -class OCRError(Exception): - pass +from documents.parsers import DocumentParser, ParseError, run_convert class RasterisedDocumentParser(DocumentParser): @@ -29,6 +20,7 @@ class RasterisedDocumentParser(DocumentParser): def __init__(self, path, logging_group): super().__init__(path, logging_group) self._text = None + 
self._archive_path = None def get_thumbnail(self): """ @@ -74,113 +66,67 @@ class RasterisedDocumentParser(DocumentParser): return out_path - def _is_ocred(self): - - # Extract text from PDF using pdftotext - text = get_text_from_pdf(self.document_path) - - # We assume, that a PDF with at least 50 characters contains text - # (so no OCR required) - return len(text) > 50 - def get_text(self): - if self._text is not None: + if self._text: return self._text - if not settings.OCR_ALWAYS and self._is_ocred(): - self.log("debug", "Skipping OCR, using Text from PDF") - self._text = get_text_from_pdf(self.document_path) - return self._text + archive_path = os.path.join(self.tempdir, "archive.pdf") - images = self._get_greyscale() + ocr_args = { + 'input_file': self.document_path, + 'output_file': archive_path, + 'use_threads': True, + 'jobs': settings.THREADS_PER_WORKER, + 'language': settings.OCR_LANGUAGE, + 'output_type': settings.OCR_OUTPUT_TYPE, + 'progress_bar': False, + 'clean': True + } - if not images: - raise ParseError("Empty document, nothing to do.") + if settings.OCR_PAGES > 0: + ocr_args['pages'] = f"1-{settings.OCR_PAGES}" + + if settings.OCR_MODE == 'skip': + ocr_args['skip_text'] = True + elif settings.OCR_MODE == 'redo': + ocr_args['redo_ocr'] = True + elif settings.OCR_MODE == 'force': + ocr_args['force_ocr'] = True try: + ocrmypdf.ocr(**ocr_args) + # success! announce that we have an archive document + self._archive_path = archive_path + self._text = get_text_from_pdf(self._archive_path) - sample_page_index = int(len(images) / 2) - self.log( - "debug", - f"Attempting language detection on page " - f"{sample_page_index + 1} of {len(images)}...") + except InputFileError as e: + # This happens with some PDFs when used with the redo_ocr option. + # This is not the end of the world, we'll just use what we already + # have in the document. + self._text = get_text_from_pdf(self.document_path) + # Also, no archived file. 
+ if not self._text: + # However, if we don't have anything, fail: + raise ParseError(e) - sample_page_text = self._ocr([images[sample_page_index]], - settings.OCR_LANGUAGE)[0] - guessed_language = self._guess_language(sample_page_text) - - if not guessed_language or guessed_language not in ISO639: - self.log("warning", "Language detection failed.") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - elif ISO639[guessed_language] == settings.OCR_LANGUAGE: - self.log( - "debug", - f"Detected language: {guessed_language} " - f"(default language)") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501 - self.log( - "warning", - f"Detected language {guessed_language} is not available " - f"on this system.") - ocr_pages = self._complete_ocr_default_language( - images, sample_page_index, sample_page_text) - - else: - self.log("debug", f"Detected language: {guessed_language}") - ocr_pages = self._ocr(images, ISO639[guessed_language]) - - self.log("debug", "OCR completed.") - self._text = strip_excess_whitespace(" ".join(ocr_pages)) - return self._text - - except OCRError as e: + except Exception as e: + # Anything else is probably serious. raise ParseError(e) - def _get_greyscale(self): - """ - Greyscale images are easier for Tesseract to OCR - """ + if not self._text: + # This may happen for files that don't have any text. + self.log( + 'warning', + f"Document {self.document_path} does not have any text." 
+ f"This is probably an error or you tried to add an image " + f"without text.") + return "" - # Convert PDF to multiple PNMs - input_file = self.document_path + return self._text - if settings.OCR_PAGES == 1: - input_file += "[0]" - elif settings.OCR_PAGES > 1: - input_file += f"[0-{settings.OCR_PAGES - 1}]" - - self.log( - "debug", - f"Converting document {input_file} into greyscale images") - - output_files = os.path.join(self.tempdir, "convert-%04d.pnm") - - run_convert(density=settings.CONVERT_DENSITY, - depth="8", - type="grayscale", - input_file=input_file, - output_file=output_files, - logging_group=self.logging_group) - - # Get a list of converted images - pnms = [] - for f in os.listdir(self.tempdir): - if f.endswith(".pnm"): - pnms.append(os.path.join(self.tempdir, f)) - - self.log("debug", f"Running unpaper on {len(pnms)} pages...") - - # Run unpaper in parallel on converted images - with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: - pnms = pool.map(run_unpaper, pnms) - - return sorted(filter(lambda __: os.path.isfile(__), pnms)) + def get_archive_path(self): + return self._archive_path def _guess_language(self, text): try: @@ -190,30 +136,11 @@ class RasterisedDocumentParser(DocumentParser): self.log('warning', f"Language detection failed with: {e}") return None - def _ocr(self, imgs, lang): - self.log( - "debug", - f"Performing OCR on {len(imgs)} page(s) with language {lang}") - with ThreadPool(processes=settings.THREADS_PER_WORKER) as pool: - r = pool.map(image_to_string, itertools.product(imgs, [lang])) - return r - - def _complete_ocr_default_language(self, - images, - sample_page_index, - sample_page): - images_copy = list(images) - del images_copy[sample_page_index] - if images_copy: - self.log('debug', "Continuing ocr with default language.") - ocr_pages = self._ocr(images_copy, settings.OCR_LANGUAGE) - ocr_pages.insert(sample_page_index, sample_page) - return ocr_pages - else: - return [sample_page] - def 
strip_excess_whitespace(text): + if not text: + return None + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) no_leading_whitespace = re.sub( r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) @@ -222,29 +149,14 @@ def strip_excess_whitespace(text): return no_trailing_whitespace -def image_to_string(args): - img, lang = args - ocr = pyocr.get_available_tools()[0] - with Image.open(img) as f: - if ocr.can_detect_orientation(): - try: - orientation = ocr.detect_orientation(f, lang=lang) - f = f.rotate(orientation["angle"], expand=1) - except Exception: - # Rotation not possible, ignore - pass - try: - return ocr.image_to_string(f, lang=lang) - except PyocrException as e: - raise OCRError(e) - - def get_text_from_pdf(pdf_file): with open(pdf_file, "rb") as f: try: pdf = pdftotext.PDF(f) except pdftotext.Error: - return "" + return None - return "\n".join(pdf) + text = "\n".join(pdf) + + return strip_excess_whitespace(text) From ef6690905e0ef516f2196daf91cd4197c1dcdf0f Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:51:00 +0100 Subject: [PATCH 05/42] todo note. --- src/documents/file_handling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index ee7e9b761..179c97492 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -10,6 +10,7 @@ def create_source_path_directory(source_path): os.makedirs(os.path.dirname(source_path), exist_ok=True) +# TODO: also make this work for archive dir def delete_empty_directories(directory): # Go up in the directory hierarchy and try to delete all directories directory = os.path.normpath(directory) From 56ce267f894d3486a97c43c6cda24dcf226e9d4f Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 14:51:32 +0100 Subject: [PATCH 06/42] removed obsolete tests. 
--- .../tests/samples/no-text.png | Bin 32595 -> 0 bytes src/paperless_tesseract/tests/test_ocr.py | 44 +----- src/paperless_tesseract/tests/test_parser.py | 149 +----------------- 3 files changed, 3 insertions(+), 190 deletions(-) delete mode 100644 src/paperless_tesseract/tests/samples/no-text.png diff --git a/src/paperless_tesseract/tests/samples/no-text.png b/src/paperless_tesseract/tests/samples/no-text.png deleted file mode 100644 index e78b22bfbe53be5a9046bbfb32fef7e98abb9be9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32595 zcmaI7b9AIZw>O%JZQEAI$pjPIHafPAiEZ2F#I|iuJh46D&AjKF@4J88b9?pbr>fT8 zyEcBctGcS5a0NMW1Xx^H5D*XqNeK}p5Rh*)U*%nB$gkfdRpG0z2adC-hO@Grsk57* zqX~$hv7M0#v80WmnTe8#p|OXtPBs(&W6$O9}T0sjr|ul2ner$ zyS*XM%EX!2$i&RTmXGwRqlc8(!kCX#on4k$)?V1e+(N?B(L~u(P6g;`1>`m+72qf4 zb?5mKU}NHJNbGK7ZR^D2&PVzmxjbLxf6Yv!#Q!02w&ElGFQqhO6^MoH98HMX8QB?t z%q#$6PHsj3I~z9uKu^rV3}9sfFf*}mGXMZQY@9p*4&wj*kbd>%Xl%-(BqH|TeSOvV zNX?y{?Rl7(+}zw4-B=mz9L<;j+}zwu%q&bSEDT>13{D=l&W7#`woYXKBSFN(3Fv5H z?`&abOZ-owp^=@7Gau=frvGJvjs5=-YwPsiZu)W*9zm|E5lXkq)WPV*lISy>)QTPJ5jTcC-g2p{Q}2u2GFV;*5vF;*dV4t5bS zVKD$elvRk6MO2stz{MiO$t@zt$@M?HB6dI*8xvdS|KTx4{pCI z{|EUdwqI~}{KB;{o)kFBh@gu5`nfKIx5~&0pM#ql1!LTg*&uM#O@>x!2L`21 z4ehG0uR;Wdn;Sv`2uu*No7RbnkmmOC7vSy zdCAXO{@0hFp%&t@Uk4^w7WEY&a4#`80a*d?Z@Cb=Et-DB6r_By-dj<~s-PRd zf9l;djY`Hy%K-<;hj!TKF&5NggNu?{dwn_O1o{ul{pO~cf{Ica_(t};Lbx^{wh;t1 z6H*(BG%Js3j2?$2&Ck|^j=QTj&pk4;C4gUM{8N%k&?6x=v4}?7%{(O6btk_w5`g5hD|Y#DjfTQA@@98ABu#{T&`H#4t2A+?7X6kdO|XAWHSy$F?`d|^ss zp0JK@{67SVc~yNkq*xa`%LIi-i01d)jD+1o8@%>>f0?bg%>L;=Bc^bB@CV%@0x`Q9 zaJ3R1%`fhtNaJnycVS9t-nF6YzvQ5Omu#40Q6IS#F!Ni4VI*ja?rijK_lS^Cbg)jX z{}-#HgaXKfRDjYgsz#?1Bzp26fm2>i`K=?HkN!v6f2i?h444ODPeOyw6ae9nJ0x<+ zS>eAzxx|v?WJ0DnMc8Pd|LFPokofYrDwN^r#H0%ELw; z*CmZ-1b6m7o&^?D96G0=_kP_O3VK_}He|fk>h!0g~6g5r33SQNuTLHbkfw86J*~3E9n<5 z|Hh9w{9!`L#j!0|OHQ@G3%M63zVK)RbmOsQEQa57Kvkzqw2v`{3AJQ-z3%kCz#=3U zP#e-Gsl|zu=3E%sn%yfNwK<*>PM^0%#BABzEX15NU8?IzS+Ni&UwB|{(`gL+T^SLY 
zPvTL4k-Fc%o!I981oxxohklJwb3s+nllwMr)NpJ{ym3LV@gEDyaHOIL z)-$;Z;{*k4eKL<+&L%k9{M`CG5;ZK2tL+#2Z*ZL^7nC6_;B#Z(xIa2juvAGD&it}n zcO9LWFnZfGRNz#pblNU@cJ6E&G55f#r6Q6uy#o0Mq(!IDH!6^fZawSoHZ3spYDR+1?nC^^$NJiyEr9g#NJKkMR|be+sQoW& z^rOG%Tg75d5@8@X?@W%34}7?Y1FS9e#YBs!85#Za+ia^xbOdfd@@L3S4iQO3MeB5D z{(H#4PEMXf65NlB`^|b8V@ZU;-O6aD8({9t z_L}3bxk6s$`2Ngk9eCoyCF{W0+qmbu=7lZyUU|oPo~ovF@xFvqQ$72KV0z zakZ@ha%YaavP$gJrMm}~&&ERc*E;?)z4h-%W`q?ta1Rc0X*KJxUSSF#M#qbex4bD!a*q@I}iT6k3qg%*mxp6EZ z7ad~G;BY7MqSN?-rx5n$iC^;+Lu-)=s1qFUBQzmlPovI(<%gE1Bc+toM0|P%%r=yr z!|OemwProkf?AY+PCiG2Uz{Jc;)UG6+HVvzGlbAE=%Xynwlz?2QNas+9gX@H|9{}M zq@Z!20dO_%mf4h#;WX7umRr)Y)@rp^$h>Utvc4GXzx`hAxxvQ#;6SML-r9Q^^TOtH zW2V2Dd@f2=*ZKL#r?t$bFQUMfTf4f^yXOVf(f$N6tLj9Kkv{1>*+@K)Yzcb{hwBnU z#3}Ee(M`+2_yXp}#6O(Xl=uuj6sM=fqvf?Mvm|t(#bn!c_@3Ush43%s1r@Ghz$vH3_}5z&r+T_eux-A!pm@TwcbZWgTz_;zr?bt~=RcbahJk;De|81uDGIrWRo6N^vfGfFTr0;+og z=bBH-h%l)eU2dJ=c0Rd~eL~&-^rYsbNG^6}xV)IB zKLnF;5TClxJ6E&q&aUCc z#-7aH;3j4zgJV%uYC^|A-;K>h+Fw)E=W1>lSf-;hyfB{r^nR@mZ0$V4)aA*NT!(!` z{F>fRKe0%{9g9gVH45~S8yhDKXertdEu#qsc$u3fP$zZx5Z663l1CogM$S6ya0<#{ zGscMbeBfoO5G=b6*TDo>DxDPbe}y4Ni?41_gY){u`f=tCr}xz6eI8tbR_r`d_z7=| zKt*!Q{BTsN01;GdeKZ%9jIi~rb4&UH@HQujVoE-weMZI4nlo*CZ3F^YL1jeNm8gOA z_m!Q!Euo)z)XoTO`5Q8D#T3QtHFtfSRdNXhaY?A+tlGd5m%|i;Qpg(uf%oXo=Il^i zD>`p?f{44<8sO*wf1N9{xVH>xXGr@pS85X5cDZtT*Th#YdFhr2uQ^(%OSUa)%g-=t zt_nbBW5dWRBcE^&k(Le}fSCyq?K+A&*P+UFm&*fS3#Rx(+3)|Qq1 zyuFRk%I2{Gqo}$#@*6Hd!K*V`6f?&yGyDi4BChRKq%l!73C>}qj3R1I9Yg%L4q*2& zDli%%o)2*x3;%mjA;QAKrWD#1&OtxrVqjVTgCHcN>Tvb=T|}%@NJ|nnkkZAB91p`p zRLB5Ts8-Y|ew+IS+X=EJs33$FeF;6Uxb~YD67S*LznLWlMz4Rd0WQS3 zd8qpdyBu{*v+wVO5Z4^~b;8+So|a9`I_Mn($Yqv5$8$|qH_rC=Vb|C&7*^GW1l{Wl zQke;>$iVL@1q|Lj??Pc$(C&{D_;I<^o7H3B|8^j(*2!G)l+8I4%ygYHkeDXQP4h2{ z)4*^23t3OgQfIjYpA)#bGgs+S=o?uQLrbqSdHo>^U8tsXaQFxAcWR1;pT`-()y&;y zcb*~3qCJKm-(P#4?{8m|x2k}@0)YuF*OH1kIfF}>x5#?FJj2i+-xKyc$E=g|gcS08 z-H#qZIR#~P5e*sK`tea=MhXa3ADltdgnEElXq1QvVrk1az-&^osD%f2Kbdl!F@$&* z98#1RRjq52p0=D^NPN3YG#hhKEnKha5DV*_{r=W>Uhg-;o~Uhq;M9!4&S1_l&tP?5 
zkF>NrSle1#@}Nl>qjQfBUd&BBok9sM$-4t~)|*;V^qHJSxwWCmHWcj5No0%ah0}VJ8rfCo?u7IIly)-PWcJTEC^99vtz`1T z64Nw~&X4n>Yvwia@=Zfe7<1~kwV{!XNCRV--kO`&x06!oBEPGK*l%A;uOKuS-Fbb= zQpQMK^<|cHkmK9eU|~)N(;1T@QIqz>8J}`DPLDW5_Px&iu?Y@E2b8~$H7VEv^<;ZW zj&W49h~ijqAHER56z)&7_;Q%*LNj-}E^P2^-Ll_EZt85NepL&!?ZPQXzKn^esGtXn z%0L*ZBqasJ7Y`0WYmWUW5`wJ$t}#$tj8n7e9$h#>1hlP9zRvj#MP~Md#Wi#1Wmji% zX-i3GCh7s_t;|YU6faF9|al;8%%P8TLfDf+1XTkIq~pK-NRX&Ju__ z^6#H*f(k<(Ir;vv<$%1ZQ0jKMVf(9so3(By5|TV$q6owX$)K`a*r`33M68rN*uAU@ z0O*J%F)Be)se{h24qc)eAwMc2sAXCW{^yk?OI`t;#4tRKtqre%;21CJ(YE^Z-2pXW zkDB(dodntO8v^s;UpbT-x6CYW6eO?eM z`B!G8rSopOx|d#W?DVr6x20J%!;HudFiHlT5W11e-Pz&9QJTq-*>agOV%J)}2+pVN z{Ou@PY*ssB!q;pEpM$K@imMK8dPnxM9RNfpf;psV~f%< zi;C1~0Q~mSJwRpCbM2u%ku)*9lUq5iRGebA;J#73IO@Z(6iW$~31u7z`lAU?sX~mUX%2 zu0!|biE?15=yvY`2|VRC|Gu+5eY6F6QJ(@LQvI6@E*& z0~0bNt|J*~gqDC=QE#t#*6K7@j&k#>Kf#NBQr@K?_^{8*ci>G`P8`Yn&F>MxQ7K@S zRe9p=wza$u6uGD=YCHcG+ABYbXQ=%|#h?4;#) zgF|3$qIb@P+++ zt+zVI<>sJW`lgp6f|Nk2@3QOOx6IkZ7vvU^t(SQ8{IL;k(QI8?KH1C`#^N7lwV=-! 
z-o6yTo`8$$E8&EKRtd@D>9vuj%eeG-ZP|L%(C%Nf!w7DHSE1lsT(~P-!?DN;a3tV= z#39E6ip32KO9G=SLxiNe4jVL+)Ge;358{(JLW^K;VbesMA4sC(l7f=3WvpN{ccs-d zgQ_B=PE}sZGKKtvimEp<2aY<4rU^R7KC@m@-`@>bhpVw+J7UyUo325WdD&>x>LeyT zg%kkayj>hhin0bt5tNtuU^5)qc3!aN8Gpt0Q=y?_{Ib}EYk0g&N#h;Y-MMRhW_wAH zT~}>x>1IlJiG`Jpt>vqA35{FXeY*$4`!X>e@RP-w*s#hSe^ZT+zqv5geD4YB1k)O^*5tGH9@<)bxkAJPs-8Ly<8_f{Lgu zKW&&6jY>s94UEpl>ZGKef?Dd(u52mxh#X-V6S~XK_lsls)lYXa@?!E|&XZ!pUb#jC z=eq6=e-YTCROT!sye2nzaT@%x!v^>_98NE6;WIW8FAs-Z{utaifQ#C(xo|WCT9*6Y zEPh+bHD}$=^NU2#ntD|^Ab(DBMh;s!lSU6mE5rw3QnZTO$`~=w>XULGYK`875wdMn ziN0p=3tGV7C@Ai%`cJ<(K`=7N?iv)v<`v;%OR3f5B%_4W_j0OG3li|{2hy$R0q5TM z&26>4=G)YLQD#%h?|(&7ohCJQ=1usgX}ovJsna9CFXC<5VGL1}Q8NUWb55AV(sY^mo|D}u=HYO7(o~EZ++^{`A+w0x-eTLr8g-oFNhxin zaeJbb?FK<2){&EC8`~7TIxz*ag5u-txQDIH>8yNE$$n`XNpxZ&p<;;qvvG6_ zLrYIC^)v68e9R%%prO%sv$d4v0Tx(p`2s47chHwuR&@Zy(U<>g#4y}t>a zjfutTDUFblPLdXKZ6cm$9A1lxYDPiuo;TG8bUlAwYF)Hj#YZ#n!({=v`d>9#wJNMDnM)JQl>+L2L=ZiYqx%0Xr+h*Srl-MsH4nsDoxM zla=N{_k4OmS=^jx{NBL#Az)Wj8FJDQQ^O60aPdkA?TY=`S%5wn#oryFBC5x)_>In~ zD00?OW0Tm>i;j7`INpV6s4*I3?&aILnNiwIti7)U5z00ERErLe3Xc+xMC4$9EUY1#sbMxP zQ$8R>ua6ZjETXm~$Yr7+i{V~qhiekPiOZhKb+=i~c^xKKjdm|{2sZugeTtUizd@pT zUBB`}+bKlFT&Y8F2{B#%$;b1N;;~8~WH61*{KRtSxQ6w3X&_wE$H;bKZ2v%-oSwi# z(p+^yRPNVFtQ9njWbgW)!2A;WmAtFdGooO!(%gdL8eX*r=8y#R5#rjyQIe1_;+WXT z6n~!j`G%XzC8$kuT9V`p%Q%%ET<28+13_bCv^C8%N))-C(n5>QkG@Wy49Z@$Ja6Ga z#a0@UNqh4~LQD=jE&J5dK$VH=^TJu`fO^wKUefv)oQn`sN|y~HCt6u>O6x0jl7u}} z3=D{7R?DGR6%owv{)H(}TE|JSC#<%$b=AjWw}fdqBze1zs2)!I0l!zKAK_)L!Y^&a z9Xdghez&KZ%vx!d&_#cX^M=QAIdjD|G>i(NPnQFKyfvSILVmY2^n}v&M~p|o3hr#= zA{DU5=asF7kCP)WSLeTC_WjEo)J=8J{7N5l)zRynBMD9a;=`LIFi)r&SZZF-!Fx$h zPxa#2qu$F*`~Gsia|@_6@93VXWR1iDNZ8vW0ai**3+v&;gKA+H*3to=OT;RCbcoOh z6%}PDiM3^I=EBaS?tb)i`gX@&v3-%83(fnHi@gE4$UsN7j~v3smk ztttG(l7^Me8?!_!S58GF165HC#H>7~@N;ZJs&843$S&900H4!BP1*AM#ME?PL=Gv( z)HVWv6T)g^y~vXz(zn9DwN9Tx=if~Mb~Nno@gGERa*WOy%-N@{@kQ@XQM$;DZe5=y)_QT*^q z^Z0nN$>ETcZKuGKbfi%_+RPtY8450DXjTkSxce|TERfa@mh#c4D2WjWnM}y#f`pZ+ 
z@Ie9A@AHkZc3dCpesHiN$yzHL@awQmp>5Ap2PA+>WK!R`ut*u>gt0lU?lPpTyki`J zSX0aE6Py!bCQc%59FJM%x&Dcj_)*ERf%t*g{PN1M!>Q=G+E7;b(@{vR+;u3DUDgX- z`s)`_blWV?pL%3GfSjAoEG4$Q=k*&ecjqyv1y9DIJ(~ps>-<<@?U|h$9ABL7V3*ZG z(}h-kA>1CHL#VCn&ML@KYAaP<8!7N1hR>xG>7s#NhhP4I4J0X3R}35fR~1 z`|o3v&9Bg5Dk>`8Cz-OEKYv{^=3d66sNu7!j7C0%#Y_#xbSOq8iBcfHVuoCwn}^2O zFq|ULw0V3IJQB1j0d<+h=YNc zo~!*u3}`>C#hC@!3e8JDFdh-r(kkypWmqQ>%*L5`RuvoOa$9WE=eZ##Er{Cl!L@%7 z#WuQ~@?ojNIp=Eh6Yg)cw8pxHX99YUT2hSXA4?s>U}&BU8j^+|NS`l*g3{_Rk)}@- z>bM203psw5q+Ha5`y;J)%2kUN=yx!~ePl=qY4Nc*mm$aIS}oc_j3h}2oTT|@#t|@1 zp@yH!KzGLzihA{ff(c?y5#KTyCSl+a`Vs~)6VnXS>R4H2MTGs?dWZ!@vB}qN5e%We zpMS7)OkZ&}cS+O*L>$WICo1C1R$t;HQGQ5{QYtlJ!({ zWHhi47BJ*ytllC^eCE6b;_;hFwhif8l~< zIP5RW`dhD9o8(YRDMcmgL`g__s!B)&(e*tvDF@Pe*gx{e40Te30)SHLT(CIsGy0;) zCnpSVIiZCf(rDJ(yd`4X_owdfjF95;e0^S@T#X#6eS`j(3dtz7sd>w~u>opy(os1a zaUQi#3QYJ@T!Ghl*lcz{*~LykK*0|OntKFiOFpWp*^{-6WQ)aC-@mIilNp~#lrc{~ zN)eY4jA?f5l-)w>^2qUjdvp&!p{~{@V+80287^nQ7z-)X8ZF9-agyX5s}euK9EdPH zI=i@KV3YJtFP~!1DWSp51mB*s)&#j5jrpns|DWa8m|tco>(OgjUN@ z3e%`?AuaRbz$oIus`E@GwT!Mm8_K`cffJ<9ABK4$Y2Y@;)VxOfvq22z)J#a><>H*# z+*B(tkAEZrLev;ZGN^3U+VhE<@Su*Y||6+L~@hHUvnq!W(x-;oF(vz z>L$nAAC~@}rxbZdD+?&BDJA`(kWWN9SBNQpa`f~Tk2@5U0pUX~s7MpCER8QWW)Koe zq63e9%|;6)sR@rVdtUKA=G$1|=(X6xt3UVCN(qd2$;+bL!cN7R_Or3l|NMXRm>@Z&p{)2ha?@4Zi(_Z@;BUu+O)I=U0rejuh1 zg8@ycMFP6Ak-EHW9Qi@TUo6Y`^otQWNND+$UhYp7-tIfX;GiONJ<1uzM{NE)gd~jM z+J4cIQnH^CvcBIGp}p}UJI{qyk7J#E-r$xg4y8wk*>3zL`NuisWC%No`RBkC3D$l z23EbTqzA=V22bl`QT{sA@hM3{nq5i+^o*26+hg6!cbD7tQS+cKFm1Q-;15s~a1@#j z^cSYEtn6}@N8*Ow8_)7WHej^#)TGAoDHBMICGG6d2G1=qXNvI|xcg=|!)$!9smZvT z3`zOanhWUF*sT?0CH(S+~MD+pzDvo{g1a3V@)z zQ;w8%u1wbG0Lq)Ywg*@UoM2|=axY5Tt#wS#VV>2d7Z6N(z6e!mNFjOcHpOf@5=uxg zVfG*>s{Id1B6mbb3@u$sN_xTVQTe}lt36G||u(e>&1Pu1TUewapkBn+%bcrCgCfz7;|lgais2#Es% zh}K4Zl(I@uVcKHQ6g_3=^-5XN5zC23nVw#5wk3mX5#Em8B3dY9H^x{=`JDlU1fx zW>hp?*5(&StXWYStz-l~TRiy5+&}<*n#6P$21OD{Xf$zP&b2&2I_%zu6lVhcbhrvD?DV^YZb&>~jg^3%7TyKdd zN*UclI7Ai`mCN6i4QREa3)vB%a=eYazOyqUkuO?a%Nbrh8jz>|6`IMIDT+o>-japG 
zhD0QUHrJm5L}#Lyk8}4JAh4(Twzjz2;L~l)Xm7; z8H-dn*L6exx-57YXJg99NYoZMixsbsm8Ve35K%(8)pJW-dw}6`$t@H1)3Z>?G(VVC z&MC0ZiVOj-20d8O#$~sRtZYx;C%3Pa-yO*&5Cv>wZ*eF!>>h65wnhwEim^8*6kc%d z>%dG@LB}NlbK@x_K2nS>N(-bmAuEtV2ct4?j{-VSDRmmnpU1E7|y8EXUh zQX&-+QsY53lFRDvzcfWno{zZWbNGw;CfC5pN*+Ty3-Alb_6&^dMqB87Xwd&)?N66e zF*h(*xa5Lt6Q66l!6P!jr{VrY2!2}Con27skIE|XrnSW z{)5m>xS{UO7TwMb%kdJ+C8{`BhX(dyQWdZ*+1NL^}P^1H8q(=SRP>4* z{H_@75hBCkC`JTFwc@mvH|m7_hU>d1{>kl*j}9ml1P^P07Omrt2`XzbCHVA#tT_cYdH9I}YXBuHO5pV zn62)&Iu&(9vk1jP^Q^Jb3#{<=uCAeVZ7f+(nfbA_uz`e=t_db_%g;hN`QvTEHI#CV z?>?dS2`^3@U&o1Qtpmy5j{u03NqNTs{w3iV3IPRi$_D2Fj;djSJbLc~D(RO73W0es zC{0w-Y8mnhjh6S(cy`R~>}JJ)+E#eMn>UP}DHmjP3}tgsOhp?Cnn^NL6`Me}l|uKT z)X4r0FKOrdXK#vA-R@O_}}%paIV{*7*>xnp+~)s3QDh! zP0SBbO|#wdbU_f}Fz?=j_x_`F)P39Hglxy&hdN3yvHRc+QqoceIQ8wrg`=!O=N)WHhPnQzwY>k18SU?1>w3P* zCPQjtSA;=G;xazO%N&uQT(lM)me3_!>yS_`-FC~!{04kN35oPSxuz)Mw1}e{{6lfZ zM7K;lRt`@UcEDzM$iK<;3J=DhQU{C^jlQPsS=Gt91ybhxK4`%=Jo$6CQ>g|`!g}z6oXKz-+8GKdUILZ24aE=7p!rdv0i==cZ5()`BW$?2U^*$sOX1 zLHZzryBCrDxg}~S2*9+#sH%}uhR0GLt5HaW;wdb#;SRTsNxBxt7#o=o5yi}&OtO-} z^dcHDT%CQKtx>Z5_!#&>Z?26abkE_>BuY>&98qAjYA}pwQlNE4Yo{RfZ6A!?qxHKm zz0JCY&-{Bn917L6zz93(-}3NqgP<-k1kyti`fB8jE78KLssKvltO2D7Hz_5Z0#8Xu zg6N@w1+Us(E+P7&F;pXfN@8$I0Xdqw=>A{y;vr%(5?BXClq3|zj=WZU&0v$FWYTmA zV7R8J#3W5XYdf#=*OjH^4MILG?R*pQP+VUjSsdMv_PxF#ECp31)N8e?`5A=T9B_lD zzKzv(SCQYRIW@HCNyR9c#MIfRRuu;DidIj#?m=&x-u|9GD1w05<)gHe)4Ly^J47*& zUUsunR7G>Q=Zp6I+Ukn9%GnS3RivxiRACuZ<>g_Q*#ys5bI;O4w~pVo%XbEOw+`2M zd;703abU6Zi_vTD3Sird!FP+3$nl^nt_9_qa3OPzik7!o z4e7q>UgwR)35!8W5GVNXLrZ)e6@i;Zh74Ra1h#BU6|oqb#Y5*-epp`m6)r0jGOQzA z{cv%dnp)Hva{mlZdU!BXtB6|uw*TOmz`Phn)MM9j%QsgElC?Dqqa#us)KJ`A?Rnx2FP0LQSo|em|5{Wr@A_(^{!RbEX zuMykpUE>;;wP5GdmX(Mil=i~LX@VeVRZqKO(G2#DgAQFo-ky#@dpA}T)8>0{KD$;N zJa!lvx62E)O#wy1MJB8mIHToDY_QM6hkJ_VkcTl(2R)@V%g0gl?r)^%es3;d7x3ue z_iI(t-at0&Y{v0T1gYX4l3;0tT1~0HZ^-Y%D@SL{B{7f>sOY)~6AaV9bFR;A=m@A0 zSG72`cp*JKlis(VioXRX+%CWCxRt~nKH@96pXygfAgs8{DARp5&*MFotfy^QQMjio 
z?E_H_H|jN`@8+zF?jWnh!jL|_7a12bP=`C0Ar*E*xKve8Y#Dpx#j)dKhSjAJ-7Z`~ zqQWf8s8!qUI;2#u3>lQZv@l3GaA8vt1Sn!8n?M|?QabKP;-leLl`2|b-T9#CwG5)C z2fVt0Ih>v1@pco8y7yeI`(h;OIy09?yJmyC{PBT2%z_=xeu5WNgo%3*;``Ja#y(j& zEUG5AforJ)r;}WoS7duhA(B~cR-Vv$LO==+$?&J;@o0I?P2;9EWUe6cpdJztC8;Hz z07uk+d!=kgwy11Au!<3J%y-$*>x}sEdRW3df5^o=J^|ehq2LB0jwgKn-5C$ZVvV_p z1>G`Gi`|xW9!P8RosEBbL51MA3##(rc;Bm@UjP&3j8LzA$} zns|&#O85Op2U5DDUKj_)EK8L)tj3!o% z09+buu0IonM6u4EMX*JY;PXgCr{d)|&kc4+NR|vNo8i`-SM9H(9fI>Wn@0QJ>AX>< zQ;o>SM4`$n@63Mir5P|T;?!jCG-?z>EDSKVX4RnrAKlS@wLJz2+fA2?ppYWtw_d1W zJt^;jSqHoy9@);xv&nIZ?NJ7}8H1gqTzJ&Xu4U1jg*$UggzsKlFz0BwN#S)!l4<;T z^M|5fLuf4NW|@pkkjwK3Z1hmBTChSxzNb)b7d9**_YVH8Wjm!wn^TRIDi`rdyQVZ) zB93AljGNxU+}Y8=sOS`>k#-9X!8)N>__9A-wrLLhzkeo=2K)Y^DT)Rym3mfkyzs+w z;MFD2Wg?}WUoWVpX@_?X)UI7#ecRW2&U#vUAG}&h*)ZpbST0&BMw}acklRL+j6SHO zxv=^B!Lulaf{7n~c-efs){XX&y31X}S3~3rg60&{W!}3=`0O+!5H&L$$6p;pxX?cr zL{fg7=FIDk0o#I!Z^2IjKzmu*LrF$At}}LJdWI0$@7?v{dVe+Pb}gf4_=YD7v1&Ow zpeasRzt@kjTrgkYj*=M37Yp27n}8{VTuE{z`U}oV*Q`pwr$2Oip}5=MFE$GH;~gFQ zW?eH=SjqA0J|7iSxFgiW31|82yrLkS^g!&Xeunk#PFv}*Z80QOSB6{{(m=z_)t!6M zpKB*=49DV9mqx88byDh#(WsiB%t+)h@u?p;Nc)(OX|2v& zT5YOQSr*|kAkMyIJxIEUgg>4kOteNnc^@r!__>; z&8sUx5Hxp}oT+;{rN;(>XN|*XCX6lv_(4N%E8OoGaXH!pD)bCMw+S&+C*(^B^GGcZ zUMR=2j(#dFbgX)>;*A1=SuP=pjTZABiGH_m=KA~;62%E&!>US@_>J@7jvY&tf1>Kg zD68L@QS!ZpBgKNS(xE8ZB5XGcFN|TT8lbu2>#U|Byl(XMYD4@o-xxTv8_50&$Al9E z2A;+9>={qZ4Xec+I@_PY&>6g+-}XC}SD9!cG+}&~eqPZ}cM-9v^YNju0%;PxECrp) zx!D6*Nd(}$b|EtE9ATOJJt!?-yJb}qaECT_a3;xwfII6m#bY>wtV;FvMo$z6#(xag z#})N|15|XiA@Wg^XKr>J_?!crlZg zocf#YI^6nPK;rXb**Xh2^P`j%f}mp}BgRl*FFkMUjDLFy>iOj9vRcF_JtHIKaG9TK zyuL~hlzd=aMdnW614%JR$K?nyW1yXBL{T$9h*rLWuXRyM04xi>&{<)!xNT|7_oRvU zc=53zi!N9qu}3~Z-LBp*IL?P=V*#019nDO5QsXomSjCdt3?Lc`UAJ3yL7?#@zWb|$qUQGVa!u6|n zMu`Y*HFFWwKyDYBW1jlLAKxbI5I)TWTkVf2$x2+0F9I~rhW%Kf<~a1G9RB2XFt@Oo z1|)RAUxt0<$z8<_WM^|-C)FCV_RWckvNV;OT0oAfwI*wKzII|a=}Sd0&fn3pzW4IZxi^|7^ZX!}J}VPQnF3oC#U zN`xe(peo^Q0>bOh6-r1gA{(&thojWeV=nl!DL4BaKF=uc{wh#Cj3T5j;oM^2dtx!3 
zQMl-PXoBie7V#0{OOK_jGbE&ZG*1}^!R2&fux~Z9KP9$IgW}90U+OLrzTOc~ zI0+9OSxHmjXA3zhiMvynbNrSX^k>w~-$&ev5;Q;!#S(t22X=r#&K%2R1#CN#T=YT7 zR7-VU?Po>`Y`gM6>u3uFb;h{y$Zeo8vIsE)Kic0Dh&-`jRj9+h{H7&Ns;S2=t-arg z$YoY#c}}&D*FN`pPSkF4@1`|OAlK4N6w+ph~Cn7yIqnN&uqk!g{9z9|=pvwEyr z4DCIrQV`t!Hn*OKc=5^fS~HqvLdyr{`Dd0ADJFob7gW5|DWil)$z+$wn!g#@H9nfh zTudO|Zt_fekPqqN6dtrlDQXkmHOmNU&tyz@s8(1A)QSbMC3shKXCtPZXTdmSwqH2U zTrp>?_};#bS7e1UGR#VVo@P>t_lSq;F=Df|QK4C_YbdZ^wXAh0^pD-hVc|yfU?uFy zBM^dZu=2~xejEAh4EioksE!*nB#QD%^3Ctv8$09WG%h|lBqpDdHPig0!uR1yJg~sx znh9wu@n!F6$m^HQ3<71kw(9I@yUm8?&Tvb);oUQjZ^JA zDPURC{wI{uCpAh;1rq~R;QSuyf$<>;6Dtw7g$FLi9}HYxyv2)Ac=wDlXza_tQE(&U zJV?o#jDe@Och1%;v?xUZg#Nk{pR~QQw(`FcsWoh5od}Vk*ot6oo3TKVaGK2F!V%_a zE}1a75P3Fb$ZI+FfPx@LWf`GG_t8#%?C*XGf3B*Wqa~{A1+v3xB}XGR5VN+>_I>du zw*{yoYfnp@o{|r7M?!Jd&y1EnJY<@z^ApA*k)oK#$!HwuA1?U>wPrYcvg^#x7CmKA z8`-p5_0bV(H(`V01G|z!gN|6NJ@P?MQoV06Y)VziFul* z{^7@-=c|$wSC40wmF3iBF=BSVCZpcHek4%DktwLqYpf?m&Kb~T;sB{z!qr3fn520| zW_svENdCsr2RLi(S7X`VyMJrkzZZz>fhqX4Uo-r8HUtNbJ>;iM`5L%!_Z=#VO3|)b zXh}$N9?V)8Va}?F0`JE}#-kMFNQg;9JRysaH$+wVkc`LC8E9{)f=))tOr5-h+#;bf z``LT1asO#BCa)XGS9QXJVwpt*S((ZOfLlbWiq9y7TpJzi_+u77F(5NJRZ4tPRmLq{ zCA%#pFQGSg+msqAPhu=T6zM}%LQ^wRHeR@6dW}+O+*LJ-7u?k{8n@Wa#?1n25B$i4 zED3|}qqj&7XO2BV9{O@}EOqp{A|fWzZphr;7{72xgYL`ey;o?8=v(9*G2|RHA^%@g zy;jhYL;uO$F?e#XPa-D@TDj=OWhT;z&itQLwfSp5eROqzS?l%kc7>v{e3K!ml-RTy z@7+H25WWxmp-m*H$;(E56A7vY(U0ti+s7Zn$W)KX0rmcMDE>Y25ay;@u(zs=SEHE+ zI=b61($wt_nV8gG9~-Z{ft9&V!wZsrB5vP*3z|x}s;Za31cn#F%StdEOk#WroV^^N zTrG{x%ZC9twfiz|-+qUO(dls3QAT1;Axx@`o$F5n@x(72{ik#lu*JlLb8>d6a`T4W zCXz@*WGXrgY=uSJrq`H&@^bkL47pswp%p9cN-Id|j9ofJs7;0zxgbcQ_B66THI&^n zYObfb=HMcsVJH4seay*F+^mU8bxXg??kb z8gx%+&ikiV7)kLVImcRtKtv#F>uSN|7^|&rV`Fgs&@G(Zb&YODt6GXLkXIFU`=6Yk(kAGtG z_RIKf%`kx+-ebJm?T<}lA_92$&X{U*uyKX>^sIIM@Oup#%q`p^C!O56ASTJb$ zX+#cVn6__ONt86vo}sO74T55<=dSild-eDoqk2E#l-fdBS>^qD9TJN>&pu%MB5zo< z60Cv~AIYiMVxX;p0HWqyy7dM~KuKG{uK&dD4JIgxBiqg*FO6}HT1A?eZZl1E9qhr_JU<6P-MjP*dzT-kk{UHU z_BrVvIdej9G~uL~s5LHGO5XS*s^NOiEnn>hr57?|6oG 
zsx#iyG*wevdvKvpf?^C^=bkX?CpeiM6fIm#(bKzgy_r1z{rwCLVWFF}P{m^{Q+2zE zi-x1;=!RFnrtUO1k8zSHN=S=~bsOb@QL_fYb7%(`nCpDJ-4?lJ`fSr~+Q{w&HRlhIt-Hx8|K~7>8hB;{n8f?=G5t5jT z4qaU;?K7E~XwjkJDSoNJdc(;jP%%)#@$)b62PH%Aj<9vMf`e<#^CU)d>_z9Uzk;oY z4d&EAP*hsUOJ$@CT88bSg1Vf;56i|dE(7g*3Xd(lchQHpX`^qYEi-shFKB4jz8dOt z%a^}xXs3sX^M^Ch6?GvlG6~PF@bA@RJxi@-lsJ_%6&jWPde8c!Or>jWa}CVdFcA)| ztk9}kbH*3%QxQPa=yemfp)k9c8svu)g1Yv(#*^q*ASil`Jv>8XXYUT5MXdbX(5ar& zn@pOvt(rAyNl!sZ7GA?$5f&1~l!mA2YyAD9v3{FivD~>+yFx=(3A^q`Gf4w~@-l?0 znF{{ie-AtM-{Ph3{Vbr4pyy6M$7~Nn$W=@6d(J-7`nn+|G7kEtHI3;>T#_Xtg5+~5;IM@bn8*9i-UF&R=5;*y!Z zt_~`lJ>wpuZm5E(OUJ;zon@`(fB!lRq0b}v`%feL$3-X9yS#sR+ZmMPbEJA*3r&$( zt0y8QDGjZ?nm4%5ZV?IY{)wk>>E0ZRHqL-JkqldOwio)l8jH8S!B{YZqOSer49l8p}!kM)+f|0o%nmVWo(tVUWGIF!w*~gU` zggy^Luqlk`A|K#uT`fW5e zZM}qXZS-KHB{=xsCFSFJR2IA^bKE5u2j1asAAPNZn@>XDQX3hd#wMF$B z*%|hoI8K{%DR%0r(fFRw+S3tM?aa}plQZnytQewNy6zO;MrCTKfyE2@A}+54>u*Fe z(f!@5RPft?W{}A%#Jq)vap=So#<%HXaN+uEY}|AK<63H?b2C9nIlHKg2{q|F#1-b& z#!L}buVJnq{acUhIET=e;h3;s7?ar{GAst#1~u#IT|9IXG73^;zu`zuPJ_5a?Pb-e zKh=Am5b$iKZ9SZsZJNWZnS*eWx_0~yQ<0ha6Y1z{GPN6P1GQcG-Uwj)??%q*4?RO| zBqgNQtLh%zaTY~6#Zb{z#I)sOp{OK}__#!9HL#B0hS|Gt@$Vbt*fD5F#sKPU+JBlUq`$At^8ICsk_-SEZ{JQx9R$hs~LQg~JDN5q! z?#6f$nS(QT{Biqk06Kfv!?U9ebhYZY$u$Ia1x-g2 z^c>O|vC)ZW+q-3*gi#mhD&O(V=P`6fU#M$xmiOg_g#1t&$t9x7dJ_WCA}X7jQQdZz zYbPI~Fsle=PDU6wx;y{r?&QtZLsUe+9kt3Rvu!?*Y_Il z35Ql3V#>|UtMfZiL-+mDPa&%!gDF3af|`a3arPN3`4=T9MzZ}<#X$*4QN5J0kke@C?N`<{2|uI%C7j-O z4F}d7hqRP9rcCUNj%_*ilCY>0Y#>C1+(OLhWCRQKN^fjhmg#nI1_I)9psK9|6B7-X z7-~XIMIOq^vW)vcGI@A{LK6@e7z^pbVsy1uhnHndLQ5u68_h8#MvkApY#bw%wCd_8 z)YYINph4bKyDni;Dvz4RS$v`2$G~1kATT{6H48I)Er*4@8HP^i!$>Ty{ae*}?wNg8 zATcZ%i#AVZq!k`yHD#c0TGRV`R{xbK%`d^6)svXN6H`;+)=T&`_N+g~q`{#!!$D(v zR!Rtp;xM%_sP#M1d{1}(dyL7;M=}{OC_;lH(PMPoSwg69*s2Mep{cEo(K80~^N7*@ zcOTsLlVfBz{kR7=4?Uo{A*e6K(9dZk|7wyHA*k6rf)bMy!<(?6vk~?< z8LtXULG9~wysOAbz(_?JJ=L?@=s_TU%i4tRYOH2akbmAyrH2mj*o? 
z3-*v%m8e&fDoOUO-)tr z?xkml4^G6wjng>NB2g9Fb#)SY?l(R`xUl;=T>G^aASn91JG5z4C(%4O`xFBw_uvy0 zg-siay6u8K@3H=e?XYiU4etq+Su&{I&cIsdlL(6DyI(zJ3*OylYA;5T)`Jk#`!79? z#lHYSQJ36}yuSy{?jr z?!))`8_fB25+tg`@#W0A)7qLi;ONOI_l~6m^mIb?E;be*-;{~09Gc&U5{7$>q97bzTC$#r$ zRgE--lVNCJG!ry@eCs(_IhzX*6hnH^@t?F%K~uY|ow5yy!OUtY^}Fax2rBt&jA8N5 zOi=1vI}{Ve+Omfu##affWbti49Sc|dH5<=|`eX09BiOd>8ZKV-!>~c^FrcS1+O#yo zUkC4C;n@&$x6?!)JJnh%1lIy>lfgm6Qs%TWwvkAI15_7&W8jine$i-sQK>JVr)BCPuFkJT5c6GEd%% z2LZTt><+>SIi<#=qd2$HiT9mPFisu3%$MA#)34Xik6);Gc=ZLg&fClLRR52(v0pQ< zyZC1$C^~N)B#lwBb*(h9bmt6~A)1&}McbvLcX9Eb8(6pg9P~|8;o)k7c~iR~C^!Kp z&OgJIWA9<7rwChJd7{=yK~r7=+6uzew5W_iTnbFbqu_KH*z04)h5?XOkU@4{E;kVRYSmn1p(|ex zPd~%7)ngb}gX)k|uIZmd4dst-`r*HAXE9?%O{Y_8Kc-m^>^@Q1Fo78dWDAv^({B`1VPL0JKQ_xzz_q*0U!m4bn1FNG7Ub)SFne^m4%H>B1e1L&Kxs|2z<3 z0g;FaOkjx9wNGpGA`+dXBquKyV*vj@_O1ggimQtr=~!q2q9CAHi4}W|-Poc=V~f4V zB*vB)^;e_D-i;bHYGRDNH^hplC@4*u^xk`s-u`!IcM(F7vLN6)-*??{S%#T6GiUF+ z@7{CAbjh8TUjNICP1l~o-yw+V?`#gy%P2p?S9^_+j%bv~1V3z-LJp%A{{QSl+T0 z{(s@bEw=p{I!!e(a`GT_9@>GC+WRU3O40ax^97{LPRrNYJ~=rRS5KMZ^@BHp*JaRN z5A6mRBa(TS=9gX|A|wh6SBhl%<6;wY_jT#;O~eKz2nn024eKE#Ayp9Edu`?*1p6n8 zNyppWAID~#L2_&gw2kUx+KR6+WL)pu0QIVgTk-0iy+Be#P=|i|3xV#TSh#IE#?Bg2 zqH$~0u^HNSYh7xBx_IO^&g{M>%NYgw)k-fIM5|(V8!=L&Z z01a(5!Kwl-{&5Sk@-i4Te=xpXKdso$CbMZuT1q88ML+A@h(>@Aje>-e7d~5?? 
z)Wr}}myJXJ(LDv>I$8-Nfw_OTUxDotCrp|*2Cfe7_5LV-7vvMUYyO!b z22bjXs|W7l^)m-NzWoB>evw$bXKpT$l?3EzeY2$GqLTEbv}4&3*xwf%CTiC;myy%e zs&M_wMN(p=&2^Ih51TO%jr6to6V4$u&=k(Y&Ks+a|A0Qjx_t2SzU~2t^p8PYL;|KR z{|a6Ebbyz;Uy*In$cqeTo>4|-7S12Lj@S1^-c!4Qtr>xQTOhx01|y+M`)?sCJO;Lm z&>oz(fRd^b7VVl@prV3^f+UQ6+ye0Qf)yU$v4rWFN61Lc#JC?vVE&rPg&q@mgG5sQ z?7Sj)u#0xgC>24K7LRVcz_InG5f>tOg`}iODh2^n zmyInlWD(#UBE*bN?mmZ5&j?WCgLTK26kCk0)lifud;ORhICQXa{JS_m|x70&?cp|=>KLiW@_ss{sEW(6>UtzyJnqu3TRUb^V zq11-}#}I*_=9f^5-16=#G;CQhd4{i^IO6!0Gw^o^rvH+TJ9UWJ#|SF3s$SRfxIVdQ z#D?wlg#Ke3(g$T9Yisbj$C7|L`%rd$pm(USTu%v_{U;RPZ;+BjhaVY$zQaGPEO2Dg z89_Fs{UBrfdTi+j35s%!&h5V@#FeK1Z$gO(>iInz#Dv6Q`{gx7N)VUL8xO-D8&1Q< z%u!gaWjM`jB_pTMs(xL|LsGd>Ov7T_V~+B{#^Md;FPkcqeUw$+9Ax?&yOzPh2qO`!5P~6)p*B?XoL*L!>slU?%1yIv(vp)@MSzyDW7&g7OWo1oaU~Wy`&b~klXnuOuj~WZa9va_rHeWjvaPpFjNd}gJP z?^Bad+Vk;t@yGlP({ei_R~AU@?%ctfczE$C62gSmfz;D$BeOrNa@?t+NGf;9MN@~z zOezAl7LL$0(5W`Z``yMFSUz?GR*v6-1skSf=!CwX?6s?Gc_NzDk*xCL%a)RkH`MKx zB$Ez&?2Pe`Nm9A;iW&{n!6ALnYSri|L#xg4QX=oxbE~jn-d^na;TSCM+hFO|IaRVd zP)&iad!SIt^74)?QsPo2-9BcAYYI7WlB(jM z%6-E&jdQE*mC7NzTiKxeqm_FXqU(STIPlX69R1}q%+8x*#)_|nf?@Muhow0G+UVQd>OUzz55p6oP;1{|SUX$rrbJ#z267sa@kz+<3V5#Y@ z86&6cntcA{At@4@5mxx(qZ4^yu@v#wj*F02lt;JEMZwqP541k2%IMUm4KmZRFnDxd zm|lJi$ER)C0F=7h&RK}t_Bo!vN*UrPdGM2yLjs$3(*H$`iP}aL2}{dtuN) z7hg>5j}cS)SE^1`Rm7_&Z-lfHhiA?RbpxJ_QNadg$c_;=?Om<()( zk<$jDL-*F7%(UM6kpt{4o#A2YEhsXGktCkZQPTrcMoej+@-tB$k|K4iv9uBDAHms0 z3Lrr>lC=GH!}NTO0px0kxSO z?R$NO{-b-KW6#fO#JN*4uG1@bIJ|Iyn}ZkpTmldi7@zkekyGpvowb8IhnSTT5 zl)4;{_RC#nFRPl4Q&mBk8!-&ko=VFzwEQXP@cd!Ag^y+#LD|ATj9Lo?d3pS@v!FbO zs-8-fuiGmN?4P*`GP?CO3(6EzS719!N4Pq8z{fcN?_!b#H32>2hG^Et5Y5{*MY|rY zK5F?5DGT_z2f^PXNFbsR|8PVFMI$CCUZOOR7c^yi&Flq4>X$DVDMj(SzXC~0OjI<_ z)2gGPjH_;c^3g2gx9;)@o8V&;0KL`?VbEL`va+&JP?Q%MU&<@Up_Yt{kb*$H+Io-Z zQlkpG6XTQd?8Yk?H8&`>3zmnA51iPv(AOgnq23WlPZoJ4q^qxEqzxTCX8KytZ>}ed zA$e?Nl{%rWFCqS62oH`DloVpZW7!u;oY2!QIy?rk;cvmg*({wocE) 
z#sz!v`p%oYH~w2@G0Z5UN_8;Qde?{?hNAYP-Bbhp#`Y2f|I6BpH7qa+&M)0D=s!vWU^|ZK|{?_|Gfa;5P`VdoqXV7q`$wZ5!bqhHVyowF_>RUc$@;qW)%omvYQ$Ui}p_66J1_ z_h-g^Ti~ZX3rkc1;K;nE)VMT6MMM|b@~NyaJ8gkFb!(%~*e+$>2a(mwdv-{QNrs`3 zUYYhmZP_i%tl{$7O=xu`nK3e>?g$Tx9n6KHxai^T)Pb5VexTI+-$VS%Hs*PEzCt>6_0}A&t{3GD- z#05qz3=tg}gAM~eV^c*Xsh#JW7j96NQG~BoAcjtrj;55mgAY94_(NN}KGf99rU>Ed z5dhm4PVjQ^MW|N`K7yC!~|8fq3vQBn^{lftlLV;sG=wOS7d=~53E z)2rCFMrDDcwHvl9Jb>8X1o|%}?u}#*!wYHcpIrw9v3IqISsx0pcOKRrJ^GnIPC-tH z&tzw1Ls?l#NTie?JSqx(Ms|_vxsn&vw7EV!TzxQd_CTrbo4hOQhYpAdiA6)*2B=%F zcA54QAD0LRYZtgVQetJGkm8UBG!MVY>=L_1wwxxaIZ25hkSCz-G3>$Ec5PL)&Jw21vbTk&6bE}N z7kD~(3r#|E-pe)9>K)b2onzPR4X2FChm4f=b5i)1UDpeTZ&m97Y1YO7ZVuj3dx&-} zKZ-wAo{|t0-Q?XXn4rj$>fWHKCX9QvKy*+X4sQ4h3GoGW(Ei$c0or<+QvLfUww;59 zmPn~pPN8DE+fcehx4|7Sa@GKJ8_^M23fb`T@I#-BP%YWjWQTX4!C0KKKF$PWQ zgL+y*o)yg$Rx)#7WBZEP9AM3oqOvl#1DgQC z*rb_My-<~U_1KhBl#?VUU%54htxb$De0qNjoYV{IhH3~54uz+iACi+(OSPW{jWjWO z$`E|FekN2@l+eCyW2mYMF*}+UA7>WA#(F<915P2;grvm9kV48iE%zSX_;0lyl=N~+ zqnMt5f~BK22wCJ}Y-cutjZ19(IO^i>%zF&^R@|uk#ra=Mnsf?UVEAn{RLqbvhd$nY0 zsZCro8hVELv$m(q9|I-k-I!l~ArO^vT_x03uU(>!x+TT2&1cZuxQUQUM2=NU^R$3<1>#G); zkLAO>utwUVIG3!94El6vCYYUj03KVpAuuqO=CeOAGiKLdC@&DM;v_}wn%RXq%r=+N z|4;3_h@L}C%A5uLu|TzSsmY};u_B4r-+wuYNZ;ICoJY)NR^;?fOi+||aG#lOpV3{> zxmUYFKljzNVL~n7{WBI|WQKL8mrCszJYD=?dFwUS>|dOp>`O>eLh|M;%eUH)#!)>c-VU* zCM*uJ%mr+~yeL}vYh6euB$Jl}BH;>ulHq^D)k5+>!W{lh7xYI{-yYC=f)I@%H5 z(b&J{Bv$SJu38Onc43Zt)eI7;siQbbhe*#k6&ZBC>L^TVtEf>GL)bifgRwJ)!MJ_F=bqwivzJUl(3nu%Jof-cSDnVw+g4bzZI00MPg;0z`6*g9 z6Ws^3*z~VQ&lZS@O@gzHAN2C*Uumo7PCFZNI;j{)C7H93hlrLyD4|fu>eMxEfW22Y zKvkn|l^zU=M}5pthY^1*O_b zurYIlnubcDzY`fAgKa+^M5C5EQmqIePjcIwgJ{*QsW3LN5%G8zoh%{l!a(vSR4+#h z2_%u5kmtN;EpW6rWYCu4W^Mbl!nm2kp`oeH_F4E&?OrELagv->??=MoC+o;`R2H;$VLL0vI?nXO}Xwz6b93L_|5 zn6SKS3oZ4c)dKdd{}cYM!PtIn4cc^;p61t_0Y6FV5Q_qhkrhNrl+TzseJD;RR(+Vu zY!gUOQL{Ftt{4Xu)p}^4RUe891>=QD?~-xm$Tc{zX_rxF11#M!uUv|qPfcIp;*N{h zvuR49KM#*gK%`d`YAY!sDlQS54hoC6q$9abr&mDjtx6GE>@!Y@i9J{|2q{M6>lEh6E8?9!NSLvD8Q!2yxs)@a-?@LHMh_!Qe 
z!|k;vPCVU(V83wOIr9iFZoNiQOm5Q-KW4j_oviFm9&8M0t(;mMHETDpH8%XYESI2s z-v%NhzVN&*iV5vpaRllORdL*W7Zjw}b!OqP)$p|qpmAu<#v`}_&%FmkS>DEV>et4M zRTD6J`XIR2dLi5|0*!QaAcOZ=qf|J2_ULuET6(~sgP!13weDb4;=gx=dCQJ>t+8fF z@qCk#ngM^O5Zrn0j3>_pWz}EVIB-F=>PS-J8;v5h)T&36-O27L`et0bc{Un0*81p! zuy@r7n4K^eJUyD)g?UEijjK=<{BDRU;w!zMJR3Wz{=LPPMN>65(k}ZFmLAkrw%En}MC5(Zhc>1@3?w(G=+To{N3xv)w z)br87@-0;B)kD3y@6!aRMb`Dd?;(Sc$%xqlv0&}g4@x7421er7soU|}%CT&JAI|T4 zZ03m5*Q^i~C4>Olv9WOClu}JeN_?YHFo9aiEECZbpsc2Z0TX&-=8Eym z+WZYt<5SV7VFMv;AT~N)@Tw?9LceVz{J4E?kuf<^OI7S}nQ)DV&q-)QvJKbaZL;d289RIbpZqdvL~ z?1&*>_m%ookjgjioOui@GaJMP<;HX=uZh->kCF#p(OKdp9GiX2e$Z*IiM4+&Lo;Lj zg6HDshwP-B0!6%3JiK-JLA;Ai!uBg`q^fd2SozIXI6QEnF>1-i%JUBc6_ zOVx78%QXOwFG)Ja3vRYvi15lcGok6qL1qWoO%_-cE(RM9%82}%+1I31zV676MQX-N z)Gz;w6MN6Y!Q2H?SC7NAC1a&}e3ZnxpzklpO3%8>#%y>M7_WKkh3ZA+jJg=&*W=5e z-7tSDhUvK{xWD59{x`$`O6*JURbV3iei?#@#B|~QN@8R__nXhrzjvz=|J}%#cUZpa zU&KZUjrTgTv2f;8QbS4VKR4V+h}vTnS?W%;U8*|upr)-VRPrm=Q5KqLil%Vc!Z&Bi zyHvzSCn7F1zOc5|63>ZNi#AYEbXBgEyPOU=y28Jh^)bBz?%M6X(q=JIb9NylUr4~2<*J)(-cohRZ)4)y2Nu2C6migO3mP=ikm@l~SKld}zC&U}k{?act5V-NHh$FZi$n#rX(~5;`FPyFWC1n* z5X|nT1BD{IZ_lU{9D5!NpU7nBHPOKQnFG+VjefBNwQlPbxH$>RpOmvUgv+yhVkA|_ z%}2#p{g~05zb7;D$>mG23#$C3qF<^JyTER!kScb5M9LJXA5UJu>cc;LFjLz4fdka* z<(~qkrKMxXPlpg07>l2eEW&`%J*0lD7mwV=iFM~`a}68EEmiPdq2QYt#4*n4J#;a; zP5l!6?hWT?Tz3e^yYwvd>DL;wzv&AN)%=PMp^@>}vim0dJcZ`OG`;InSf50WPmH7r zxeN`b+sr1Cgi#?^hSwj%G`%l=+BLsIj(6U`pAp~`LSwAQ#x$%7f2q-Y)JRud30uB2$~Vm` z4oQrlet#N_*S>M6!@SWihPKDhfo-9xQX6KEoN(o;H4Hu zb4(QZ9fC z9iD`ZW`4pnt~g{+SD%Lg5`v;Y_%vpDSA5AcpThP%j1v?mshUhs6ezM|Hn3&Gx|rQt z57D8~CH{R+N8fy{#@{6-uaLbV4?@l313Sc|7(L{xSk zBC5c+G_1Mp13Q0#d+w5l%YL6aiF6PX7FA`pVDFvdck#!893O&8At)06-X$wrOa7~l zI+)Z&2guBVt3|L-%v=%?IuCn4G|!V=esa$_#Dykcz&E`xd{Vy(JN6r=9w0Sd=$>=B zI;~_8QL%9+qI$%Ls?+9s{#ZZEP{@@cxJRboo?8qoJ!9CV5JDc*Vts;S)snrESp8al z_kMivm{i4*6V)qH!sxuZN^;?SMV6*kQyhy&)d*|s_+E#{TDQLQZpHDv|FB&Q=WCAU_bTqr7D z-dmN@DmeIbuQDDETEo6c&v_$tC2Sqv3SIgbLq?|oauP)Kf=2a~@b0ynkjY&ftnb<9 
z5|ra>SD0UUAxPd-WE@KPTh?w8g0icwS=HjHAJvPVk5A5w@Q+T8X9PvkG8AYdVRl-r z+|3D!Pxn4Gf}*qtS_$gjx=}r>{;CyJni*0@0U$dIIq~m{nkTi@z_AVgVAH9UMPH;^ zqSdLm*aVzAd_(BzH}BVPD!dr`_P@CS^HbINdN2Z_6^F_E&g7mHhlrpkcZ&kB-MchV z!^W?Tq0*wMh@cp<>M5XcL=d50%1a&VHWSrBSc`nAx2d~GGvQheXg zAty5vIZ4R{enDQQ7Us9FkG)F{3))5!B*rI0x3Lbq-Td+3;xj=cu%hEo58dDTjCw6>`{7{jdoN9Gh4|B{eHTUfP?Zpe zB1DG8!tJFOZPLou9eznUI7!t&$jcT|d7n~z{{|Dix@aF8iETd~LR@qrR8{KYU`+?9_xJL$pMHg-!L+_|@e2nBc2>(ueSLr&EGpcK%iEoQ#YQ{_?=) zsz`PX#Kw7hAtxsXPa9uQ@3~=&pxhmOgwH6|R)l5)4b&+?vw*VT)nf}ve@rSPmkg0NB_p0y z*D9IxF{#oz2a8q#yY^~^l|PJtf_&kX{3OI??-T$>7eDN{^bA>wa_Ba=6GqP(S|;JU zhgY5>J5%rgFLFUQ4o*@v6I6nMx73b3CkNT@-a$r1Mf4Ndl4%eaJR35Z=!Y3qy89OtHUv)iD}ptb?IzppM;}r=X6qVwsQQ$qRQ}H?@UU z3tcQ)KNSu1itoM?5gLP;U6#^Hq6hP$TJkH&Q5C>f_bLX|Py*K+qmYxB#4NEyo|Ic; z3NG4)3d)cxmyRkIK@q4$)4t!PpsRrs%q2{|}P)nKrx(mVFDDvJ@$;io~!U<8%r>tF2Z-oa-f2#Zg{fWe>P z^B&TlX^~g9V$m@4YNdx|Uu}SglW&2kp0}+(ZIa@FH@}J;oTNStZ_3m^Dj5f#1_Np7 z$c`vniIcjOkk>_Vs712|R_H;E7}O4PzV3|;Gk3$;)+6_Q)3c8yQ@yMF(s6K-`ZT;L zT9KH}%uoP8Anc|}<){cc#1sRq(SHqmByA79~DkAsucr$bcaI#Df)m-*W${O^u0LK4$+e;%EZfqjny1o_&LqfF4!NTeD1DN;jVgQ3sNIJ<|0wlwsVcE}GU?7St0AG!zTC zxa5KPMy0|dGDS!f%}`cEw{Fcbd~gS7s@Ey=ak+YiV&yNnns%+ZC&j@@ij!4ch~!WD z%E{Kkcl`{|x2a00_Y<3vi6>t1unI^(8O=@KfVD`SiDq_jB;ZH)tX-+ zj;aP8ld5Eh(-XGQwvl$7G7%I(U0Due+t$bS5lzrXF%^I8xPn6`A0a8Fbjm88KH-wh z%lztaaFXJXz?%LEPe>EG=av`k8`i=4AqMD>m52@N&cXb7$!Ba%Z-XS87x`u3;3UN% z@usMN?^Rq<1}?velG=W|G_H&F{dM8?$Oe0l+(SmzheZtDyz!$KcXmup{K{}}lH$mV zpO|G|vkQZDz&ok!l=RZ)bkjj6nG~$scoETYNkzMJcD{&>6n=h*Uk(mVQXK#Frk=8w zU#WEFcXK~Dh9*gM|K0WLVtR{u*s|p^!lDui{mjFsx&1z`^DDx^Ns6N|P&&dNDe2kR zYUYbao^ev$Idw>$-$4_*cHbxvP^VR0r#FHErw=1cTYf1xI7x97=1t`=Tf{b3r)Oqo zAAS;mZ4Uzwm7F29ztKcP5#yVwVBa72^1W~Q$_q&eX?&{3!AXjvL}ay6rrP{0L?mb zsH-T8^vrC8zstabw=uBukEf4QElw-3C?5w0Cn=5!AuDp5`iecP(acJ!f=6?`F^r(@ z@=@gAB*jr#ghipLPRyu_Lq}%R&R9=OT=L?Ng))1@Y%3!uS3YVSoTNB7I5;>-ad2>O zlH%at;NT?1!NI}7Ns5DmgOd~o2L}fyDGm+}PEs5k92`{x{}*5Yoi5uXc|y+l00000 LNkvXXu0mjfD4>h= diff --git a/src/paperless_tesseract/tests/test_ocr.py 
b/src/paperless_tesseract/tests/test_ocr.py index e0d5726ba..7124fbed6 100644 --- a/src/paperless_tesseract/tests/test_ocr.py +++ b/src/paperless_tesseract/tests/test_ocr.py @@ -1,34 +1,9 @@ import os from unittest import mock, skipIf -import pyocr from django.test import TestCase -from pyocr.libtesseract.tesseract_raw import \ - TesseractError as OtherTesseractError -from ..parsers import image_to_string, strip_excess_whitespace - - -class FakeTesseract(object): - - @staticmethod - def can_detect_orientation(): - return True - - @staticmethod - def detect_orientation(file_handle, lang): - raise OtherTesseractError("arbitrary status", "message") - - @staticmethod - def image_to_string(file_handle, lang): - return "This is test text" - - -class FakePyOcr(object): - - @staticmethod - def get_available_tools(): - return [FakeTesseract] +from ..parsers import strip_excess_whitespace class TestOCR(TestCase): @@ -45,9 +20,6 @@ class TestOCR(TestCase): ) ] - SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") - TESSERACT_INSTALLED = bool(pyocr.get_available_tools()) - def test_strip_excess_whitespace(self): for source, result in self.text_cases: actual_result = strip_excess_whitespace(source) @@ -60,17 +32,3 @@ class TestOCR(TestCase): actual_result ) ) - - @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping") - @mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) - def test_image_to_string_with_text_free_page(self): - """ - This test is sort of silly, since it's really just reproducing an odd - exception thrown by pyocr when it encounters a page with no text. - Actually running this test against an installation of Tesseract results - in a segmentation fault rooted somewhere deep inside pyocr where I - don't care to dig. Regardless, if you run the consumer normally, - text-free pages are now handled correctly so long as we work around - this weird exception. 
- """ - image_to_string([os.path.join(self.SAMPLE_FILES, "no-text.png"), "en"]) diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 6d4323fc2..bc37b0b84 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -6,41 +6,13 @@ from typing import ContextManager from unittest import mock from django.test import TestCase, override_settings -from pyocr.error import TesseractError from documents.parsers import ParseError, run_convert -from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf, image_to_string, OCRError +from paperless_tesseract.parsers import RasterisedDocumentParser, get_text_from_pdf image_to_string_calls = [] -class FakeTesseract(object): - - @staticmethod - def can_detect_orientation(): - return True - - @staticmethod - def detect_orientation(file_handle, lang): - raise TesseractError("arbitrary status", "message") - - @staticmethod - def get_available_languages(): - return ['eng', 'deu'] - - @staticmethod - def image_to_string(file_handle, lang): - image_to_string_calls.append((file_handle.name, lang)) - return file_handle.read() - - -class FakePyOcr(object): - - @staticmethod - def get_available_tools(): - return [FakeTesseract] - - def fake_convert(input_file, output_file, **kwargs): with open(input_file) as f: lines = f.readlines() @@ -50,12 +22,6 @@ def fake_convert(input_file, output_file, **kwargs): f2.write(line.strip()) -def fake_unpaper(pnm): - output = pnm + ".unpaper.pnm" - shutil.copy(pnm, output) - return output - - class FakeImageFile(ContextManager): def __init__(self, fname): self.fname = fname @@ -67,92 +33,6 @@ class FakeImageFile(ContextManager): return os.path.basename(self.fname) -fake_image = FakeImageFile - - -@mock.patch("paperless_tesseract.parsers.pyocr", FakePyOcr) -@mock.patch("paperless_tesseract.parsers.run_convert", fake_convert) 
-@mock.patch("paperless_tesseract.parsers.run_unpaper", fake_unpaper) -@mock.patch("paperless_tesseract.parsers.Image.open", open) -class TestRasterisedDocumentParser(TestCase): - - def setUp(self): - self.scratch = tempfile.mkdtemp() - - global image_to_string_calls - - image_to_string_calls = [] - - override_settings(OCR_LANGUAGE="eng", SCRATCH_DIR=self.scratch).enable() - - def tearDown(self): - shutil.rmtree(self.scratch) - - def get_input_file(self, pages): - _, fname = tempfile.mkstemp(suffix=".pdf", dir=self.scratch) - with open(fname, "w") as f: - f.writelines([f"line {p}\n" for p in range(pages)]) - return fname - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_simple_language_match(self): - parser = RasterisedDocumentParser(self.get_input_file(1), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_2_pages(self): - parser = RasterisedDocumentParser(self.get_input_file(2), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "en") - def test_parse_text_3_pages(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: None) - def test_parse_text_lang_detect_failed(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] 
for args in image_to_string_calls], ["eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "it") - def test_parse_text_lang_not_installed(self): - parser = RasterisedDocumentParser(self.get_input_file(4), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2 line 3") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "eng", "eng", "eng"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") - def test_parse_text_lang_mismatch(self): - parser = RasterisedDocumentParser(self.get_input_file(3), uuid.uuid4()) - text = parser.get_text() - self.assertEqual(text, "line 0 line 1 line 2") - - self.assertListEqual([args[1] for args in image_to_string_calls], ["eng", "deu", "deu", "deu"]) - - @mock.patch("paperless_tesseract.parsers.langdetect.detect", lambda _: "de") - def test_parse_empty_doc(self): - parser = RasterisedDocumentParser(self.get_input_file(0), uuid.uuid4()) - try: - parser.get_text() - except ParseError as e: - self.assertEqual("Empty document, nothing to do.", str(e)) - else: - self.fail("Should raise exception") - - class TestAuxilliaryFunctions(TestCase): def setUp(self): @@ -173,32 +53,7 @@ class TestAuxilliaryFunctions(TestCase): def test_get_text_from_pdf_error(self): text = get_text_from_pdf(os.path.join(self.SAMPLE_FILES, 'simple.png')) - self.assertEqual(text.strip(), "") - - def test_image_to_string(self): - text = image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "eng")) - - self.assertEqual(text, "This is a test document.") - - def test_image_to_string_language_unavailable(self): - try: - image_to_string((os.path.join(self.SAMPLE_FILES, 'simple.png'), "ita")) - except OCRError as e: - self.assertTrue("Failed loading language" in str(e)) - else: - self.fail("Should raise exception") - - @override_settings(OCR_ALWAYS=False) - @mock.patch("paperless_tesseract.parsers.get_text_from_pdf") - 
@mock.patch("paperless_tesseract.parsers.RasterisedDocumentParser._get_greyscale") - def test_is_ocred(self, m2, m): - parser = RasterisedDocumentParser("", uuid.uuid4()) - m.return_value = "lots of text lots of text lots of text lots of text lots of text lots of text " \ - "lots of text lots of text lots of text lots of text lots of text lots of text " \ - "lots of text lots of text lots of text lots of text lots of text lots of text " - parser.get_text() - self.assertEqual(m.call_count, 2) - self.assertEqual(m2.call_count, 0) + self.assertIsNone(text) def test_thumbnail(self): parser = RasterisedDocumentParser(os.path.join(self.SAMPLE_FILES, 'simple.pdf'), uuid.uuid4()) From cb9e5b5ee39f5c8ef7084e8436ad993815ebeb6b Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:18:57 +0100 Subject: [PATCH 07/42] Add metadata field: has archive version --- src/documents/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/documents/views.py b/src/documents/views.py index 4d62ae5c4..381eba3e1 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -169,6 +169,7 @@ class DocumentViewSet(RetrieveModelMixin, "paperless__checksum": doc.checksum, "paperless__mime_type": doc.mime_type, "paperless__filename": doc.filename, + "paperless__has_archive_version": os.path.isfile(doc.archive_path) }) except Document.DoesNotExist: raise Http404() From b1110f7291159baf0e09fb00bde76a70560001a8 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:22:51 +0100 Subject: [PATCH 08/42] update git ignore --- .gitignore | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 4ae903ade..d63794fb3 100644 --- a/.gitignore +++ b/.gitignore @@ -76,16 +76,11 @@ scripts/nuke /static/ # Stored PDFs -/media/documents/originals/* -/media/documents/thumbnails/* - -/data/classification_model.pickle -/data/db.sqlite3 -/data/index - +/media/ +/data/ /paperless.conf -/consume -/export +/consume/ +/export/ 
/src-ui/.vscode # this is where the compiled frontend is moved to. From 81aaadb2a396a3b5a42512cd0b2cebf737dd886d Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:23:57 +0100 Subject: [PATCH 09/42] codestyle --- src/documents/views.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/documents/views.py b/src/documents/views.py index 381eba3e1..87d1d31b1 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -138,7 +138,7 @@ class DocumentViewSet(RetrieveModelMixin, def file_response(self, pk, request, disposition): doc = Document.objects.get(id=pk) mime_type = doc.mime_type - if not self.original_requested(request) and os.path.isfile(doc.archive_path): + if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501 file_handle = doc.archive_file mime_type = 'application/pdf' elif doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: @@ -169,7 +169,8 @@ class DocumentViewSet(RetrieveModelMixin, "paperless__checksum": doc.checksum, "paperless__mime_type": doc.mime_type, "paperless__filename": doc.filename, - "paperless__has_archive_version": os.path.isfile(doc.archive_path) + "paperless__has_archive_version": + os.path.isfile(doc.archive_path) }) except Document.DoesNotExist: raise Http404() From 64180b56684bc180535da8ede6390791527c0202 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 17:28:49 +0100 Subject: [PATCH 10/42] fixed up a test case --- src/documents/tests/test_document_model.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py index 5b27e2643..49e7139c8 100644 --- a/src/documents/tests/test_document_model.py +++ b/src/documents/tests/test_document_model.py @@ -1,12 +1,29 @@ +import os +import shutil +import tempfile +from pathlib import Path from unittest import mock -from django.test import TestCase +from django.test import 
TestCase, override_settings from ..models import Document, Correspondent class TestDocument(TestCase): + def setUp(self) -> None: + self.originals_dir = tempfile.mkdtemp() + self.thumb_dir = tempfile.mkdtemp() + + override_settings( + ORIGINALS_DIR=self.originals_dir, + THUMBNAIL_DIR=self.thumb_dir, + ).enable() + + def tearDown(self) -> None: + shutil.rmtree(self.originals_dir) + shutil.rmtree(self.thumb_dir) + def test_file_deletion(self): document = Document.objects.create( correspondent=Correspondent.objects.create(name="Test0"), @@ -19,6 +36,9 @@ class TestDocument(TestCase): file_path = document.source_path thumb_path = document.thumbnail_path + Path(file_path).touch() + Path(thumb_path).touch() + with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink: document.delete() mock_unlink.assert_any_call(file_path) From 17a581495ccba3f351ec9e6000dc9fdd5052f909 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 18:01:29 +0100 Subject: [PATCH 11/42] proper filenames for originals and archived documents --- src/documents/models.py | 4 ++++ src/documents/views.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/documents/models.py b/src/documents/models.py index c1ab9a44d..2644657a3 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -241,6 +241,10 @@ class Document(models.Model): def file_name(self): return slugify(str(self)) + self.file_type + @property + def archive_file_name(self): + return slugify(str(self)) + ".pdf" + @property def file_type(self): return mimetypes.guess_extension(str(self.mime_type)) diff --git a/src/documents/views.py b/src/documents/views.py index 87d1d31b1..457fa9dc7 100755 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -138,8 +138,10 @@ class DocumentViewSet(RetrieveModelMixin, def file_response(self, pk, request, disposition): doc = Document.objects.get(id=pk) mime_type = doc.mime_type + filename = doc.file_name if not self.original_requested(request) 
and os.path.isfile(doc.archive_path): # NOQA: E501 file_handle = doc.archive_file + filename = doc.archive_file_name mime_type = 'application/pdf' elif doc.storage_type == Document.STORAGE_TYPE_UNENCRYPTED: file_handle = doc.source_file @@ -148,7 +150,7 @@ class DocumentViewSet(RetrieveModelMixin, response = HttpResponse(file_handle, content_type=mime_type) response["Content-Disposition"] = '{}; filename="{}"'.format( - disposition, doc.file_name) + disposition, filename) return response @action(methods=['post'], detail=False) From fe7aa10d2c37371161ac8dc9bb675e088c650758 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Wed, 25 Nov 2020 18:01:43 +0100 Subject: [PATCH 12/42] frontend support for downloading originals --- .../document-detail.component.html | 26 ++++++++++++++----- .../document-detail.component.ts | 7 +++++ .../app/data/paperless-document-metadata.ts | 11 ++++++++ .../src/app/services/rest/document.service.ts | 21 ++++++++++++--- 4 files changed, 55 insertions(+), 10 deletions(-) create mode 100644 src-ui/src/app/data/paperless-document-metadata.ts diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index 9e1f8ad71..474c1376d 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -5,12 +5,26 @@ Delete - - - - - Download - + + + +