diff --git a/docs/advanced_usage.md b/docs/advanced_usage.md index 957d5287e..e279e48af 100644 --- a/docs/advanced_usage.md +++ b/docs/advanced_usage.md @@ -126,6 +126,7 @@ script can access the following relevant environment variables set: | ----------------------- | ------------------------------------------------------------ | | `DOCUMENT_SOURCE_PATH` | Original path of the consumed document | | `DOCUMENT_WORKING_PATH` | Path to a copy of the original that consumption will work on | +| `TASK_ID` | UUID of the task used to process the new document (if any) | !!! note @@ -168,21 +169,22 @@ Executed after the consumer has successfully processed a document and has moved it into paperless. It receives the following environment variables: -| Environment Variable | Description | -| ---------------------------- | --------------------------------------------- | -| `DOCUMENT_ID` | Database primary key of the document | -| `DOCUMENT_FILE_NAME` | Formatted filename, not including paths | -| `DOCUMENT_CREATED` | Date & time when document created | -| `DOCUMENT_MODIFIED` | Date & time when document was last modified | -| `DOCUMENT_ADDED` | Date & time when document was added | -| `DOCUMENT_SOURCE_PATH` | Path to the original document file | -| `DOCUMENT_ARCHIVE_PATH` | Path to the generate archive file (if any) | -| `DOCUMENT_THUMBNAIL_PATH` | Path to the generated thumbnail | -| `DOCUMENT_DOWNLOAD_URL` | URL for document download | -| `DOCUMENT_THUMBNAIL_URL` | URL for the document thumbnail | -| `DOCUMENT_CORRESPONDENT` | Assigned correspondent (if any) | -| `DOCUMENT_TAGS` | Comma separated list of tags applied (if any) | -| `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document | +| Environment Variable | Description | +| ---------------------------- | ---------------------------------------------- | +| `DOCUMENT_ID` | Database primary key of the document | +| `DOCUMENT_FILE_NAME` | Formatted filename, not including paths | +| `DOCUMENT_CREATED` | Date & time when 
document created | +| `DOCUMENT_MODIFIED` | Date & time when document was last modified | +| `DOCUMENT_ADDED` | Date & time when document was added | +| `DOCUMENT_SOURCE_PATH` | Path to the original document file | +| `DOCUMENT_ARCHIVE_PATH` | Path to the generated archive file (if any) | +| `DOCUMENT_THUMBNAIL_PATH` | Path to the generated thumbnail | +| `DOCUMENT_DOWNLOAD_URL` | URL for document download | +| `DOCUMENT_THUMBNAIL_URL` | URL for the document thumbnail | +| `DOCUMENT_CORRESPONDENT` | Assigned correspondent (if any) | +| `DOCUMENT_TAGS` | Comma separated list of tags applied (if any) | +| `DOCUMENT_ORIGINAL_FILENAME` | Filename of original document | +| `TASK_ID` | Task UUID used to import the document (if any) | The script can be in any language, A simple shell script example: diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 08d073a4b..a44de6eca 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -209,6 +209,7 @@ class Consumer(LoggingMixin): script_env = os.environ.copy() script_env["DOCUMENT_SOURCE_PATH"] = original_file_path script_env["DOCUMENT_WORKING_PATH"] = working_file_path + script_env["TASK_ID"] = self.task_id or "" try: completed_proc = run( @@ -279,6 +280,7 @@ class Consumer(LoggingMixin): ",".join(document.tags.all().values_list("name", flat=True)), ) script_env["DOCUMENT_ORIGINAL_FILENAME"] = str(document.original_filename) + script_env["TASK_ID"] = self.task_id or "" try: completed_proc = run( diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 2dbc9d6eb..0f16b717c 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -7,6 +7,7 @@ from typing import Type import tqdm from asgiref.sync import async_to_sync +from celery import Task from celery import shared_task from channels.layers import get_channel_layer from django.conf import settings @@ -91,8 +92,9 @@ def train_classifier(): logger.warning("Classifier error: " + str(e)) -@shared_task +@shared_task(bind=True) 
def consume_file( + self: Task, input_doc: ConsumableDocument, overrides: Optional[DocumentMetadataOverrides] = None, ): @@ -163,6 +165,7 @@ def consume_file( override_created=overrides.created, override_asn=overrides.asn, override_owner_id=overrides.owner_id, + task_id=self.request.id, ) if document: diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index f0e5421cf..8d5f220fc 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -4,6 +4,7 @@ import re import shutil import stat import tempfile +import uuid from unittest import mock from unittest.mock import MagicMock @@ -862,6 +863,7 @@ class PreConsumeTestCase(TestCase): c = Consumer() c.original_path = "path-to-file" c.path = "/tmp/somewhere/path-to-file" + c.task_id = str(uuid.uuid4()) c.run_pre_consume_script() m.assert_called_once() @@ -877,6 +879,7 @@ class PreConsumeTestCase(TestCase): subset = { "DOCUMENT_SOURCE_PATH": c.original_path, "DOCUMENT_WORKING_PATH": c.path, + "TASK_ID": c.task_id, } self.assertDictEqual(environment, {**environment, **subset}) @@ -937,7 +940,10 @@ class PreConsumeTestCase(TestCase): with override_settings(PRE_CONSUME_SCRIPT=script.name): c = Consumer() c.path = "path-to-file" - self.assertRaises(ConsumerError, c.run_pre_consume_script) + self.assertRaises( + ConsumerError, + c.run_pre_consume_script, + ) class PostConsumeTestCase(TestCase): @@ -968,7 +974,11 @@ class PostConsumeTestCase(TestCase): doc = Document.objects.create(title="Test", mime_type="application/pdf") c = Consumer() c.filename = "somefile.pdf" - self.assertRaises(ConsumerError, c.run_post_consume_script, doc) + self.assertRaises( + ConsumerError, + c.run_post_consume_script, + doc, + ) @mock.patch("documents.consumer.run") def test_post_consume_script_simple(self, m): @@ -995,7 +1005,9 @@ class PostConsumeTestCase(TestCase): doc.tags.add(tag1) doc.tags.add(tag2) - Consumer().run_post_consume_script(doc) + consumer = Consumer() + 
consumer.task_id = str(uuid.uuid4()) + consumer.run_post_consume_script(doc) m.assert_called_once() @@ -1017,6 +1029,7 @@ class PostConsumeTestCase(TestCase): "DOCUMENT_THUMBNAIL_URL": f"/api/documents/{doc.pk}/thumb/", "DOCUMENT_CORRESPONDENT": "my_bank", "DOCUMENT_TAGS": "a,b", + "TASK_ID": consumer.task_id, } self.assertDictEqual(environment, {**environment, **subset})