From b0625cdced9c79f5a3d8931060ba65faadb0e733 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 8 Dec 2022 18:39:14 -0800 Subject: [PATCH 1/4] Updates runners to Ubuntu 22.04, locks pipenv to 2022.11.30, reduces number of image cleans run --- .github/workflows/ci.yml | 24 +++++++++---------- .github/workflows/cleanup-tags.yml | 7 +----- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/installer-library.yml | 2 +- .github/workflows/project-actions.yml | 4 ++-- .github/workflows/release-chart.yml | 2 +- .../workflows/reusable-workflow-builder.yml | 2 +- Dockerfile | 2 +- 8 files changed, 20 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 23ace6a3a..6a12189a5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ on: jobs: pre-commit: name: Linting Checks - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout repository @@ -34,7 +34,7 @@ jobs: documentation: name: "Build Documentation" - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - pre-commit steps: @@ -44,7 +44,7 @@ jobs: - name: Install pipenv run: | - pipx install pipenv==2022.10.12 + pipx install pipenv==2022.11.30 - name: Set up Python uses: actions/setup-python@v4 @@ -73,7 +73,7 @@ jobs: documentation-deploy: name: "Deploy Documentation" - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 if: github.event_name == 'push' && github.ref == 'refs/heads/main' needs: - documentation @@ -92,7 +92,7 @@ jobs: tests-backend: name: "Tests (${{ matrix.python-version }})" - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - pre-commit strategy: @@ -124,7 +124,7 @@ jobs: - name: Install pipenv run: | - pipx install pipenv==2022.10.12 + pipx install pipenv==2022.11.30 - name: Set up Python uses: actions/setup-python@v4 @@ -181,7 +181,7 @@ jobs: tests-frontend: name: "Tests Frontend" - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - pre-commit strategy: @@ -201,7 +201,7 @@ jobs: prepare-docker-build: name: Prepare Docker Pipeline Data if: github.event_name == 'push' && (startsWith(github.ref, 'refs/heads/feature-') || github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/beta' || contains(github.ref, 'beta.rc') || startsWith(github.ref, 'refs/tags/v')) - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 # If the push triggered the installer library workflow, wait for it to # complete here. This ensures the required versions for the final # image have been built, while not waiting at all if the versions haven't changed @@ -278,7 +278,7 @@ jobs: # build and push image to docker hub. 
build-docker-image: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 concurrency: group: ${{ github.workflow }}-build-docker-image-${{ github.ref_name }} cancel-in-progress: true @@ -383,7 +383,7 @@ jobs: build-release: needs: - build-docker-image - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout @@ -462,7 +462,7 @@ jobs: path: dist/paperless-ngx.tar.xz publish-release: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 outputs: prerelease: ${{ steps.get_version.outputs.prerelease }} changelog: ${{ steps.create-release.outputs.body }} @@ -511,7 +511,7 @@ jobs: asset_content_type: application/x-xz append-changelog: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: - publish-release if: needs.publish-release.outputs.prerelease == 'false' diff --git a/.github/workflows/cleanup-tags.yml b/.github/workflows/cleanup-tags.yml index 2b63b3000..4a5deeb9c 100644 --- a/.github/workflows/cleanup-tags.yml +++ b/.github/workflows/cleanup-tags.yml @@ -6,12 +6,7 @@ name: Cleanup Image Tags on: - schedule: - - cron: '0 0 * * SAT' delete: - pull_request: - types: - - closed push: paths: - ".github/workflows/cleanup-tags.yml" @@ -26,7 +21,7 @@ concurrency: jobs: cleanup-images: name: Cleanup Image Tags for ${{ matrix.primary-name }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 strategy: matrix: include: diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index c742807e9..e1c6eebb8 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -23,7 +23,7 @@ on: jobs: analyze: name: Analyze - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: actions: read contents: read diff --git a/.github/workflows/installer-library.yml b/.github/workflows/installer-library.yml index f761b0315..33e81a3d1 100644 --- a/.github/workflows/installer-library.yml +++ b/.github/workflows/installer-library.yml @@ -34,7 +34,7 @@ concurrency: jobs: prepare-docker-build: name: Prepare Docker Image Version Data - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Set ghcr repository name diff --git a/.github/workflows/project-actions.yml b/.github/workflows/project-actions.yml index 2d3e4549e..717b3c2d3 100644 --- a/.github/workflows/project-actions.yml +++ b/.github/workflows/project-actions.yml @@ -24,7 +24,7 @@ env: jobs: issue_opened_or_reopened: name: issue_opened_or_reopened - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 if: github.event_name == 'issues' && (github.event.action == 'opened' || github.event.action == 'reopened') steps: - name: Add issue to project and set status to ${{ env.todo }} @@ -37,7 +37,7 @@ jobs: status_value: ${{ env.todo }} # Target status pr_opened_or_reopened: name: pr_opened_or_reopened - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: # write permission is required for autolabeler pull-requests: write diff --git a/.github/workflows/release-chart.yml b/.github/workflows/release-chart.yml index 9e21850ca..a82ac4ed8 100644 --- a/.github/workflows/release-chart.yml +++ b/.github/workflows/release-chart.yml @@ -9,7 +9,7 @@ on: jobs: release_chart: name: "Release Chart" - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/reusable-workflow-builder.yml b/.github/workflows/reusable-workflow-builder.yml index ba1358e56..37d51b122 100644 --- a/.github/workflows/reusable-workflow-builder.yml +++ b/.github/workflows/reusable-workflow-builder.yml @@ -21,7 +21,7 @@ concurrency: jobs: build-image: name: Build 
${{ fromJSON(inputs.build-json).name }} @ ${{ fromJSON(inputs.build-json).version }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout diff --git a/Dockerfile b/Dockerfile index 32598c483..28764aef3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,7 +45,7 @@ COPY Pipfile* ./ RUN set -eux \ && echo "Installing pipenv" \ - && python3 -m pip install --no-cache-dir --upgrade pipenv==2022.10.12 \ + && python3 -m pip install --no-cache-dir --upgrade pipenv==2022.11.30 \ && echo "Generating requirement.txt" \ && pipenv requirements > requirements.txt From 97d6503fefc5737028637c39a2c1f33dd1e12904 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 22 Nov 2022 09:59:59 -0800 Subject: [PATCH 2/4] Switches task serialization over to pickle format --- ...remove_paperlesstask_task_args_and_more.py | 21 ++++ src/documents/models.py | 14 --- src/documents/signals/handlers.py | 26 ++--- src/documents/tests/test_api.py | 12 --- src/documents/tests/test_task_signals.py | 101 +++++++++++++----- src/paperless/settings.py | 4 + 6 files changed, 109 insertions(+), 69 deletions(-) create mode 100644 src/documents/migrations/1028_remove_paperlesstask_task_args_and_more.py diff --git a/src/documents/migrations/1028_remove_paperlesstask_task_args_and_more.py b/src/documents/migrations/1028_remove_paperlesstask_task_args_and_more.py new file mode 100644 index 000000000..83f3eadfa --- /dev/null +++ b/src/documents/migrations/1028_remove_paperlesstask_task_args_and_more.py @@ -0,0 +1,21 @@ +# Generated by Django 4.1.3 on 2022-11-22 17:50 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("documents", "1027_remove_paperlesstask_attempted_task_and_more"), + ] + + operations = [ + migrations.RemoveField( + model_name="paperlesstask", + name="task_args", + ), + migrations.RemoveField( + model_name="paperlesstask", + name="task_kwargs", + ), + ] diff --git a/src/documents/models.py b/src/documents/models.py index 1ee6dfedb..a59340e5b 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -560,20 +560,6 @@ class PaperlessTask(models.Model): help_text=_("Name of the Task which was run"), ) - task_args = models.JSONField( - null=True, - verbose_name=_("Task Positional Arguments"), - help_text=_( - "JSON representation of the positional arguments used with the task", - ), - ) - task_kwargs = models.JSONField( - null=True, - verbose_name=_("Task Named Arguments"), - help_text=_( - "JSON representation of the named arguments used with the task", - ), - ) status = models.CharField( max_length=30, default=states.PENDING, diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index c28ea8e2b..4c2554f02 100644 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -1,7 +1,6 @@ import logging import os import shutil -from ast import literal_eval from pathlib import Path from celery import states @@ -521,25 +520,24 @@ def add_to_index(sender, document, **kwargs): def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs): """ Creates the PaperlessTask object in a pending state. This is sent before - the task reaches the broker, but + the task reaches the broker, but before it begins executing on a worker. 
https://docs.celeryq.dev/en/stable/userguide/signals.html#before-task-publish + https://docs.celeryq.dev/en/stable/internals/protocol.html#version-2 + """ if "task" not in headers or headers["task"] != "documents.tasks.consume_file": # Assumption: this is only ever a v2 message return try: - task_file_name = "" - if headers["kwargsrepr"] is not None: - task_kwargs = literal_eval(headers["kwargsrepr"]) - if "override_filename" in task_kwargs: - task_file_name = task_kwargs["override_filename"] - else: - task_kwargs = None + task_args = body[0] + task_kwargs = body[1] - task_args = literal_eval(headers["argsrepr"]) + task_file_name = "" + if "override_filename" in task_kwargs: + task_file_name = task_kwargs["override_filename"] # Nothing was found, report the task first argument if not len(task_file_name): @@ -552,8 +550,6 @@ def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs): status=states.PENDING, task_file_name=task_file_name, task_name=headers["task"], - task_args=task_args, - task_kwargs=task_kwargs, result=None, date_created=timezone.now(), date_started=None, @@ -562,7 +558,7 @@ def before_task_publish_handler(sender=None, headers=None, body=None, **kwargs): except Exception as e: # pragma: no cover # Don't let an exception in the signal handlers prevent # a document from being consumed. - logger.error(f"Creating PaperlessTask failed: {e}") + logger.error(f"Creating PaperlessTask failed: {e}", exc_info=True) @task_prerun.connect @@ -584,7 +580,7 @@ def task_prerun_handler(sender=None, task_id=None, task=None, **kwargs): except Exception as e: # pragma: no cover # Don't let an exception in the signal handlers prevent # a document from being consumed. - logger.error(f"Setting PaperlessTask started failed: {e}") + logger.error(f"Setting PaperlessTask started failed: {e}", exc_info=True) @task_postrun.connect @@ -607,4 +603,4 @@ def task_postrun_handler( except Exception as e: # pragma: no cover # Don't let an exception in the signal handlers prevent # a document from being consumed. 
- logger.error(f"Updating PaperlessTask failed: {e}") + logger.error(f"Updating PaperlessTask failed: {e}", exc_info=True) diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index c9d8aefc2..0f890249c 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -3043,16 +3043,6 @@ class TestTasks(APITestCase): task_file_name="test.pdf", task_name="documents.tasks.some_task", status=celery.states.SUCCESS, - task_args=("/tmp/paperless/paperless-upload-5iq7skzc",), - task_kwargs={ - "override_filename": "test.pdf", - "override_title": None, - "override_correspondent_id": None, - "override_document_type_id": None, - "override_tag_ids": None, - "task_id": "466e8fe7-7193-4698-9fff-72f0340e2082", - "override_created": None, - }, ) response = self.client.get(self.ENDPOINT) @@ -3079,8 +3069,6 @@ class TestTasks(APITestCase): task_file_name="anothertest.pdf", task_name="documents.tasks.some_task", status=celery.states.SUCCESS, - task_args=("/consume/anothertest.pdf",), - task_kwargs={"override_tag_ids": None}, ) response = self.client.get(self.ENDPOINT) diff --git a/src/documents/tests/test_task_signals.py b/src/documents/tests/test_task_signals.py index 8aafc1f12..e21879802 100644 --- a/src/documents/tests/test_task_signals.py +++ b/src/documents/tests/test_task_signals.py @@ -28,6 +28,14 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase): "ignore_result": False, } + BODY_CONSUME = ( + # args + ("/consume/hello-999.pdf",), + # kwargs + {"override_tag_ids": None}, + {"callbacks": None, "errbacks": None, "chain": None, "chord": None}, + ) + HEADERS_WEB_UI = { "lang": "py", "task": "documents.tasks.consume_file", @@ -47,64 +55,90 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase): "ignore_result": False, } - def util_call_before_task_publish_handler(self, headers_to_use): + BODY_WEB_UI = ( + # args + ("/tmp/paperless/paperless-upload-st9lmbvx",), + # kwargs + { + "override_filename": "statement.pdf", + "override_title": None, + "override_correspondent_id": None, + "override_document_type_id": None, + "override_tag_ids": None, + "task_id": "f5622ca9-3707-4ed0-b418-9680b912572f", + "override_created": None, + }, + {"callbacks": None, "errbacks": None, "chain": None, "chord": None}, + ) + + def util_call_before_task_publish_handler(self, headers_to_use, body_to_use): + """ + Simple utility to call the pre-run handle and ensure it created a single task + instance + """ self.assertEqual(PaperlessTask.objects.all().count(), 0) - before_task_publish_handler(headers=headers_to_use) + before_task_publish_handler(headers=headers_to_use, body=body_to_use) self.assertEqual(PaperlessTask.objects.all().count(), 1) def test_before_task_publish_handler_consume(self): """ GIVEN: - - A celery task completed with an exception + - A celery task is started via the consume folder WHEN: - - API call is made to get tasks + - Task before publish handler is called THEN: - - The returned result is the exception info + - The task is created and marked as pending """ - self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_CONSUME) + self.util_call_before_task_publish_handler( + headers_to_use=self.HEADERS_CONSUME, + body_to_use=self.BODY_CONSUME, + ) task = PaperlessTask.objects.get() self.assertIsNotNone(task) self.assertEqual(self.HEADERS_CONSUME["id"], task.task_id) - self.assertListEqual(["/consume/hello-999.pdf"], task.task_args) - self.assertDictEqual({"override_tag_ids": None}, task.task_kwargs) self.assertEqual("hello-999.pdf", 
task.task_file_name) self.assertEqual("documents.tasks.consume_file", task.task_name) self.assertEqual(celery.states.PENDING, task.status) def test_before_task_publish_handler_webui(self): - - self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_WEB_UI) + """ + GIVEN: + - A celery task is started via the web ui + WHEN: + - Task before publish handler is called + THEN: + - The task is created and marked as pending + """ + self.util_call_before_task_publish_handler( + headers_to_use=self.HEADERS_WEB_UI, + body_to_use=self.BODY_WEB_UI, + ) task = PaperlessTask.objects.get() self.assertIsNotNone(task) self.assertEqual(self.HEADERS_WEB_UI["id"], task.task_id) - self.assertListEqual( - ["/tmp/paperless/paperless-upload-st9lmbvx"], - task.task_args, - ) - self.assertDictEqual( - { - "override_filename": "statement.pdf", - "override_title": None, - "override_correspondent_id": None, - "override_document_type_id": None, - "override_tag_ids": None, - "task_id": "f5622ca9-3707-4ed0-b418-9680b912572f", - "override_created": None, - }, - task.task_kwargs, - ) self.assertEqual("statement.pdf", task.task_file_name) self.assertEqual("documents.tasks.consume_file", task.task_name) self.assertEqual(celery.states.PENDING, task.status) def test_task_prerun_handler(self): - self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_CONSUME) + """ + GIVEN: + - A celery task is started via the consume folder + WHEN: + - Task starts execution + THEN: + - The task is marked as started + """ + self.util_call_before_task_publish_handler( + headers_to_use=self.HEADERS_CONSUME, + body_to_use=self.BODY_CONSUME, + ) task_prerun_handler(task_id=self.HEADERS_CONSUME["id"]) @@ -113,7 +147,18 @@ class TestTaskSignalHandler(DirectoriesMixin, TestCase): self.assertEqual(celery.states.STARTED, task.status) def test_task_postrun_handler(self): - self.util_call_before_task_publish_handler(headers_to_use=self.HEADERS_CONSUME) + """ + GIVEN: + - A celery task is started via the consume folder + WHEN: + - Task finished execution + THEN: + - The task is marked as started + """ + self.util_call_before_task_publish_handler( + headers_to_use=self.HEADERS_CONSUME, + body_to_use=self.BODY_CONSUME, + ) task_postrun_handler( task_id=self.HEADERS_CONSUME["id"], diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 40c7a5c3b..0bdec4745 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -526,6 +526,10 @@ CELERY_RESULT_EXTENDED = True CELERY_RESULT_BACKEND = "django-db" CELERY_CACHE_BACKEND = "default" +# This allows types to stay types through a .delay +CELERY_TASK_SERIALIZER = "pickle" +CELERY_ACCEPT_CONTENT = ["application/x-python-serialize"] + CELERY_BEAT_SCHEDULE = { # Every ten minutes "Check all e-mail accounts": { From ce38e4ae081b9afb5be49bfdc56a67f7b5237c17 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Wed, 7 Dec 2022 19:25:53 -0800 Subject: [PATCH 3/4] Actually no need for pickle anyway --- src/paperless/settings.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 0bdec4745..db26ba3b3 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -526,9 +526,6 @@ CELERY_RESULT_EXTENDED = True CELERY_RESULT_BACKEND = "django-db" CELERY_CACHE_BACKEND = "default" -# This allows types to stay types through a .delay -CELERY_TASK_SERIALIZER = "pickle" -CELERY_ACCEPT_CONTENT = ["application/x-python-serialize"] CELERY_BEAT_SCHEDULE = { # Every 
ten minutes From b6dd36a439b3b8a2e101f8dd24367e20b8c39663 Mon Sep 17 00:00:00 2001 From: Trenton Holmes <797416+stumpylog@users.noreply.github.com> Date: Thu, 8 Dec 2022 18:48:43 -0800 Subject: [PATCH 4/4] Notes a TODO for a later library release to remove a workaround --- src/documents/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/documents/tasks.py b/src/documents/tasks.py index f5bad665a..b5dc264fb 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -101,6 +101,8 @@ def consume_file( # Celery converts this to a string, but everything expects a datetime # Long term solution is to not use JSON for the serializer but pickle instead + # TODO: This will be resolved in kombu 5.3, expected with celery 5.3 + # More types will be retained through JSON encode/decode if override_created is not None and isinstance(override_created, str): try: override_created = datetime.fromisoformat(override_created)
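
Note (not part of any patch above): a minimal, self-contained sketch of the argument-extraction pattern that patch 2 introduces in before_task_publish_handler. With Celery's task message protocol v2, the positional and keyword arguments travel in the message body as real Python objects, so the handler can unpack body instead of running literal_eval over the argsrepr/kwargsrepr headers. The handler name and sample message below are illustrative; the header key, task name, and body layout mirror the diff and the HEADERS_CONSUME/BODY_CONSUME test fixtures, while the task id is a placeholder.

# Sketch only -- not applied by any patch above. Demonstrates reading
# task args/kwargs from the protocol-v2 message body, as the reworked
# before_task_publish_handler in patch 2 does, instead of calling
# literal_eval on the argsrepr/kwargsrepr headers.
from celery.signals import before_task_publish


@before_task_publish.connect
def sketch_before_task_publish(sender=None, headers=None, body=None, **kwargs):
    if not headers or headers.get("task") != "documents.tasks.consume_file":
        return

    # Protocol v2: body is (args, kwargs, embed) and keeps Python types.
    task_args, task_kwargs, _embed = body

    # Prefer the override_filename kwarg; otherwise fall back to the
    # first positional argument (the source path), as the real handler does.
    task_file_name = task_kwargs.get("override_filename") or (
        task_args[0] if task_args else ""
    )
    print(f"pending task {headers['id']}: {task_file_name}")


if __name__ == "__main__":
    # Drive the handler directly with data shaped like the
    # HEADERS_CONSUME / BODY_CONSUME fixtures from the tests;
    # the id below is a placeholder, not a fixture value.
    sketch_before_task_publish(
        headers={"id": "00000000-0000-0000-0000-000000000000",
                 "task": "documents.tasks.consume_file"},
        body=(
            ("/consume/hello-999.pdf",),
            {"override_tag_ids": None},
            {"callbacks": None, "errbacks": None, "chain": None, "chord": None},
        ),
    )

Because the body already carries native types, the model no longer needs the task_args/task_kwargs JSON fields, which is what the 1028 migration in patch 2 removes.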
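
A second sketch, for the serializer question that patches 2 through 4 circle around: patch 2 briefly switched CELERY_TASK_SERIALIZER to pickle so that argument types survive a .delay(), patch 3 reverted that, and patch 4 notes that kombu 5.3 (expected with Celery 5.3) should retain more types through JSON. Until then the workaround in consume_file stands: a datetime passed as override_created comes back from the JSON round trip as an ISO 8601 string and is parsed with datetime.fromisoformat. The snippet below imitates that round trip with the standard json module rather than Celery itself; the isoformat default hook is an assumption standing in for kombu's JSON encoder.

# Sketch only -- imitates the JSON round trip that motivates the
# override_created workaround shown in the consume_file hunk above.
# The default=... hook is a stand-in for the broker-side JSON encoder.
import json
from datetime import datetime, timezone


def round_trip(value):
    # Serialize the way a JSON task message would, then deserialize.
    encoded = json.dumps(value, default=lambda o: o.isoformat())
    return json.loads(encoded)


override_created = datetime(2022, 12, 8, 18, 48, tzinfo=timezone.utc)
received = round_trip(override_created)

# The datetime arrives as a plain string ...
assert isinstance(received, str)

# ... so the task parses it back, as the patched consume_file does.
if received is not None and isinstance(received, str):
    received = datetime.fromisoformat(received)

assert received == override_created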