From 3656c36965b018d3b7116726ee2a591f592224c9 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Fri, 21 Feb 2025 10:41:42 -0800 Subject: [PATCH] Testing out a switch to rich to remove tqdm --- .../management/commands/document_archiver.py | 4 +- .../management/commands/document_exporter.py | 36 +++- .../commands/document_fuzzy_match.py | 6 +- .../management/commands/document_importer.py | 154 ++++++++++-------- .../management/commands/document_renamer.py | 9 +- .../management/commands/document_retagger.py | 8 +- .../commands/document_thumbnails.py | 4 +- .../management/commands/prune_audit_logs.py | 9 +- src/documents/sanity_checker.py | 6 +- src/documents/tasks.py | 9 +- 10 files changed, 148 insertions(+), 97 deletions(-) diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py index 1aa52117a..86e79b86c 100644 --- a/src/documents/management/commands/document_archiver.py +++ b/src/documents/management/commands/document_archiver.py @@ -1,10 +1,10 @@ import logging import multiprocessing -import tqdm from django import db from django.conf import settings from django.core.management.base import BaseCommand +from rich.progress import track from documents.management.commands.mixins import MultiProcessMixin from documents.management.commands.mixins import ProgressBarMixin @@ -81,7 +81,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): else: # pragma: no cover with multiprocessing.Pool(self.process_count) as pool: list( - tqdm.tqdm( + track( pool.imap_unordered( update_document_content_maybe_archive_file, document_ids, diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 6dc89479e..6f6fc2815 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -7,7 +7,6 @@ import time from pathlib import Path from typing import TYPE_CHECKING -import tqdm from allauth.mfa.models import Authenticator from allauth.socialaccount.models import SocialAccount from allauth.socialaccount.models import SocialApp @@ -25,6 +24,11 @@ from django.utils import timezone from filelock import FileLock from guardian.models import GroupObjectPermission from guardian.models import UserObjectPermission +from rich.progress import BarColumn +from rich.progress import MofNCompleteColumn +from rich.progress import Progress +from rich.progress import TaskProgressColumn +from rich.progress import TextColumn if TYPE_CHECKING: from django.db.models import QuerySet @@ -229,8 +233,17 @@ class Command(CryptMixin, BaseCommand): try: # Prevent any ongoing changes in the documents - with FileLock(settings.MEDIA_LOCK): - self.dump() + with ( + FileLock(settings.MEDIA_LOCK), + Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + MofNCompleteColumn(), + disable=self.no_progress_bar, + ) as progress, + ): + self.dump(progress) # We've written everything to the temporary directory in this case, # now make an archive in the original target, with all files stored @@ -249,7 +262,7 @@ class Command(CryptMixin, BaseCommand): if self.zip_export and temp_dir is not None: temp_dir.cleanup() - def dump(self): + def dump(self, progress: Progress): # 1. Take a snapshot of what files exist in the current export folder for x in self.target.glob("**/*"): if x.is_file(): @@ -297,11 +310,17 @@ class Command(CryptMixin, BaseCommand): with transaction.atomic(): manifest_dict = {} + serialize_task = progress.add_task( + "Serializing database", + total=len(manifest_key_to_object_query), + ) + # Build an overall manifest for key, object_query in manifest_key_to_object_query.items(): manifest_dict[key] = json.loads( serializers.serialize("json", object_query), ) + progress.advance(serialize_task) self.encrypt_secret_fields(manifest_dict) @@ -313,12 +332,10 @@ class Command(CryptMixin, BaseCommand): } document_manifest = manifest_dict["documents"] + copy_task = progress.add_task("Copying files", total=len(document_manifest)) + # 3. Export files from each document - for index, document_dict in tqdm.tqdm( - enumerate(document_manifest), - total=len(document_manifest), - disable=self.no_progress_bar, - ): + for index, document_dict in enumerate(document_manifest): # 3.1. store files unencrypted document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED @@ -365,6 +382,7 @@ class Command(CryptMixin, BaseCommand): content, manifest_name, ) + progress.advance(copy_task) # These were exported already if self.split_manifest: diff --git a/src/documents/management/commands/document_fuzzy_match.py b/src/documents/management/commands/document_fuzzy_match.py index 9e01ff1b0..ef13eb491 100644 --- a/src/documents/management/commands/document_fuzzy_match.py +++ b/src/documents/management/commands/document_fuzzy_match.py @@ -3,9 +3,9 @@ import multiprocessing from typing import Final import rapidfuzz -import tqdm from django.core.management import BaseCommand from django.core.management import CommandError +from rich.progress import track from documents.management.commands.mixins import MultiProcessMixin from documents.management.commands.mixins import ProgressBarMixin @@ -105,12 +105,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): # Don't spin up a pool of 1 process if self.process_count == 1: results = [] - for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar): + for work in track(work_pkgs, disable=self.no_progress_bar): results.append(_process_and_match(work)) else: # pragma: no cover with multiprocessing.Pool(processes=self.process_count) as pool: results = list( - tqdm.tqdm( + track( pool.imap_unordered(_process_and_match, work_pkgs), total=len(work_pkgs), disable=self.no_progress_bar, diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index 9e3af47e7..cd953a149 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -5,7 +5,6 @@ from collections.abc import Generator from contextlib import contextmanager from pathlib import Path -import tqdm from django.conf import settings from django.contrib.auth.models import Permission from django.contrib.auth.models import User @@ -20,6 +19,7 @@ from django.db import transaction from django.db.models.signals import m2m_changed from django.db.models.signals import post_save from filelock import FileLock +from rich.progress import Progress from documents.file_handling import create_source_path_directory from documents.management.commands.mixins import CryptMixin @@ -138,7 +138,7 @@ class Command(CryptMixin, BaseCommand): pre_check_maybe_not_empty() pre_check_manifest_exists() - def load_manifest_files(self) -> None: + def load_manifest_files(self, progress: Progress) -> None: """ Loads manifest data from the various JSON files for parsing and loading the database """ @@ -148,10 +148,15 @@ class Command(CryptMixin, BaseCommand): self.manifest = json.load(infile) self.manifest_paths.append(main_manifest_path) + split_manifest_task = progress.add_task("Parsing split manifests") + for file in Path(self.source).glob("**/*-manifest.json"): + progress.update(split_manifest_task, visible=True) with file.open() as infile: self.manifest += json.load(infile) self.manifest_paths.append(file) + progress.advance(split_manifest_task) + progress.update(split_manifest_task, total=1, completed=1) def load_metadata(self) -> None: """ @@ -191,7 +196,7 @@ class Command(CryptMixin, BaseCommand): ), ) - def load_data_to_database(self) -> None: + def load_data_to_database(self, progress: Progress) -> None: """ As the name implies, loads data from the JSON file(s) into the database """ @@ -201,7 +206,7 @@ class Command(CryptMixin, BaseCommand): ContentType.objects.all().delete() Permission.objects.all().delete() for manifest_path in self.manifest_paths: - call_command("loaddata", manifest_path) + call_command("loaddata", "-v", "0", manifest_path) except (FieldDoesNotExist, DeserializationError, IntegrityError) as e: self.stdout.write(self.style.ERROR("Database import failed")) if ( @@ -234,55 +239,56 @@ class Command(CryptMixin, BaseCommand): self.manifest_paths = [] self.manifest = [] - self.pre_check() + with Progress(disable=self.no_progress_bar) as progress: + self.pre_check() - self.load_metadata() + self.load_metadata() - self.load_manifest_files() + self.load_manifest_files(progress) - self.check_manifest_validity() + self.check_manifest_validity(progress) - self.decrypt_secret_fields() + self.decrypt_secret_fields() - # see /src/documents/signals/handlers.py - with ( - disable_signal( - post_save, - receiver=update_filename_and_move_files, - sender=Document, - ), - disable_signal( - m2m_changed, - receiver=update_filename_and_move_files, - sender=Document.tags.through, - ), - disable_signal( - post_save, - receiver=update_filename_and_move_files, - sender=CustomFieldInstance, - ), - disable_signal( - post_save, - receiver=check_paths_and_prune_custom_fields, - sender=CustomField, - ), - ): - if settings.AUDIT_LOG_ENABLED: - auditlog.unregister(Document) - auditlog.unregister(Correspondent) - auditlog.unregister(Tag) - auditlog.unregister(DocumentType) - auditlog.unregister(Note) - auditlog.unregister(CustomField) - auditlog.unregister(CustomFieldInstance) + # see /src/documents/signals/handlers.py + with ( + disable_signal( + post_save, + receiver=update_filename_and_move_files, + sender=Document, + ), + disable_signal( + m2m_changed, + receiver=update_filename_and_move_files, + sender=Document.tags.through, + ), + disable_signal( + post_save, + receiver=update_filename_and_move_files, + sender=CustomFieldInstance, + ), + disable_signal( + post_save, + receiver=check_paths_and_prune_custom_fields, + sender=CustomField, + ), + ): + if settings.AUDIT_LOG_ENABLED: + auditlog.unregister(Document) + auditlog.unregister(Correspondent) + auditlog.unregister(Tag) + auditlog.unregister(DocumentType) + auditlog.unregister(Note) + auditlog.unregister(CustomField) + auditlog.unregister(CustomFieldInstance) - # Fill up the database with whatever is in the manifest - self.load_data_to_database() + # Fill up the database with whatever is in the manifest + self.load_data_to_database(progress) - if not self.data_only: - self._import_files_from_manifest() - else: - self.stdout.write(self.style.NOTICE("Data only import completed")) + if not self.data_only: + self._import_files_from_manifest(progress) + else: + self.stdout.write(self.style.NOTICE("Data only import completed")) self.stdout.write("Updating search index...") call_command( @@ -291,7 +297,7 @@ class Command(CryptMixin, BaseCommand): no_progress_bar=self.no_progress_bar, ) - def check_manifest_validity(self) -> None: + def check_manifest_validity(self, progress: Progress) -> None: """ Attempts to verify the manifest is valid. Namely checking the files referred to exist and the files can be read from @@ -335,45 +341,56 @@ class Command(CryptMixin, BaseCommand): f"Failed to read from archive file {doc_archive_path}", ) from e - self.stdout.write("Checking the manifest") + manifest_valid_task = progress.add_task( + "Checking validity", + total=None, + visible=not self.data_only, + ) + + # self.stdout.write("Checking the manifest") for record in self.manifest: # Only check if the document files exist if this is not data only # We don't care about documents for a data only import if not self.data_only and record["model"] == "documents.document": check_document_validity(record) + progress.advance(manifest_valid_task) + progress.update(manifest_valid_task, total=1, completed=1) - def _import_files_from_manifest(self) -> None: + def _import_files_from_manifest(self, progress: Progress) -> None: settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True) settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True) settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True) - self.stdout.write("Copy files into paperless...") + # self.stdout.write("Copy files into paperless...") manifest_documents = list( filter(lambda r: r["model"] == "documents.document", self.manifest), ) + copy_file_task = progress.add_task( + "Copying files", + total=len(manifest_documents), + ) + with FileLock(settings.MEDIA_LOCK): + for record in manifest_documents: + document = Document.objects.get(pk=record["pk"]) - for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar): - document = Document.objects.get(pk=record["pk"]) + doc_file = record[EXPORTER_FILE_NAME] + document_path = self.source / doc_file - doc_file = record[EXPORTER_FILE_NAME] - document_path = self.source / doc_file + if EXPORTER_THUMBNAIL_NAME in record: + thumb_file = record[EXPORTER_THUMBNAIL_NAME] + thumbnail_path = (self.source / thumb_file).resolve() + else: + thumbnail_path = None - if EXPORTER_THUMBNAIL_NAME in record: - thumb_file = record[EXPORTER_THUMBNAIL_NAME] - thumbnail_path = (self.source / thumb_file).resolve() - else: - thumbnail_path = None + if EXPORTER_ARCHIVE_NAME in record: + archive_file = record[EXPORTER_ARCHIVE_NAME] + archive_path = self.source / archive_file + else: + archive_path = None - if EXPORTER_ARCHIVE_NAME in record: - archive_file = record[EXPORTER_ARCHIVE_NAME] - archive_path = self.source / archive_file - else: - archive_path = None + document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED - - with FileLock(settings.MEDIA_LOCK): if Path(document.source_path).is_file(): raise FileExistsError(document.source_path) @@ -406,7 +423,8 @@ class Command(CryptMixin, BaseCommand): # archived files copy_file_with_basic_stats(archive_path, document.archive_path) - document.save() + document.save() + progress.advance(copy_file_task) def decrypt_secret_fields(self) -> None: """ diff --git a/src/documents/management/commands/document_renamer.py b/src/documents/management/commands/document_renamer.py index 2dfca217e..40302084f 100644 --- a/src/documents/management/commands/document_renamer.py +++ b/src/documents/management/commands/document_renamer.py @@ -1,8 +1,8 @@ import logging -import tqdm from django.core.management.base import BaseCommand from django.db.models.signals import post_save +from rich.progress import track from documents.management.commands.mixins import ProgressBarMixin from documents.models import Document @@ -17,9 +17,10 @@ class Command(ProgressBarMixin, BaseCommand): def handle(self, *args, **options): self.handle_progress_bar_mixin(**options) logging.getLogger().handlers[0].level = logging.ERROR - - for document in tqdm.tqdm( - Document.objects.all(), + qs = Document.objects.all() + for document in track( + qs, + total=qs.count(), disable=self.no_progress_bar, ): post_save.send(Document, instance=document, created=False) diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index 10bb54b71..f51b185ef 100644 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -1,7 +1,7 @@ import logging -import tqdm from django.core.management.base import BaseCommand +from rich.progress import track from documents.classifier import load_classifier from documents.management.commands.mixins import ProgressBarMixin @@ -84,7 +84,11 @@ class Command(ProgressBarMixin, BaseCommand): classifier = load_classifier() - for document in tqdm.tqdm(documents, disable=self.no_progress_bar): + for document in track( + documents, + total=documents.count(), + disable=self.no_progress_bar, + ): if options["correspondent"]: set_correspondent( sender=None, diff --git a/src/documents/management/commands/document_thumbnails.py b/src/documents/management/commands/document_thumbnails.py index d4653f0b3..e7bb4045d 100644 --- a/src/documents/management/commands/document_thumbnails.py +++ b/src/documents/management/commands/document_thumbnails.py @@ -2,9 +2,9 @@ import logging import multiprocessing import shutil -import tqdm from django import db from django.core.management.base import BaseCommand +from rich.progress import track from documents.management.commands.mixins import MultiProcessMixin from documents.management.commands.mixins import ProgressBarMixin @@ -76,7 +76,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand): else: # pragma: no cover with multiprocessing.Pool(processes=self.process_count) as pool: list( - tqdm.tqdm( + track( pool.imap_unordered(_process_document, ids), total=len(ids), disable=self.no_progress_bar, diff --git a/src/documents/management/commands/prune_audit_logs.py b/src/documents/management/commands/prune_audit_logs.py index b49f4afc2..3907ab5a4 100644 --- a/src/documents/management/commands/prune_audit_logs.py +++ b/src/documents/management/commands/prune_audit_logs.py @@ -1,7 +1,7 @@ from auditlog.models import LogEntry from django.core.management.base import BaseCommand from django.db import transaction -from tqdm import tqdm +from rich.progress import track from documents.management.commands.mixins import ProgressBarMixin @@ -19,7 +19,10 @@ class Command(BaseCommand, ProgressBarMixin): def handle(self, **options): self.handle_progress_bar_mixin(**options) with transaction.atomic(): - for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar): + for log_entry in track( + LogEntry.objects.all(), + disable=self.no_progress_bar, + ): model_class = log_entry.content_type.model_class() # use global_objects for SoftDeleteModel objects = ( @@ -32,7 +35,7 @@ class Command(BaseCommand, ProgressBarMixin): and not objects.filter(pk=log_entry.object_id).exists() ): log_entry.delete() - tqdm.write( + self.stdout.write( self.style.NOTICE( f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}", ), diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py index 28d2024e7..dd47cfdc4 100644 --- a/src/documents/sanity_checker.py +++ b/src/documents/sanity_checker.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Final from django.conf import settings -from tqdm import tqdm +from rich.progress import track from documents.models import Document @@ -68,7 +68,9 @@ def check_sanity(*, progress=False) -> SanityCheckMessages: if lockfile in present_files: present_files.remove(lockfile) - for doc in tqdm(Document.global_objects.all(), disable=not progress): + qs = Document.global_objects.all() + + for doc in track(qs, total=qs.count(), disable=not progress): # Check sanity of the thumbnail thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve() if not thumbnail_path.exists() or not thumbnail_path.is_file(): diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 052def80f..8977e5af7 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -6,7 +6,6 @@ from datetime import timedelta from pathlib import Path from tempfile import TemporaryDirectory -import tqdm from celery import Task from celery import shared_task from django.conf import settings @@ -16,6 +15,7 @@ from django.db import transaction from django.db.models.signals import post_save from django.utils import timezone from filelock import FileLock +from rich.progress import track from whoosh.writing import AsyncWriter from documents import index @@ -69,7 +69,12 @@ def index_reindex(*, progress_bar_disable=False): ix = index.open_index(recreate=True) with AsyncWriter(ix) as writer: - for document in tqdm.tqdm(documents, disable=progress_bar_disable): + for document in track( + documents, + total=documents.count(), + description="Indexing...", + disable=progress_bar_disable, + ): index.update_document(writer, document)