Testing out a switch to rich to remove tqdm

This commit is contained in:
Trenton H 2025-02-21 10:41:42 -08:00
parent c122c60d3f
commit 3656c36965
10 changed files with 148 additions and 97 deletions

View File

@ -1,10 +1,10 @@
import logging
import multiprocessing
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from rich.progress import track
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
@ -81,7 +81,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
list(
tqdm.tqdm(
track(
pool.imap_unordered(
update_document_content_maybe_archive_file,
document_ids,

View File

@ -7,7 +7,6 @@ import time
from pathlib import Path
from typing import TYPE_CHECKING
import tqdm
from allauth.mfa.models import Authenticator
from allauth.socialaccount.models import SocialAccount
from allauth.socialaccount.models import SocialApp
@ -25,6 +24,11 @@ from django.utils import timezone
from filelock import FileLock
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from rich.progress import BarColumn
from rich.progress import MofNCompleteColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
if TYPE_CHECKING:
from django.db.models import QuerySet
@ -229,8 +233,17 @@ class Command(CryptMixin, BaseCommand):
try:
# Prevent any ongoing changes in the documents
with FileLock(settings.MEDIA_LOCK):
self.dump()
with (
FileLock(settings.MEDIA_LOCK),
Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
MofNCompleteColumn(),
disable=self.no_progress_bar,
) as progress,
):
self.dump(progress)
# We've written everything to the temporary directory in this case,
# now make an archive in the original target, with all files stored
@ -249,7 +262,7 @@ class Command(CryptMixin, BaseCommand):
if self.zip_export and temp_dir is not None:
temp_dir.cleanup()
def dump(self):
def dump(self, progress: Progress):
# 1. Take a snapshot of what files exist in the current export folder
for x in self.target.glob("**/*"):
if x.is_file():
@ -297,11 +310,17 @@ class Command(CryptMixin, BaseCommand):
with transaction.atomic():
manifest_dict = {}
serialize_task = progress.add_task(
"Serializing database",
total=len(manifest_key_to_object_query),
)
# Build an overall manifest
for key, object_query in manifest_key_to_object_query.items():
manifest_dict[key] = json.loads(
serializers.serialize("json", object_query),
)
progress.advance(serialize_task)
self.encrypt_secret_fields(manifest_dict)
@ -313,12 +332,10 @@ class Command(CryptMixin, BaseCommand):
}
document_manifest = manifest_dict["documents"]
copy_task = progress.add_task("Copying files", total=len(document_manifest))
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=self.no_progress_bar,
):
for index, document_dict in enumerate(document_manifest):
# 3.1. store files unencrypted
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED
@ -365,6 +382,7 @@ class Command(CryptMixin, BaseCommand):
content,
manifest_name,
)
progress.advance(copy_task)
# These were exported already
if self.split_manifest:

View File

@ -3,9 +3,9 @@ import multiprocessing
from typing import Final
import rapidfuzz
import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError
from rich.progress import track
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
@ -105,12 +105,12 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
# Don't spin up a pool of 1 process
if self.process_count == 1:
results = []
for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
for work in track(work_pkgs, disable=self.no_progress_bar):
results.append(_process_and_match(work))
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
results = list(
tqdm.tqdm(
track(
pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs),
disable=self.no_progress_bar,

View File

@ -5,7 +5,6 @@ from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
@ -20,6 +19,7 @@ from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock
from rich.progress import Progress
from documents.file_handling import create_source_path_directory
from documents.management.commands.mixins import CryptMixin
@ -138,7 +138,7 @@ class Command(CryptMixin, BaseCommand):
pre_check_maybe_not_empty()
pre_check_manifest_exists()
def load_manifest_files(self) -> None:
def load_manifest_files(self, progress: Progress) -> None:
"""
Loads manifest data from the various JSON files for parsing and loading the database
"""
@ -148,10 +148,15 @@ class Command(CryptMixin, BaseCommand):
self.manifest = json.load(infile)
self.manifest_paths.append(main_manifest_path)
split_manifest_task = progress.add_task("Parsing split manifests")
for file in Path(self.source).glob("**/*-manifest.json"):
progress.update(split_manifest_task, visible=True)
with file.open() as infile:
self.manifest += json.load(infile)
self.manifest_paths.append(file)
progress.advance(split_manifest_task)
progress.update(split_manifest_task, total=1, completed=1)
def load_metadata(self) -> None:
"""
@ -191,7 +196,7 @@ class Command(CryptMixin, BaseCommand):
),
)
def load_data_to_database(self) -> None:
def load_data_to_database(self, progress: Progress) -> None:
"""
As the name implies, loads data from the JSON file(s) into the database
"""
@ -201,7 +206,7 @@ class Command(CryptMixin, BaseCommand):
ContentType.objects.all().delete()
Permission.objects.all().delete()
for manifest_path in self.manifest_paths:
call_command("loaddata", manifest_path)
call_command("loaddata", "-v", "0", manifest_path)
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
self.stdout.write(self.style.ERROR("Database import failed"))
if (
@ -234,55 +239,56 @@ class Command(CryptMixin, BaseCommand):
self.manifest_paths = []
self.manifest = []
self.pre_check()
with Progress(disable=self.no_progress_bar) as progress:
self.pre_check()
self.load_metadata()
self.load_metadata()
self.load_manifest_files()
self.load_manifest_files(progress)
self.check_manifest_validity()
self.check_manifest_validity(progress)
self.decrypt_secret_fields()
self.decrypt_secret_fields()
# see /src/documents/signals/handlers.py
with (
disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=Document,
),
disable_signal(
m2m_changed,
receiver=update_filename_and_move_files,
sender=Document.tags.through,
),
disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=CustomFieldInstance,
),
disable_signal(
post_save,
receiver=check_paths_and_prune_custom_fields,
sender=CustomField,
),
):
if settings.AUDIT_LOG_ENABLED:
auditlog.unregister(Document)
auditlog.unregister(Correspondent)
auditlog.unregister(Tag)
auditlog.unregister(DocumentType)
auditlog.unregister(Note)
auditlog.unregister(CustomField)
auditlog.unregister(CustomFieldInstance)
# see /src/documents/signals/handlers.py
with (
disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=Document,
),
disable_signal(
m2m_changed,
receiver=update_filename_and_move_files,
sender=Document.tags.through,
),
disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=CustomFieldInstance,
),
disable_signal(
post_save,
receiver=check_paths_and_prune_custom_fields,
sender=CustomField,
),
):
if settings.AUDIT_LOG_ENABLED:
auditlog.unregister(Document)
auditlog.unregister(Correspondent)
auditlog.unregister(Tag)
auditlog.unregister(DocumentType)
auditlog.unregister(Note)
auditlog.unregister(CustomField)
auditlog.unregister(CustomFieldInstance)
# Fill up the database with whatever is in the manifest
self.load_data_to_database()
# Fill up the database with whatever is in the manifest
self.load_data_to_database(progress)
if not self.data_only:
self._import_files_from_manifest()
else:
self.stdout.write(self.style.NOTICE("Data only import completed"))
if not self.data_only:
self._import_files_from_manifest(progress)
else:
self.stdout.write(self.style.NOTICE("Data only import completed"))
self.stdout.write("Updating search index...")
call_command(
@ -291,7 +297,7 @@ class Command(CryptMixin, BaseCommand):
no_progress_bar=self.no_progress_bar,
)
def check_manifest_validity(self) -> None:
def check_manifest_validity(self, progress: Progress) -> None:
"""
Attempts to verify the manifest is valid. Namely checking the files
referred to exist and the files can be read from
@ -335,45 +341,56 @@ class Command(CryptMixin, BaseCommand):
f"Failed to read from archive file {doc_archive_path}",
) from e
self.stdout.write("Checking the manifest")
manifest_valid_task = progress.add_task(
"Checking validity",
total=None,
visible=not self.data_only,
)
# self.stdout.write("Checking the manifest")
for record in self.manifest:
# Only check if the document files exist if this is not data only
# We don't care about documents for a data only import
if not self.data_only and record["model"] == "documents.document":
check_document_validity(record)
progress.advance(manifest_valid_task)
progress.update(manifest_valid_task, total=1, completed=1)
def _import_files_from_manifest(self) -> None:
def _import_files_from_manifest(self, progress: Progress) -> None:
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
self.stdout.write("Copy files into paperless...")
# self.stdout.write("Copy files into paperless...")
manifest_documents = list(
filter(lambda r: r["model"] == "documents.document", self.manifest),
)
copy_file_task = progress.add_task(
"Copying files",
total=len(manifest_documents),
)
with FileLock(settings.MEDIA_LOCK):
for record in manifest_documents:
document = Document.objects.get(pk=record["pk"])
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
document = Document.objects.get(pk=record["pk"])
doc_file = record[EXPORTER_FILE_NAME]
document_path = self.source / doc_file
doc_file = record[EXPORTER_FILE_NAME]
document_path = self.source / doc_file
if EXPORTER_THUMBNAIL_NAME in record:
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
thumbnail_path = (self.source / thumb_file).resolve()
else:
thumbnail_path = None
if EXPORTER_THUMBNAIL_NAME in record:
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
thumbnail_path = (self.source / thumb_file).resolve()
else:
thumbnail_path = None
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
archive_path = self.source / archive_file
else:
archive_path = None
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
archive_path = self.source / archive_file
else:
archive_path = None
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
with FileLock(settings.MEDIA_LOCK):
if Path(document.source_path).is_file():
raise FileExistsError(document.source_path)
@ -406,7 +423,8 @@ class Command(CryptMixin, BaseCommand):
# archived files
copy_file_with_basic_stats(archive_path, document.archive_path)
document.save()
document.save()
progress.advance(copy_file_task)
def decrypt_secret_fields(self) -> None:
"""

View File

@ -1,8 +1,8 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from rich.progress import track
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
@ -17,9 +17,10 @@ class Command(ProgressBarMixin, BaseCommand):
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(
Document.objects.all(),
qs = Document.objects.all()
for document in track(
qs,
total=qs.count(),
disable=self.no_progress_bar,
):
post_save.send(Document, instance=document, created=False)

View File

@ -1,7 +1,7 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from rich.progress import track
from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin
@ -84,7 +84,11 @@ class Command(ProgressBarMixin, BaseCommand):
classifier = load_classifier()
for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
for document in track(
documents,
total=documents.count(),
disable=self.no_progress_bar,
):
if options["correspondent"]:
set_correspondent(
sender=None,

View File

@ -2,9 +2,9 @@ import logging
import multiprocessing
import shutil
import tqdm
from django import db
from django.core.management.base import BaseCommand
from rich.progress import track
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
@ -76,7 +76,7 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
list(
tqdm.tqdm(
track(
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=self.no_progress_bar,

View File

@ -1,7 +1,7 @@
from auditlog.models import LogEntry
from django.core.management.base import BaseCommand
from django.db import transaction
from tqdm import tqdm
from rich.progress import track
from documents.management.commands.mixins import ProgressBarMixin
@ -19,7 +19,10 @@ class Command(BaseCommand, ProgressBarMixin):
def handle(self, **options):
self.handle_progress_bar_mixin(**options)
with transaction.atomic():
for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
for log_entry in track(
LogEntry.objects.all(),
disable=self.no_progress_bar,
):
model_class = log_entry.content_type.model_class()
# use global_objects for SoftDeleteModel
objects = (
@ -32,7 +35,7 @@ class Command(BaseCommand, ProgressBarMixin):
and not objects.filter(pk=log_entry.object_id).exists()
):
log_entry.delete()
tqdm.write(
self.stdout.write(
self.style.NOTICE(
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
),

View File

@ -5,7 +5,7 @@ from pathlib import Path
from typing import Final
from django.conf import settings
from tqdm import tqdm
from rich.progress import track
from documents.models import Document
@ -68,7 +68,9 @@ def check_sanity(*, progress=False) -> SanityCheckMessages:
if lockfile in present_files:
present_files.remove(lockfile)
for doc in tqdm(Document.global_objects.all(), disable=not progress):
qs = Document.global_objects.all()
for doc in track(qs, total=qs.count(), disable=not progress):
# Check sanity of the thumbnail
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
if not thumbnail_path.exists() or not thumbnail_path.is_file():

View File

@ -6,7 +6,6 @@ from datetime import timedelta
from pathlib import Path
from tempfile import TemporaryDirectory
import tqdm
from celery import Task
from celery import shared_task
from django.conf import settings
@ -16,6 +15,7 @@ from django.db import transaction
from django.db.models.signals import post_save
from django.utils import timezone
from filelock import FileLock
from rich.progress import track
from whoosh.writing import AsyncWriter
from documents import index
@ -69,7 +69,12 @@ def index_reindex(*, progress_bar_disable=False):
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
for document in track(
documents,
total=documents.count(),
description="Indexing...",
disable=progress_bar_disable,
):
index.update_document(writer, document)