Replaces tqdm with rich

This commit is contained in:
Trenton H
2026-02-09 15:52:52 -08:00
parent c4ed4e7f36
commit e0b45539a6
11 changed files with 368 additions and 205 deletions

View File

@@ -1,10 +1,14 @@
import logging
import multiprocessing
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
@@ -75,20 +79,24 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
try:
logging.getLogger().handlers[0].level = logging.ERROR
if self.process_count == 1:
for doc_id in document_ids:
update_document_content_maybe_archive_file(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(
update_document_content_maybe_archive_file,
document_ids,
),
total=len(document_ids),
disable=self.no_progress_bar,
),
)
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=self.no_progress_bar,
) as progress:
task = progress.add_task("Archiving documents", total=len(document_ids))
if self.process_count == 1:
for doc_id in document_ids:
update_document_content_maybe_archive_file(doc_id)
progress.update(task, advance=1)
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
for _ in pool.imap_unordered(
update_document_content_maybe_archive_file,
document_ids,
):
progress.update(task, advance=1)
except KeyboardInterrupt:
self.stdout.write(self.style.NOTICE("Aborting..."))

View File

@@ -6,7 +6,6 @@ import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
import tqdm
from allauth.mfa.models import Authenticator
from allauth.socialaccount.models import SocialAccount
from allauth.socialaccount.models import SocialApp
@@ -24,6 +23,11 @@ from django.utils import timezone
from filelock import FileLock
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
if TYPE_CHECKING:
from django.db.models import QuerySet
@@ -309,12 +313,19 @@ class Command(CryptMixin, BaseCommand):
document_manifest = manifest_dict["documents"]
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=self.no_progress_bar,
):
document = document_map[document_dict["pk"]]
) as progress:
task = progress.add_task(
"Exporting documents",
total=len(document_manifest),
)
for index, document_dict in enumerate(document_manifest):
document = document_map[document_dict["pk"]]
# 3.1. generate a unique filename
base_name = self.generate_base_name(document)
@@ -357,6 +368,7 @@ class Command(CryptMixin, BaseCommand):
content,
manifest_name,
)
progress.update(task, advance=1)
# These were exported already
if self.split_manifest:

View File

@@ -3,9 +3,13 @@ import multiprocessing
from typing import Final
import rapidfuzz
import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
@@ -106,19 +110,25 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
work_pkgs.append(_WorkPackage(first_doc, second_doc))
# Don't spin up a pool of 1 process
if self.process_count == 1:
results = []
for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
results.append(_process_and_match(work))
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
results = list(
tqdm.tqdm(
pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs),
disable=self.no_progress_bar,
),
)
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=self.no_progress_bar,
) as progress:
task = progress.add_task("Fuzzy matching documents", total=len(work_pkgs))
if self.process_count == 1:
results = []
for work in work_pkgs:
results.append(_process_and_match(work))
progress.update(task, advance=1)
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
results = []
for result in pool.imap_unordered(_process_and_match, work_pkgs):
results.append(result)
progress.update(task, advance=1)
# Check results
messages = []

View File

@@ -8,7 +8,6 @@ from pathlib import Path
from zipfile import ZipFile
from zipfile import is_zipfile
import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
@@ -23,6 +22,11 @@ from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.file_handling import create_source_path_directory
from documents.management.commands.mixins import CryptMixin
@@ -365,8 +369,19 @@ class Command(CryptMixin, BaseCommand):
filter(lambda r: r["model"] == "documents.document", self.manifest),
)
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
document = Document.objects.get(pk=record["pk"])
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=self.no_progress_bar,
) as progress:
task = progress.add_task(
"Importing documents",
total=len(manifest_documents),
)
for record in manifest_documents:
document = Document.objects.get(pk=record["pk"])
doc_file = record[EXPORTER_FILE_NAME]
document_path = self.source / doc_file
@@ -416,7 +431,8 @@ class Command(CryptMixin, BaseCommand):
# archived files
copy_file_with_basic_stats(archive_path, document.archive_path)
document.save()
document.save()
progress.update(task, advance=1)
def decrypt_secret_fields(self) -> None:
"""

View File

@@ -1,8 +1,12 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
@@ -18,8 +22,15 @@ class Command(ProgressBarMixin, BaseCommand):
self.handle_progress_bar_mixin(**options)
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(
Document.objects.all(),
documents = Document.objects.all()
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=self.no_progress_bar,
):
post_save.send(Document, instance=document, created=False)
) as progress:
task = progress.add_task("Renaming documents", total=documents.count())
for document in documents:
post_save.send(Document, instance=document, created=False)
progress.update(task, advance=1)

View File

@@ -1,7 +1,11 @@
import logging
import tqdm
from django.core.management.base import BaseCommand
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin
@@ -84,53 +88,62 @@ class Command(ProgressBarMixin, BaseCommand):
classifier = load_classifier()
for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
if options["correspondent"]:
set_correspondent(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=self.no_progress_bar,
) as progress:
task = progress.add_task("Retagging documents", total=documents.count())
for document in documents:
if options["correspondent"]:
set_correspondent(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["document_type"]:
set_document_type(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["document_type"]:
set_document_type(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["tags"]:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["storage_path"]:
set_storage_path(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["tags"]:
set_tags(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
if options["storage_path"]:
set_storage_path(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)
progress.update(task, advance=1)

View File

@@ -2,9 +2,13 @@ import logging
import multiprocessing
import shutil
import tqdm
from django import db
from django.core.management.base import BaseCommand
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
@@ -70,15 +74,19 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
# with postgres.
db.connections.close_all()
if self.process_count == 1:
for doc_id in ids:
_process_document(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=self.no_progress_bar,
),
)
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=self.no_progress_bar,
) as progress:
task = progress.add_task("Generating thumbnails", total=len(ids))
if self.process_count == 1:
for doc_id in ids:
_process_document(doc_id)
progress.update(task, advance=1)
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
for _ in pool.imap_unordered(_process_document, ids):
progress.update(task, advance=1)

View File

@@ -1,7 +1,12 @@
from auditlog.models import LogEntry
from django.core.management.base import BaseCommand
from django.db import transaction
from tqdm import tqdm
from rich.console import Console
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.management.commands.mixins import ProgressBarMixin
@@ -18,22 +23,37 @@ class Command(BaseCommand, ProgressBarMixin):
def handle(self, **options):
self.handle_progress_bar_mixin(**options)
console = Console()
with transaction.atomic():
for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
model_class = log_entry.content_type.model_class()
# use global_objects for SoftDeleteModel
objects = (
model_class.global_objects
if hasattr(model_class, "global_objects")
else model_class.objects
log_entries = LogEntry.objects.all()
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
console=console,
disable=self.no_progress_bar,
) as progress:
task = progress.add_task(
"Pruning audit logs",
total=log_entries.count(),
)
if (
log_entry.object_id
and not objects.filter(pk=log_entry.object_id).exists()
):
log_entry.delete()
tqdm.write(
self.style.NOTICE(
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
),
for log_entry in log_entries:
model_class = log_entry.content_type.model_class()
# use global_objects for SoftDeleteModel
objects = (
model_class.global_objects
if hasattr(model_class, "global_objects")
else model_class.objects
)
if (
log_entry.object_id
and not objects.filter(pk=log_entry.object_id).exists()
):
log_entry.delete()
console.print(
self.style.NOTICE(
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
),
)
progress.update(task, advance=1)

View File

@@ -8,7 +8,11 @@ from typing import Final
from celery import states
from django.conf import settings
from django.utils import timezone
from tqdm import tqdm
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.models import Document
from documents.models import PaperlessTask
@@ -92,76 +96,99 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
if logo_file in present_files:
present_files.remove(logo_file)
for doc in tqdm(Document.global_objects.all(), disable=not progress):
# Check sanity of the thumbnail
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
if not thumbnail_path.exists() or not thumbnail_path.is_file():
messages.error(doc.pk, "Thumbnail of document does not exist.")
else:
if thumbnail_path in present_files:
present_files.remove(thumbnail_path)
try:
_ = thumbnail_path.read_bytes()
except OSError as e:
messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
# Check sanity of the original file
# TODO: extract method
source_path: Final[Path] = Path(doc.source_path).resolve()
if not source_path.exists() or not source_path.is_file():
messages.error(doc.pk, "Original of document does not exist.")
else:
if source_path in present_files:
present_files.remove(source_path)
try:
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(doc.pk, f"Cannot read original file of document: {e}")
documents = Document.global_objects.all()
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=not progress,
) as progress_bar:
task = progress_bar.add_task(
"Checking document sanity",
total=documents.count(),
)
for doc in documents:
# Check sanity of the thumbnail
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
if not thumbnail_path.exists() or not thumbnail_path.is_file():
messages.error(doc.pk, "Thumbnail of document does not exist.")
else:
if checksum != doc.checksum:
messages.error(
doc.pk,
"Checksum mismatch. "
f"Stored: {doc.checksum}, actual: {checksum}.",
)
# Check sanity of the archive file.
if doc.archive_checksum is not None and doc.archive_filename is None:
messages.error(
doc.pk,
"Document has an archive file checksum, but no archive filename.",
)
elif doc.archive_checksum is None and doc.archive_filename is not None:
messages.error(
doc.pk,
"Document has an archive file, but its checksum is missing.",
)
elif doc.has_archive_version:
archive_path: Final[Path] = Path(doc.archive_path).resolve()
if not archive_path.exists() or not archive_path.is_file():
messages.error(doc.pk, "Archived version of document does not exist.")
else:
if archive_path in present_files:
present_files.remove(archive_path)
if thumbnail_path in present_files:
present_files.remove(thumbnail_path)
try:
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
_ = thumbnail_path.read_bytes()
except OSError as e:
messages.error(
doc.pk,
f"Cannot read archive file of document : {e}",
f"Cannot read thumbnail file of document: {e}",
)
# Check sanity of the original file
# TODO: extract method
source_path: Final[Path] = Path(doc.source_path).resolve()
if not source_path.exists() or not source_path.is_file():
messages.error(doc.pk, "Original of document does not exist.")
else:
if source_path in present_files:
present_files.remove(source_path)
try:
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(
doc.pk,
f"Cannot read original file of document: {e}",
)
else:
if checksum != doc.archive_checksum:
if checksum != doc.checksum:
messages.error(
doc.pk,
"Checksum mismatch of archived document. "
f"Stored: {doc.archive_checksum}, "
f"actual: {checksum}.",
"Checksum mismatch. "
f"Stored: {doc.checksum}, actual: {checksum}.",
)
# other document checks
if not doc.content:
messages.info(doc.pk, "Document contains no OCR data")
# Check sanity of the archive file.
if doc.archive_checksum is not None and doc.archive_filename is None:
messages.error(
doc.pk,
"Document has an archive file checksum, but no archive filename.",
)
elif doc.archive_checksum is None and doc.archive_filename is not None:
messages.error(
doc.pk,
"Document has an archive file, but its checksum is missing.",
)
elif doc.has_archive_version:
archive_path: Final[Path] = Path(doc.archive_path).resolve()
if not archive_path.exists() or not archive_path.is_file():
messages.error(
doc.pk,
"Archived version of document does not exist.",
)
else:
if archive_path in present_files:
present_files.remove(archive_path)
try:
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
except OSError as e:
messages.error(
doc.pk,
f"Cannot read archive file of document : {e}",
)
else:
if checksum != doc.archive_checksum:
messages.error(
doc.pk,
"Checksum mismatch of archived document. "
f"Stored: {doc.archive_checksum}, "
f"actual: {checksum}.",
)
# other document checks
if not doc.content:
messages.info(doc.pk, "Document contains no OCR data")
progress_bar.update(task, advance=1)
for extra_file in present_files:
messages.warning(None, f"Orphaned file in media dir: {extra_file}")

View File

@@ -8,7 +8,6 @@ from pathlib import Path
from tempfile import TemporaryDirectory
from tempfile import mkstemp
import tqdm
from celery import Task
from celery import shared_task
from celery import states
@@ -19,6 +18,11 @@ from django.db import transaction
from django.db.models.signals import post_save
from django.utils import timezone
from filelock import FileLock
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from whoosh.writing import AsyncWriter
from documents import index
@@ -83,9 +87,20 @@ def index_reindex(*, progress_bar_disable=False) -> None:
ix = index.open_index(recreate=True)
with AsyncWriter(ix) as writer:
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
with (
AsyncWriter(ix) as writer,
Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=progress_bar_disable,
) as progress,
):
task = progress.add_task("Reindexing documents", total=documents.count())
for document in documents:
index.update_document(writer, document)
progress.update(task, advance=1)
@shared_task

View File

@@ -5,7 +5,6 @@ from pathlib import Path
import faiss
import llama_index.core.settings as llama_settings
import tqdm
from celery import states
from django.conf import settings
from django.utils import timezone
@@ -22,6 +21,11 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.vector_stores.faiss import FaissVectorStore
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn
from documents.models import Document
from documents.models import PaperlessTask
@@ -176,9 +180,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
embed_model = get_embedding_model()
llama_settings.Settings.embed_model = embed_model
storage_context = get_or_create_storage_context(rebuild=True)
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
document_nodes = build_document_node(document)
nodes.extend(document_nodes)
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=progress_bar_disable,
) as progress:
task = progress.add_task("Building document nodes", total=documents.count())
for document in documents:
document_nodes = build_document_node(document)
nodes.extend(document_nodes)
progress.update(task, advance=1)
index = VectorStoreIndex(
nodes=nodes,
@@ -196,23 +209,33 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
for node in index.docstore.get_nodes(all_node_ids)
}
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
doc_id = str(document.id)
document_modified = document.modified.isoformat()
with Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TaskProgressColumn(),
TimeRemainingColumn(),
disable=progress_bar_disable,
) as progress:
task = progress.add_task("Updating index nodes", total=documents.count())
for document in documents:
doc_id = str(document.id)
document_modified = document.modified.isoformat()
if doc_id in existing_nodes:
node = existing_nodes[doc_id]
node_modified = node.metadata.get("modified")
if doc_id in existing_nodes:
node = existing_nodes[doc_id]
node_modified = node.metadata.get("modified")
if node_modified == document_modified:
continue
if node_modified == document_modified:
progress.update(task, advance=1)
continue
# Again, delete from docstore, FAISS IndexFlatL2 are append-only
index.docstore.delete_document(node.node_id)
nodes.extend(build_document_node(document))
else:
# New document, add it
nodes.extend(build_document_node(document))
# Again, delete from docstore, FAISS IndexFlatL2 are append-only
index.docstore.delete_document(node.node_id)
nodes.extend(build_document_node(document))
else:
# New document, add it
nodes.extend(build_document_node(document))
progress.update(task, advance=1)
if nodes:
msg = "LLM index updated successfully."