diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py
index 1aa52117a..f11dbb029 100644
--- a/src/documents/management/commands/document_archiver.py
+++ b/src/documents/management/commands/document_archiver.py
@@ -1,10 +1,14 @@
 import logging
 import multiprocessing

-import tqdm
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -75,20 +79,24 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         try:
             logging.getLogger().handlers[0].level = logging.ERROR

-            if self.process_count == 1:
-                for doc_id in document_ids:
-                    update_document_content_maybe_archive_file(doc_id)
-            else:  # pragma: no cover
-                with multiprocessing.Pool(self.process_count) as pool:
-                    list(
-                        tqdm.tqdm(
-                            pool.imap_unordered(
-                                update_document_content_maybe_archive_file,
-                                document_ids,
-                            ),
-                            total=len(document_ids),
-                            disable=self.no_progress_bar,
-                        ),
-                    )
+            with Progress(
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                TimeRemainingColumn(),
+                disable=self.no_progress_bar,
+            ) as progress:
+                task = progress.add_task("Archiving documents", total=len(document_ids))
+                if self.process_count == 1:
+                    for doc_id in document_ids:
+                        update_document_content_maybe_archive_file(doc_id)
+                        progress.update(task, advance=1)
+                else:  # pragma: no cover
+                    with multiprocessing.Pool(self.process_count) as pool:
+                        for _ in pool.imap_unordered(
+                            update_document_content_maybe_archive_file,
+                            document_ids,
+                        ):
+                            progress.update(task, advance=1)
         except KeyboardInterrupt:
             self.stdout.write(self.style.NOTICE("Aborting..."))
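Note on the repeated setup: every command in this diff rebuilds the same five-argument `Progress(...)` verbatim. Those four columns also happen to match rich's default layout (what `Progress.get_default_columns()` returns), so a bare `Progress(disable=...)` should render the same bar. If the explicit form is kept, a small shared helper would cut the duplication; a minimal sketch (the `make_progress` name and its location are hypothetical, not part of this diff):

```python
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn


def make_progress(*, disable: bool = False) -> Progress:
    """Build the description/bar/percentage/ETA progress bar used by the commands."""
    return Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TimeRemainingColumn(),
        disable=disable,
    )
```

Each command body would then reduce to `with make_progress(disable=self.no_progress_bar) as progress:`.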
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index bd962efc4..dd024c7c0 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -6,7 +6,6 @@ import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING

-import tqdm
 from allauth.mfa.models import Authenticator
 from allauth.socialaccount.models import SocialAccount
 from allauth.socialaccount.models import SocialApp
@@ -24,6 +23,11 @@ from django.utils import timezone
 from filelock import FileLock
 from guardian.models import GroupObjectPermission
 from guardian.models import UserObjectPermission
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -309,12 +313,19 @@
         document_manifest = manifest_dict["documents"]

         # 3. Export files from each document
-        for index, document_dict in tqdm.tqdm(
-            enumerate(document_manifest),
-            total=len(document_manifest),
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
             disable=self.no_progress_bar,
-        ):
-            document = document_map[document_dict["pk"]]
+        ) as progress:
+            task = progress.add_task(
+                "Exporting documents",
+                total=len(document_manifest),
+            )
+            for index, document_dict in enumerate(document_manifest):
+                document = document_map[document_dict["pk"]]

                 # 3.1. generate a unique filename
                 base_name = self.generate_base_name(document)
@@ -357,6 +368,7 @@ class Command(CryptMixin, BaseCommand):
                     content,
                     manifest_name,
                 )
+                progress.update(task, advance=1)

         # These were exported already
         if self.split_manifest:
diff --git a/src/documents/management/commands/document_fuzzy_match.py b/src/documents/management/commands/document_fuzzy_match.py
index 4ecdf6d01..285347828 100644
--- a/src/documents/management/commands/document_fuzzy_match.py
+++ b/src/documents/management/commands/document_fuzzy_match.py
@@ -3,9 +3,13 @@ import multiprocessing
 from typing import Final

 import rapidfuzz
-import tqdm
 from django.core.management import BaseCommand
 from django.core.management import CommandError
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -106,19 +110,25 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
                 work_pkgs.append(_WorkPackage(first_doc, second_doc))

         # Don't spin up a pool of 1 process
-        if self.process_count == 1:
-            results = []
-            for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
-                results.append(_process_and_match(work))
-        else:  # pragma: no cover
-            with multiprocessing.Pool(processes=self.process_count) as pool:
-                results = list(
-                    tqdm.tqdm(
-                        pool.imap_unordered(_process_and_match, work_pkgs),
-                        total=len(work_pkgs),
-                        disable=self.no_progress_bar,
-                    ),
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Fuzzy matching documents", total=len(work_pkgs))
+            if self.process_count == 1:
+                results = []
+                for work in work_pkgs:
+                    results.append(_process_and_match(work))
+                    progress.update(task, advance=1)
+            else:  # pragma: no cover
+                with multiprocessing.Pool(processes=self.process_count) as pool:
+                    results = []
+                    for result in pool.imap_unordered(_process_and_match, work_pkgs):
+                        results.append(result)
+                        progress.update(task, advance=1)

         # Check results
         messages = []
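The single- and multi-process branches above share one shape: tqdm used to wrap `pool.imap_unordered(...)` directly, whereas the rich task is advanced manually once per completed work item, which also behaves correctly when results arrive out of submission order. A consolidated sketch of that pattern, assuming a picklable `handle` function and a `work` list (both names are illustrative, not from the diff):

```python
import multiprocessing

from rich.progress import Progress


def run_all(handle, work, *, process_count: int, no_progress_bar: bool) -> list:
    """Run handle() over work, advancing one shared progress task per result."""
    results = []
    with Progress(disable=no_progress_bar) as progress:
        task = progress.add_task("Processing", total=len(work))
        if process_count == 1:
            for item in work:
                results.append(handle(item))
                progress.update(task, advance=1)
        else:
            with multiprocessing.Pool(processes=process_count) as pool:
                # imap_unordered yields each result as soon as a worker
                # finishes, so the bar advances in completion order.
                for result in pool.imap_unordered(handle, work):
                    results.append(result)
                    progress.update(task, advance=1)
    return results
```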
diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py
index 5cd743590..d66f744f6 100644
--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@@ -8,7 +8,6 @@ from pathlib import Path
 from zipfile import ZipFile
 from zipfile import is_zipfile

-import tqdm
 from django.conf import settings
 from django.contrib.auth.models import Permission
 from django.contrib.auth.models import User
@@ -23,6 +22,11 @@ from django.db import transaction
 from django.db.models.signals import m2m_changed
 from django.db.models.signals import post_save
 from filelock import FileLock
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.file_handling import create_source_path_directory
 from documents.management.commands.mixins import CryptMixin
@@ -365,8 +369,19 @@ class Command(CryptMixin, BaseCommand):
            filter(lambda r: r["model"] == "documents.document", self.manifest),
        )

-        for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
-            document = Document.objects.get(pk=record["pk"])
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task(
+                "Importing documents",
+                total=len(manifest_documents),
+            )
+            for record in manifest_documents:
+                document = Document.objects.get(pk=record["pk"])

                 doc_file = record[EXPORTER_FILE_NAME]
                 document_path = self.source / doc_file
@@ -416,7 +431,8 @@
                 # archived files
                 copy_file_with_basic_stats(archive_path, document.archive_path)

-            document.save()
+                document.save()
+                progress.update(task, advance=1)

     def decrypt_secret_fields(self) -> None:
         """
diff --git a/src/documents/management/commands/document_renamer.py b/src/documents/management/commands/document_renamer.py
index 2dfca217e..7495b6203 100644
--- a/src/documents/management/commands/document_renamer.py
+++ b/src/documents/management/commands/document_renamer.py
@@ -1,8 +1,12 @@
 import logging

-import tqdm
 from django.core.management.base import BaseCommand
 from django.db.models.signals import post_save
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import ProgressBarMixin
 from documents.models import Document
@@ -18,8 +22,15 @@ class Command(ProgressBarMixin, BaseCommand):
         self.handle_progress_bar_mixin(**options)
         logging.getLogger().handlers[0].level = logging.ERROR

-        for document in tqdm.tqdm(
-            Document.objects.all(),
+        documents = Document.objects.all()
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
             disable=self.no_progress_bar,
-        ):
-            post_save.send(Document, instance=document, created=False)
+        ) as progress:
+            task = progress.add_task("Renaming documents", total=documents.count())
+            for document in documents:
+                post_save.send(Document, instance=document, created=False)
+                progress.update(task, advance=1)
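For a plain loop like the renamer's, `rich.progress.track` is the closest one-line replacement for `tqdm.tqdm`; the diff presumably keeps the explicit `Progress` form so every command shares one column layout. A sketch of the shorter alternative (the `rename_all` wrapper is added here only to make the snippet self-contained):

```python
from django.db.models.signals import post_save
from rich.progress import track

from documents.models import Document


def rename_all(*, no_progress_bar: bool = False) -> None:
    documents = Document.objects.all()
    # total=documents.count() costs one extra COUNT(*) query, but avoids
    # track() falling back to len(), which would force the whole QuerySet
    # into memory just to size the bar.
    for document in track(
        documents,
        description="Renaming documents",
        total=documents.count(),
        disable=no_progress_bar,
    ):
        post_save.send(Document, instance=document, created=False)
```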
diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py
index 10bb54b71..88615a481 100644
--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -1,7 +1,11 @@
 import logging

-import tqdm
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.classifier import load_classifier
 from documents.management.commands.mixins import ProgressBarMixin
@@ -84,53 +88,62 @@ class Command(ProgressBarMixin, BaseCommand):

         classifier = load_classifier()

-        for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
-            if options["correspondent"]:
-                set_correspondent(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Retagging documents", total=documents.count())
+            for document in documents:
+                if options["correspondent"]:
+                    set_correspondent(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )

-            if options["document_type"]:
-                set_document_type(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
+                if options["document_type"]:
+                    set_document_type(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )

-            if options["tags"]:
-                set_tags(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
-            if options["storage_path"]:
-                set_storage_path(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
+                if options["tags"]:
+                    set_tags(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+                if options["storage_path"]:
+                    set_storage_path(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+                progress.update(task, advance=1)
diff --git a/src/documents/management/commands/document_thumbnails.py b/src/documents/management/commands/document_thumbnails.py
index e50c837d3..fefcd61e0 100644
--- a/src/documents/management/commands/document_thumbnails.py
+++ b/src/documents/management/commands/document_thumbnails.py
@@ -2,9 +2,13 @@ import logging
 import multiprocessing
 import shutil

-import tqdm
 from django import db
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -70,15 +74,19 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         # with postgres.
         db.connections.close_all()

-        if self.process_count == 1:
-            for doc_id in ids:
-                _process_document(doc_id)
-        else:  # pragma: no cover
-            with multiprocessing.Pool(processes=self.process_count) as pool:
-                list(
-                    tqdm.tqdm(
-                        pool.imap_unordered(_process_document, ids),
-                        total=len(ids),
-                        disable=self.no_progress_bar,
-                    ),
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Generating thumbnails", total=len(ids))
+            if self.process_count == 1:
+                for doc_id in ids:
+                    _process_document(doc_id)
+                    progress.update(task, advance=1)
+            else:  # pragma: no cover
+                with multiprocessing.Pool(processes=self.process_count) as pool:
+                    for _ in pool.imap_unordered(_process_document, ids):
+                        progress.update(task, advance=1)
diff --git a/src/documents/management/commands/prune_audit_logs.py b/src/documents/management/commands/prune_audit_logs.py
index b49f4afc2..206c1f1d2 100644
--- a/src/documents/management/commands/prune_audit_logs.py
+++ b/src/documents/management/commands/prune_audit_logs.py
@@ -1,7 +1,12 @@
 from auditlog.models import LogEntry
 from django.core.management.base import BaseCommand
 from django.db import transaction
-from tqdm import tqdm
+from rich.console import Console
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import ProgressBarMixin
@@ -18,22 +23,37 @@ class Command(BaseCommand, ProgressBarMixin):
     def handle(self, **options):
         self.handle_progress_bar_mixin(**options)
+        console = Console()
         with transaction.atomic():
-            for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
-                model_class = log_entry.content_type.model_class()
-                # use global_objects for SoftDeleteModel
-                objects = (
-                    model_class.global_objects
-                    if hasattr(model_class, "global_objects")
-                    else model_class.objects
-                )
-                if (
-                    log_entry.object_id
-                    and not objects.filter(pk=log_entry.object_id).exists()
-                ):
-                    log_entry.delete()
-                    tqdm.write(
-                        self.style.NOTICE(
-                            f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
-                        ),
-                    )
+            log_entries = LogEntry.objects.all()
+            with Progress(
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                TimeRemainingColumn(),
+                console=console,
+                disable=self.no_progress_bar,
+            ) as progress:
+                task = progress.add_task(
+                    "Pruning audit logs",
+                    total=log_entries.count(),
+                )
+                for log_entry in log_entries:
+                    model_class = log_entry.content_type.model_class()
+                    # use global_objects for SoftDeleteModel
+                    objects = (
+                        model_class.global_objects
+                        if hasattr(model_class, "global_objects")
+                        else model_class.objects
+                    )
+                    if (
+                        log_entry.object_id
+                        and not objects.filter(pk=log_entry.object_id).exists()
+                    ):
+                        log_entry.delete()
+                        console.print(
+                            self.style.NOTICE(
+                                f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
+                            ),
+                        )
+                    progress.update(task, advance=1)
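One behavioral detail in the prune command: `tqdm.write()` existed so a message could be printed without tearing the in-place bar. The rich equivalent is to print through the same `Console` that drives the live display, which is why the command now builds an explicit `console` and hands it to `Progress(console=...)`; `progress.console.print(...)` would reach the same object. Also note the `progress.update` is placed at loop level, so the bar advances for every inspected entry, not only deleted ones. A minimal standalone sketch of the console interaction:

```python
from rich.console import Console
from rich.progress import Progress

console = Console()
with Progress(console=console) as progress:
    task = progress.add_task("Pruning", total=3)
    for entry in ("a", "b", "c"):
        # Printed through the shared console, the message is rendered
        # above the live bar instead of corrupting it.
        console.print(f"Deleted audit log entry {entry!r}")
        progress.update(task, advance=1)
```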
" + f"Stored: {doc.archive_checksum}, " + f"actual: {checksum}.", + ) + + # other document checks + if not doc.content: + messages.info(doc.pk, "Document contains no OCR data") + + progress_bar.update(task, advance=1) for extra_file in present_files: messages.warning(None, f"Orphaned file in media dir: {extra_file}") diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 1e8b35891..43ddadf06 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -8,7 +8,6 @@ from pathlib import Path from tempfile import TemporaryDirectory from tempfile import mkstemp -import tqdm from celery import Task from celery import shared_task from celery import states @@ -19,6 +18,11 @@ from django.db import transaction from django.db.models.signals import post_save from django.utils import timezone from filelock import FileLock +from rich.progress import BarColumn +from rich.progress import Progress +from rich.progress import TaskProgressColumn +from rich.progress import TextColumn +from rich.progress import TimeRemainingColumn from whoosh.writing import AsyncWriter from documents import index @@ -83,9 +87,20 @@ def index_reindex(*, progress_bar_disable=False) -> None: ix = index.open_index(recreate=True) - with AsyncWriter(ix) as writer: - for document in tqdm.tqdm(documents, disable=progress_bar_disable): + with ( + AsyncWriter(ix) as writer, + Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeRemainingColumn(), + disable=progress_bar_disable, + ) as progress, + ): + task = progress.add_task("Reindexing documents", total=documents.count()) + for document in documents: index.update_document(writer, document) + progress.update(task, advance=1) @shared_task diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py index 654c56f3b..dfe106adb 100644 --- a/src/paperless_ai/indexing.py +++ b/src/paperless_ai/indexing.py @@ -5,7 +5,6 @@ from pathlib import Path import faiss import llama_index.core.settings as llama_settings -import tqdm from celery import states from django.conf import settings from django.utils import timezone @@ -22,6 +21,11 @@ from llama_index.core.storage.docstore import SimpleDocumentStore from llama_index.core.storage.index_store import SimpleIndexStore from llama_index.core.text_splitter import TokenTextSplitter from llama_index.vector_stores.faiss import FaissVectorStore +from rich.progress import BarColumn +from rich.progress import Progress +from rich.progress import TaskProgressColumn +from rich.progress import TextColumn +from rich.progress import TimeRemainingColumn from documents.models import Document from documents.models import PaperlessTask @@ -176,9 +180,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str: embed_model = get_embedding_model() llama_settings.Settings.embed_model = embed_model storage_context = get_or_create_storage_context(rebuild=True) - for document in tqdm.tqdm(documents, disable=progress_bar_disable): - document_nodes = build_document_node(document) - nodes.extend(document_nodes) + with Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeRemainingColumn(), + disable=progress_bar_disable, + ) as progress: + task = progress.add_task("Building document nodes", total=documents.count()) + for document in documents: + document_nodes = build_document_node(document) + nodes.extend(document_nodes) + progress.update(task, advance=1) index = VectorStoreIndex( nodes=nodes, @@ -196,23 
diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py
index 654c56f3b..dfe106adb 100644
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -5,7 +5,6 @@ from pathlib import Path

 import faiss
 import llama_index.core.settings as llama_settings
-import tqdm
 from celery import states
 from django.conf import settings
 from django.utils import timezone
@@ -22,6 +21,11 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
 from llama_index.core.storage.index_store import SimpleIndexStore
 from llama_index.core.text_splitter import TokenTextSplitter
 from llama_index.vector_stores.faiss import FaissVectorStore
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.models import Document
 from documents.models import PaperlessTask
@@ -176,9 +180,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
         embed_model = get_embedding_model()
         llama_settings.Settings.embed_model = embed_model
         storage_context = get_or_create_storage_context(rebuild=True)
-        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
-            document_nodes = build_document_node(document)
-            nodes.extend(document_nodes)
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=progress_bar_disable,
+        ) as progress:
+            task = progress.add_task("Building document nodes", total=documents.count())
+            for document in documents:
+                document_nodes = build_document_node(document)
+                nodes.extend(document_nodes)
+                progress.update(task, advance=1)

         index = VectorStoreIndex(
             nodes=nodes,
@@ -196,23 +209,33 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
         for node in index.docstore.get_nodes(all_node_ids)
     }

-    for document in tqdm.tqdm(documents, disable=progress_bar_disable):
-        doc_id = str(document.id)
-        document_modified = document.modified.isoformat()
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+        disable=progress_bar_disable,
+    ) as progress:
+        task = progress.add_task("Updating index nodes", total=documents.count())
+        for document in documents:
+            doc_id = str(document.id)
+            document_modified = document.modified.isoformat()

-        if doc_id in existing_nodes:
-            node = existing_nodes[doc_id]
-            node_modified = node.metadata.get("modified")
+            if doc_id in existing_nodes:
+                node = existing_nodes[doc_id]
+                node_modified = node.metadata.get("modified")

-            if node_modified == document_modified:
-                continue
+                if node_modified == document_modified:
+                    progress.update(task, advance=1)
+                    continue

-            # Again, delete from docstore, FAISS IndexFlatL2 are append-only
-            index.docstore.delete_document(node.node_id)
-            nodes.extend(build_document_node(document))
-        else:
-            # New document, add it
-            nodes.extend(build_document_node(document))
+                # Again, delete from docstore, FAISS IndexFlatL2 are append-only
+                index.docstore.delete_document(node.node_id)
+                nodes.extend(build_document_node(document))
+            else:
+                # New document, add it
+                nodes.extend(build_document_node(document))
+            progress.update(task, advance=1)

     if nodes:
         msg = "LLM index updated successfully."
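A subtlety the node-update loop above has to handle: tqdm advanced the bar on every iteration automatically, but an explicitly managed task advances only where `progress.update` is called, so each early exit from the loop body needs its own update (hence the extra call before `continue`). Wrapping the iterable with `Progress.track()` keeps a single update path instead; an illustrative sketch:

```python
from rich.progress import Progress

items = [1, 2, 3, 4]
with Progress() as progress:
    # progress.track() advances the task once per yielded item, so a
    # `continue` in the body cannot skip the bookkeeping.
    for item in progress.track(items, description="Updating index nodes"):
        if item % 2 == 0:
            continue  # the bar still advances for this item
        print(f"processing {item}")
```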