mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-02-09 23:49:29 -06:00
Replaces tqdm with rich
This commit is contained in:
@@ -1,10 +1,14 @@
|
||||
import logging
|
||||
import multiprocessing
|
||||
|
||||
import tqdm
|
||||
from django import db
|
||||
from django.conf import settings
|
||||
from django.core.management.base import BaseCommand
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.management.commands.mixins import MultiProcessMixin
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
@@ -75,20 +79,24 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
||||
try:
|
||||
logging.getLogger().handlers[0].level = logging.ERROR
|
||||
|
||||
if self.process_count == 1:
|
||||
for doc_id in document_ids:
|
||||
update_document_content_maybe_archive_file(doc_id)
|
||||
else: # pragma: no cover
|
||||
with multiprocessing.Pool(self.process_count) as pool:
|
||||
list(
|
||||
tqdm.tqdm(
|
||||
pool.imap_unordered(
|
||||
update_document_content_maybe_archive_file,
|
||||
document_ids,
|
||||
),
|
||||
total=len(document_ids),
|
||||
disable=self.no_progress_bar,
|
||||
),
|
||||
)
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=self.no_progress_bar,
|
||||
) as progress:
|
||||
task = progress.add_task("Archiving documents", total=len(document_ids))
|
||||
if self.process_count == 1:
|
||||
for doc_id in document_ids:
|
||||
update_document_content_maybe_archive_file(doc_id)
|
||||
progress.update(task, advance=1)
|
||||
else: # pragma: no cover
|
||||
with multiprocessing.Pool(self.process_count) as pool:
|
||||
for _ in pool.imap_unordered(
|
||||
update_document_content_maybe_archive_file,
|
||||
document_ids,
|
||||
):
|
||||
progress.update(task, advance=1)
|
||||
except KeyboardInterrupt:
|
||||
self.stdout.write(self.style.NOTICE("Aborting..."))
|
||||
|
||||
@@ -6,7 +6,6 @@ import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import tqdm
|
||||
from allauth.mfa.models import Authenticator
|
||||
from allauth.socialaccount.models import SocialAccount
|
||||
from allauth.socialaccount.models import SocialApp
|
||||
@@ -24,6 +23,11 @@ from django.utils import timezone
|
||||
from filelock import FileLock
|
||||
from guardian.models import GroupObjectPermission
|
||||
from guardian.models import UserObjectPermission
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.db.models import QuerySet
|
||||
@@ -309,12 +313,19 @@ class Command(CryptMixin, BaseCommand):
|
||||
document_manifest = manifest_dict["documents"]
|
||||
|
||||
# 3. Export files from each document
|
||||
for index, document_dict in tqdm.tqdm(
|
||||
enumerate(document_manifest),
|
||||
total=len(document_manifest),
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=self.no_progress_bar,
|
||||
):
|
||||
document = document_map[document_dict["pk"]]
|
||||
) as progress:
|
||||
task = progress.add_task(
|
||||
"Exporting documents",
|
||||
total=len(document_manifest),
|
||||
)
|
||||
for index, document_dict in enumerate(document_manifest):
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
# 3.1. generate a unique filename
|
||||
base_name = self.generate_base_name(document)
|
||||
@@ -357,6 +368,7 @@ class Command(CryptMixin, BaseCommand):
|
||||
content,
|
||||
manifest_name,
|
||||
)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
# These were exported already
|
||||
if self.split_manifest:
|
||||
|
||||
@@ -3,9 +3,13 @@ import multiprocessing
|
||||
from typing import Final
|
||||
|
||||
import rapidfuzz
|
||||
import tqdm
|
||||
from django.core.management import BaseCommand
|
||||
from django.core.management import CommandError
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.management.commands.mixins import MultiProcessMixin
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
@@ -106,19 +110,25 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
||||
work_pkgs.append(_WorkPackage(first_doc, second_doc))
|
||||
|
||||
# Don't spin up a pool of 1 process
|
||||
if self.process_count == 1:
|
||||
results = []
|
||||
for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
|
||||
results.append(_process_and_match(work))
|
||||
else: # pragma: no cover
|
||||
with multiprocessing.Pool(processes=self.process_count) as pool:
|
||||
results = list(
|
||||
tqdm.tqdm(
|
||||
pool.imap_unordered(_process_and_match, work_pkgs),
|
||||
total=len(work_pkgs),
|
||||
disable=self.no_progress_bar,
|
||||
),
|
||||
)
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=self.no_progress_bar,
|
||||
) as progress:
|
||||
task = progress.add_task("Fuzzy matching documents", total=len(work_pkgs))
|
||||
if self.process_count == 1:
|
||||
results = []
|
||||
for work in work_pkgs:
|
||||
results.append(_process_and_match(work))
|
||||
progress.update(task, advance=1)
|
||||
else: # pragma: no cover
|
||||
with multiprocessing.Pool(processes=self.process_count) as pool:
|
||||
results = []
|
||||
for result in pool.imap_unordered(_process_and_match, work_pkgs):
|
||||
results.append(result)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
# Check results
|
||||
messages = []
|
||||
|
||||
@@ -8,7 +8,6 @@ from pathlib import Path
|
||||
from zipfile import ZipFile
|
||||
from zipfile import is_zipfile
|
||||
|
||||
import tqdm
|
||||
from django.conf import settings
|
||||
from django.contrib.auth.models import Permission
|
||||
from django.contrib.auth.models import User
|
||||
@@ -23,6 +22,11 @@ from django.db import transaction
|
||||
from django.db.models.signals import m2m_changed
|
||||
from django.db.models.signals import post_save
|
||||
from filelock import FileLock
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.file_handling import create_source_path_directory
|
||||
from documents.management.commands.mixins import CryptMixin
|
||||
@@ -365,8 +369,19 @@ class Command(CryptMixin, BaseCommand):
|
||||
filter(lambda r: r["model"] == "documents.document", self.manifest),
|
||||
)
|
||||
|
||||
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
|
||||
document = Document.objects.get(pk=record["pk"])
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=self.no_progress_bar,
|
||||
) as progress:
|
||||
task = progress.add_task(
|
||||
"Importing documents",
|
||||
total=len(manifest_documents),
|
||||
)
|
||||
for record in manifest_documents:
|
||||
document = Document.objects.get(pk=record["pk"])
|
||||
|
||||
doc_file = record[EXPORTER_FILE_NAME]
|
||||
document_path = self.source / doc_file
|
||||
@@ -416,7 +431,8 @@ class Command(CryptMixin, BaseCommand):
|
||||
# archived files
|
||||
copy_file_with_basic_stats(archive_path, document.archive_path)
|
||||
|
||||
document.save()
|
||||
document.save()
|
||||
progress.update(task, advance=1)
|
||||
|
||||
def decrypt_secret_fields(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
import logging
|
||||
|
||||
import tqdm
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db.models.signals import post_save
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
from documents.models import Document
|
||||
@@ -18,8 +22,15 @@ class Command(ProgressBarMixin, BaseCommand):
|
||||
self.handle_progress_bar_mixin(**options)
|
||||
logging.getLogger().handlers[0].level = logging.ERROR
|
||||
|
||||
for document in tqdm.tqdm(
|
||||
Document.objects.all(),
|
||||
documents = Document.objects.all()
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=self.no_progress_bar,
|
||||
):
|
||||
post_save.send(Document, instance=document, created=False)
|
||||
) as progress:
|
||||
task = progress.add_task("Renaming documents", total=documents.count())
|
||||
for document in documents:
|
||||
post_save.send(Document, instance=document, created=False)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
import logging
|
||||
|
||||
import tqdm
|
||||
from django.core.management.base import BaseCommand
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.classifier import load_classifier
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
@@ -84,53 +88,62 @@ class Command(ProgressBarMixin, BaseCommand):
|
||||
|
||||
classifier = load_classifier()
|
||||
|
||||
for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
|
||||
if options["correspondent"]:
|
||||
set_correspondent(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=self.no_progress_bar,
|
||||
) as progress:
|
||||
task = progress.add_task("Retagging documents", total=documents.count())
|
||||
for document in documents:
|
||||
if options["correspondent"]:
|
||||
set_correspondent(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
|
||||
if options["document_type"]:
|
||||
set_document_type(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
if options["document_type"]:
|
||||
set_document_type(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
|
||||
if options["tags"]:
|
||||
set_tags(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
if options["storage_path"]:
|
||||
set_storage_path(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
if options["tags"]:
|
||||
set_tags(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
if options["storage_path"]:
|
||||
set_storage_path(
|
||||
sender=None,
|
||||
document=document,
|
||||
classifier=classifier,
|
||||
replace=options["overwrite"],
|
||||
use_first=options["use_first"],
|
||||
suggest=options["suggest"],
|
||||
base_url=options["base_url"],
|
||||
stdout=self.stdout,
|
||||
style_func=self.style,
|
||||
)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
@@ -2,9 +2,13 @@ import logging
|
||||
import multiprocessing
|
||||
import shutil
|
||||
|
||||
import tqdm
|
||||
from django import db
|
||||
from django.core.management.base import BaseCommand
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.management.commands.mixins import MultiProcessMixin
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
@@ -70,15 +74,19 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
||||
# with postgres.
|
||||
db.connections.close_all()
|
||||
|
||||
if self.process_count == 1:
|
||||
for doc_id in ids:
|
||||
_process_document(doc_id)
|
||||
else: # pragma: no cover
|
||||
with multiprocessing.Pool(processes=self.process_count) as pool:
|
||||
list(
|
||||
tqdm.tqdm(
|
||||
pool.imap_unordered(_process_document, ids),
|
||||
total=len(ids),
|
||||
disable=self.no_progress_bar,
|
||||
),
|
||||
)
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=self.no_progress_bar,
|
||||
) as progress:
|
||||
task = progress.add_task("Generating thumbnails", total=len(ids))
|
||||
if self.process_count == 1:
|
||||
for doc_id in ids:
|
||||
_process_document(doc_id)
|
||||
progress.update(task, advance=1)
|
||||
else: # pragma: no cover
|
||||
with multiprocessing.Pool(processes=self.process_count) as pool:
|
||||
for _ in pool.imap_unordered(_process_document, ids):
|
||||
progress.update(task, advance=1)
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
from auditlog.models import LogEntry
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db import transaction
|
||||
from tqdm import tqdm
|
||||
from rich.console import Console
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.management.commands.mixins import ProgressBarMixin
|
||||
|
||||
@@ -18,22 +23,37 @@ class Command(BaseCommand, ProgressBarMixin):
|
||||
|
||||
def handle(self, **options):
|
||||
self.handle_progress_bar_mixin(**options)
|
||||
console = Console()
|
||||
with transaction.atomic():
|
||||
for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
|
||||
model_class = log_entry.content_type.model_class()
|
||||
# use global_objects for SoftDeleteModel
|
||||
objects = (
|
||||
model_class.global_objects
|
||||
if hasattr(model_class, "global_objects")
|
||||
else model_class.objects
|
||||
log_entries = LogEntry.objects.all()
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
console=console,
|
||||
disable=self.no_progress_bar,
|
||||
) as progress:
|
||||
task = progress.add_task(
|
||||
"Pruning audit logs",
|
||||
total=log_entries.count(),
|
||||
)
|
||||
if (
|
||||
log_entry.object_id
|
||||
and not objects.filter(pk=log_entry.object_id).exists()
|
||||
):
|
||||
log_entry.delete()
|
||||
tqdm.write(
|
||||
self.style.NOTICE(
|
||||
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
|
||||
),
|
||||
for log_entry in log_entries:
|
||||
model_class = log_entry.content_type.model_class()
|
||||
# use global_objects for SoftDeleteModel
|
||||
objects = (
|
||||
model_class.global_objects
|
||||
if hasattr(model_class, "global_objects")
|
||||
else model_class.objects
|
||||
)
|
||||
if (
|
||||
log_entry.object_id
|
||||
and not objects.filter(pk=log_entry.object_id).exists()
|
||||
):
|
||||
log_entry.delete()
|
||||
console.print(
|
||||
self.style.NOTICE(
|
||||
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
|
||||
),
|
||||
)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
@@ -8,7 +8,11 @@ from typing import Final
|
||||
from celery import states
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from tqdm import tqdm
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
@@ -92,76 +96,99 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
|
||||
if logo_file in present_files:
|
||||
present_files.remove(logo_file)
|
||||
|
||||
for doc in tqdm(Document.global_objects.all(), disable=not progress):
|
||||
# Check sanity of the thumbnail
|
||||
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
|
||||
if not thumbnail_path.exists() or not thumbnail_path.is_file():
|
||||
messages.error(doc.pk, "Thumbnail of document does not exist.")
|
||||
else:
|
||||
if thumbnail_path in present_files:
|
||||
present_files.remove(thumbnail_path)
|
||||
try:
|
||||
_ = thumbnail_path.read_bytes()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
|
||||
|
||||
# Check sanity of the original file
|
||||
# TODO: extract method
|
||||
source_path: Final[Path] = Path(doc.source_path).resolve()
|
||||
if not source_path.exists() or not source_path.is_file():
|
||||
messages.error(doc.pk, "Original of document does not exist.")
|
||||
else:
|
||||
if source_path in present_files:
|
||||
present_files.remove(source_path)
|
||||
try:
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
||||
documents = Document.global_objects.all()
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=not progress,
|
||||
) as progress_bar:
|
||||
task = progress_bar.add_task(
|
||||
"Checking document sanity",
|
||||
total=documents.count(),
|
||||
)
|
||||
for doc in documents:
|
||||
# Check sanity of the thumbnail
|
||||
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
|
||||
if not thumbnail_path.exists() or not thumbnail_path.is_file():
|
||||
messages.error(doc.pk, "Thumbnail of document does not exist.")
|
||||
else:
|
||||
if checksum != doc.checksum:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Checksum mismatch. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}.",
|
||||
)
|
||||
|
||||
# Check sanity of the archive file.
|
||||
if doc.archive_checksum is not None and doc.archive_filename is None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file checksum, but no archive filename.",
|
||||
)
|
||||
elif doc.archive_checksum is None and doc.archive_filename is not None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file, but its checksum is missing.",
|
||||
)
|
||||
elif doc.has_archive_version:
|
||||
archive_path: Final[Path] = Path(doc.archive_path).resolve()
|
||||
if not archive_path.exists() or not archive_path.is_file():
|
||||
messages.error(doc.pk, "Archived version of document does not exist.")
|
||||
else:
|
||||
if archive_path in present_files:
|
||||
present_files.remove(archive_path)
|
||||
if thumbnail_path in present_files:
|
||||
present_files.remove(thumbnail_path)
|
||||
try:
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
_ = thumbnail_path.read_bytes()
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
f"Cannot read archive file of document : {e}",
|
||||
f"Cannot read thumbnail file of document: {e}",
|
||||
)
|
||||
|
||||
# Check sanity of the original file
|
||||
# TODO: extract method
|
||||
source_path: Final[Path] = Path(doc.source_path).resolve()
|
||||
if not source_path.exists() or not source_path.is_file():
|
||||
messages.error(doc.pk, "Original of document does not exist.")
|
||||
else:
|
||||
if source_path in present_files:
|
||||
present_files.remove(source_path)
|
||||
try:
|
||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
f"Cannot read original file of document: {e}",
|
||||
)
|
||||
else:
|
||||
if checksum != doc.archive_checksum:
|
||||
if checksum != doc.checksum:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Checksum mismatch of archived document. "
|
||||
f"Stored: {doc.archive_checksum}, "
|
||||
f"actual: {checksum}.",
|
||||
"Checksum mismatch. "
|
||||
f"Stored: {doc.checksum}, actual: {checksum}.",
|
||||
)
|
||||
|
||||
# other document checks
|
||||
if not doc.content:
|
||||
messages.info(doc.pk, "Document contains no OCR data")
|
||||
# Check sanity of the archive file.
|
||||
if doc.archive_checksum is not None and doc.archive_filename is None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file checksum, but no archive filename.",
|
||||
)
|
||||
elif doc.archive_checksum is None and doc.archive_filename is not None:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Document has an archive file, but its checksum is missing.",
|
||||
)
|
||||
elif doc.has_archive_version:
|
||||
archive_path: Final[Path] = Path(doc.archive_path).resolve()
|
||||
if not archive_path.exists() or not archive_path.is_file():
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Archived version of document does not exist.",
|
||||
)
|
||||
else:
|
||||
if archive_path in present_files:
|
||||
present_files.remove(archive_path)
|
||||
try:
|
||||
checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
|
||||
except OSError as e:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
f"Cannot read archive file of document : {e}",
|
||||
)
|
||||
else:
|
||||
if checksum != doc.archive_checksum:
|
||||
messages.error(
|
||||
doc.pk,
|
||||
"Checksum mismatch of archived document. "
|
||||
f"Stored: {doc.archive_checksum}, "
|
||||
f"actual: {checksum}.",
|
||||
)
|
||||
|
||||
# other document checks
|
||||
if not doc.content:
|
||||
messages.info(doc.pk, "Document contains no OCR data")
|
||||
|
||||
progress_bar.update(task, advance=1)
|
||||
|
||||
for extra_file in present_files:
|
||||
messages.warning(None, f"Orphaned file in media dir: {extra_file}")
|
||||
|
||||
@@ -8,7 +8,6 @@ from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from tempfile import mkstemp
|
||||
|
||||
import tqdm
|
||||
from celery import Task
|
||||
from celery import shared_task
|
||||
from celery import states
|
||||
@@ -19,6 +18,11 @@ from django.db import transaction
|
||||
from django.db.models.signals import post_save
|
||||
from django.utils import timezone
|
||||
from filelock import FileLock
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
from whoosh.writing import AsyncWriter
|
||||
|
||||
from documents import index
|
||||
@@ -83,9 +87,20 @@ def index_reindex(*, progress_bar_disable=False) -> None:
|
||||
|
||||
ix = index.open_index(recreate=True)
|
||||
|
||||
with AsyncWriter(ix) as writer:
|
||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
||||
with (
|
||||
AsyncWriter(ix) as writer,
|
||||
Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=progress_bar_disable,
|
||||
) as progress,
|
||||
):
|
||||
task = progress.add_task("Reindexing documents", total=documents.count())
|
||||
for document in documents:
|
||||
index.update_document(writer, document)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
|
||||
@shared_task
|
||||
|
||||
@@ -5,7 +5,6 @@ from pathlib import Path
|
||||
|
||||
import faiss
|
||||
import llama_index.core.settings as llama_settings
|
||||
import tqdm
|
||||
from celery import states
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
@@ -22,6 +21,11 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
|
||||
from llama_index.core.storage.index_store import SimpleIndexStore
|
||||
from llama_index.core.text_splitter import TokenTextSplitter
|
||||
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||
from rich.progress import BarColumn
|
||||
from rich.progress import Progress
|
||||
from rich.progress import TaskProgressColumn
|
||||
from rich.progress import TextColumn
|
||||
from rich.progress import TimeRemainingColumn
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import PaperlessTask
|
||||
@@ -176,9 +180,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
||||
embed_model = get_embedding_model()
|
||||
llama_settings.Settings.embed_model = embed_model
|
||||
storage_context = get_or_create_storage_context(rebuild=True)
|
||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
||||
document_nodes = build_document_node(document)
|
||||
nodes.extend(document_nodes)
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=progress_bar_disable,
|
||||
) as progress:
|
||||
task = progress.add_task("Building document nodes", total=documents.count())
|
||||
for document in documents:
|
||||
document_nodes = build_document_node(document)
|
||||
nodes.extend(document_nodes)
|
||||
progress.update(task, advance=1)
|
||||
|
||||
index = VectorStoreIndex(
|
||||
nodes=nodes,
|
||||
@@ -196,23 +209,33 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
||||
for node in index.docstore.get_nodes(all_node_ids)
|
||||
}
|
||||
|
||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
||||
doc_id = str(document.id)
|
||||
document_modified = document.modified.isoformat()
|
||||
with Progress(
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
TimeRemainingColumn(),
|
||||
disable=progress_bar_disable,
|
||||
) as progress:
|
||||
task = progress.add_task("Updating index nodes", total=documents.count())
|
||||
for document in documents:
|
||||
doc_id = str(document.id)
|
||||
document_modified = document.modified.isoformat()
|
||||
|
||||
if doc_id in existing_nodes:
|
||||
node = existing_nodes[doc_id]
|
||||
node_modified = node.metadata.get("modified")
|
||||
if doc_id in existing_nodes:
|
||||
node = existing_nodes[doc_id]
|
||||
node_modified = node.metadata.get("modified")
|
||||
|
||||
if node_modified == document_modified:
|
||||
continue
|
||||
if node_modified == document_modified:
|
||||
progress.update(task, advance=1)
|
||||
continue
|
||||
|
||||
# Again, delete from docstore, FAISS IndexFlatL2 are append-only
|
||||
index.docstore.delete_document(node.node_id)
|
||||
nodes.extend(build_document_node(document))
|
||||
else:
|
||||
# New document, add it
|
||||
nodes.extend(build_document_node(document))
|
||||
# Again, delete from docstore, FAISS IndexFlatL2 are append-only
|
||||
index.docstore.delete_document(node.node_id)
|
||||
nodes.extend(build_document_node(document))
|
||||
else:
|
||||
# New document, add it
|
||||
nodes.extend(build_document_node(document))
|
||||
progress.update(task, advance=1)
|
||||
|
||||
if nodes:
|
||||
msg = "LLM index updated successfully."
|
||||
|
||||
Reference in New Issue
Block a user