Compare commits

5 Commits

Author     SHA1        Message                                                              Date
Trenton H  59a25f8601  Updates this baseline too                                            2026-02-09 16:44:40 -08:00
Trenton H  5950f22e87  Typing                                                               2026-02-09 16:32:39 -08:00
Trenton H  40ff58ad39  Wrong indentation level                                              2026-02-09 16:19:46 -08:00
Trenton H  914362224c  Drops (direct) dependency on TQDM and removes its typing entry too  2026-02-09 15:54:29 -08:00
Trenton H  e0b45539a6  Replaces tqdm with rich                                              2026-02-09 15:53:27 -08:00
15 changed files with 454 additions and 302 deletions
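
Every converted call site in this change set follows the same recipe: the tqdm iterator wrapper becomes an explicit rich Progress context manager with the same four columns, an add_task() call carrying the old total=, and a per-item update(advance=1). A minimal standalone sketch of the shared shape (the iterable, description, and flag here are placeholders, not taken from the diffs):

from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn

no_progress_bar = False  # the commands pass self.no_progress_bar here
items = range(100)  # placeholder for the per-command iterable

with Progress(
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TaskProgressColumn(),
    TimeRemainingColumn(),
    disable=no_progress_bar,
) as progress:
    task = progress.add_task("Processing items", total=len(items))
    for item in items:
        # per-command work happens here, then the bar advances one step
        progress.update(task, advance=1)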

View File

@@ -277,8 +277,6 @@ src/documents/management/commands/document_exporter.py:0: error: Skipping analyz
 src/documents/management/commands/document_exporter.py:0: error: Skipping analyzing "auditlog.models": module is installed, but missing library stubs or py.typed marker [import-untyped]
 src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
 src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
-src/documents/management/commands/document_importer.py:0: error: Argument 1 to "create_source_path_directory" has incompatible type "Path | None"; expected "Path" [arg-type]
-src/documents/management/commands/document_importer.py:0: error: Argument 2 to "copy_file_with_basic_stats" has incompatible type "Path | None"; expected "Path | str" [arg-type]
 src/documents/management/commands/document_importer.py:0: error: Attribute "version" already defined on line 0 [no-redef]
 src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
 src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]

View File

@@ -77,7 +77,6 @@ dependencies = [
   "setproctitle~=1.3.4",
   "tika-client~=0.10.0",
   "torch~=2.10.0",
-  "tqdm~=4.67.1",
   "watchfiles>=1.1.1",
   "whitenoise~=6.11",
   "whoosh-reloaded>=2.7.5",
@@ -150,7 +149,6 @@ typing = [
   "types-pytz",
   "types-redis",
   "types-setuptools",
-  "types-tqdm",
 ]

 [tool.uv]

View File

@@ -1,10 +1,14 @@
 import logging
 import multiprocessing

-import tqdm
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -75,20 +79,24 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         try:
             logging.getLogger().handlers[0].level = logging.ERROR

-            if self.process_count == 1:
-                for doc_id in document_ids:
-                    update_document_content_maybe_archive_file(doc_id)
-            else:  # pragma: no cover
-                with multiprocessing.Pool(self.process_count) as pool:
-                    list(
-                        tqdm.tqdm(
-                            pool.imap_unordered(
-                                update_document_content_maybe_archive_file,
-                                document_ids,
-                            ),
-                            total=len(document_ids),
-                            disable=self.no_progress_bar,
-                        ),
-                    )
+            with Progress(
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                TimeRemainingColumn(),
+                disable=self.no_progress_bar,
+            ) as progress:
+                task = progress.add_task("Archiving documents", total=len(document_ids))
+                if self.process_count == 1:
+                    for doc_id in document_ids:
+                        update_document_content_maybe_archive_file(doc_id)
+                        progress.update(task, advance=1)
+                else:  # pragma: no cover
+                    with multiprocessing.Pool(self.process_count) as pool:
+                        for _ in pool.imap_unordered(
+                            update_document_content_maybe_archive_file,
+                            document_ids,
+                        ):
+                            progress.update(task, advance=1)
         except KeyboardInterrupt:
             self.stdout.write(self.style.NOTICE("Aborting..."))
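
The multiprocessing branch is the subtle part of this conversion: tqdm wrapped pool.imap_unordered() directly, and the surrounding list() existed only to drain the iterator, while the rich version makes the draining loop explicit and advances the task as each worker finishes. A runnable sketch of just that branch, with a trivial stand-in for update_document_content_maybe_archive_file:

import multiprocessing

from rich.progress import Progress


def process_one(doc_id: int) -> int:
    # stand-in for update_document_content_maybe_archive_file
    return doc_id


if __name__ == "__main__":
    document_ids = list(range(50))
    # default columns here; the command passes four explicit ones
    with Progress() as progress:
        task = progress.add_task("Archiving documents", total=len(document_ids))
        with multiprocessing.Pool(2) as pool:
            # imap_unordered yields results in completion order, so the
            # bar advances at the true rate of finished work
            for _ in pool.imap_unordered(process_one, document_ids):
                progress.update(task, advance=1)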

View File

@@ -6,7 +6,6 @@ import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING

-import tqdm
 from allauth.mfa.models import Authenticator
 from allauth.socialaccount.models import SocialAccount
 from allauth.socialaccount.models import SocialApp
@@ -24,6 +23,11 @@ from django.utils import timezone
 from filelock import FileLock
 from guardian.models import GroupObjectPermission
 from guardian.models import UserObjectPermission
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -309,54 +313,64 @@ class Command(CryptMixin, BaseCommand):
         document_manifest = manifest_dict["documents"]

         # 3. Export files from each document
-        for index, document_dict in tqdm.tqdm(
-            enumerate(document_manifest),
-            total=len(document_manifest),
-            disable=self.no_progress_bar,
-        ):
-            document = document_map[document_dict["pk"]]
-
-            # 3.1. generate a unique filename
-            base_name = self.generate_base_name(document)
-
-            # 3.2. write filenames into manifest
-            original_target, thumbnail_target, archive_target = (
-                self.generate_document_targets(document, base_name, document_dict)
-            )
-
-            # 3.3. write files to target folder
-            if not self.data_only:
-                self.copy_document_files(
-                    document,
-                    original_target,
-                    thumbnail_target,
-                    archive_target,
-                )
-
-            if self.split_manifest:
-                manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
-                if self.use_folder_prefix:
-                    manifest_name = Path("json") / manifest_name
-                manifest_name = (self.target / manifest_name).resolve()
-                manifest_name.parent.mkdir(parents=True, exist_ok=True)
-
-                content = [document_manifest[index]]
-                content += list(
-                    filter(
-                        lambda d: d["fields"]["document"] == document_dict["pk"],
-                        manifest_dict["notes"],
-                    ),
-                )
-                content += list(
-                    filter(
-                        lambda d: d["fields"]["document"] == document_dict["pk"],
-                        manifest_dict["custom_field_instances"],
-                    ),
-                )
-                self.check_and_write_json(
-                    content,
-                    manifest_name,
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task(
+                "Exporting documents",
+                total=len(document_manifest),
+            )
+            for index, document_dict in enumerate(document_manifest):
+                document = document_map[document_dict["pk"]]
+
+                # 3.1. generate a unique filename
+                base_name = self.generate_base_name(document)
+
+                # 3.2. write filenames into manifest
+                original_target, thumbnail_target, archive_target = (
+                    self.generate_document_targets(document, base_name, document_dict)
+                )
+
+                # 3.3. write files to target folder
+                if not self.data_only:
+                    self.copy_document_files(
+                        document,
+                        original_target,
+                        thumbnail_target,
+                        archive_target,
+                    )
+
+                if self.split_manifest:
+                    manifest_name = base_name.with_name(
+                        f"{base_name.stem}-manifest.json",
+                    )
+                    if self.use_folder_prefix:
+                        manifest_name = Path("json") / manifest_name
+                    manifest_name = (self.target / manifest_name).resolve()
+                    manifest_name.parent.mkdir(parents=True, exist_ok=True)
+
+                    content = [document_manifest[index]]
+                    content += list(
+                        filter(
+                            lambda d: d["fields"]["document"] == document_dict["pk"],
+                            manifest_dict["notes"],
+                        ),
+                    )
+                    content += list(
+                        filter(
+                            lambda d: d["fields"]["document"] == document_dict["pk"],
+                            manifest_dict["custom_field_instances"],
+                        ),
+                    )
+                    self.check_and_write_json(
+                        content,
+                        manifest_name,
+                    )
+                progress.update(task, advance=1)

         # These were exported already
         if self.split_manifest:
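
The exporter keeps the explicit add_task()/update() pair, which suits its long loop body. For simpler loops, rich also offers Progress.track(), which folds the bookkeeping into the for statement itself; a sketch of that alternative (not what the diff does, and the manifest here is a placeholder):

from rich.progress import Progress

document_manifest = [{"pk": 1}, {"pk": 2}]  # placeholder records

with Progress() as progress:
    # track() creates the task and advances it once per yielded item;
    # enumerate() has no len(), so total must be passed explicitly
    for index, document_dict in progress.track(
        enumerate(document_manifest),
        total=len(document_manifest),
        description="Exporting documents",
    ):
        pass  # export work per document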

View File

@@ -3,9 +3,13 @@ import multiprocessing
 from typing import Final

 import rapidfuzz
-import tqdm
 from django.core.management import BaseCommand
 from django.core.management import CommandError
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -106,19 +110,25 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
             work_pkgs.append(_WorkPackage(first_doc, second_doc))

         # Don't spin up a pool of 1 process
-        if self.process_count == 1:
-            results = []
-            for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
-                results.append(_process_and_match(work))
-        else:  # pragma: no cover
-            with multiprocessing.Pool(processes=self.process_count) as pool:
-                results = list(
-                    tqdm.tqdm(
-                        pool.imap_unordered(_process_and_match, work_pkgs),
-                        total=len(work_pkgs),
-                        disable=self.no_progress_bar,
-                    ),
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Fuzzy matching documents", total=len(work_pkgs))
+            if self.process_count == 1:
+                results = []
+                for work in work_pkgs:
+                    results.append(_process_and_match(work))
+                    progress.update(task, advance=1)
+            else:  # pragma: no cover
+                with multiprocessing.Pool(processes=self.process_count) as pool:
+                    results = []
+                    for result in pool.imap_unordered(_process_and_match, work_pkgs):
+                        results.append(result)
+                        progress.update(task, advance=1)

         # Check results
         messages = []

View File

@@ -5,10 +5,10 @@ import tempfile
 from collections.abc import Generator
 from contextlib import contextmanager
 from pathlib import Path
+from typing import TYPE_CHECKING
 from zipfile import ZipFile
 from zipfile import is_zipfile

-import tqdm
 from django.conf import settings
 from django.contrib.auth.models import Permission
 from django.contrib.auth.models import User
@@ -23,6 +23,11 @@ from django.db import transaction
 from django.db.models.signals import m2m_changed
 from django.db.models.signals import post_save
 from filelock import FileLock
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.file_handling import create_source_path_directory
 from documents.management.commands.mixins import CryptMixin
@@ -365,58 +370,72 @@ class Command(CryptMixin, BaseCommand):
         manifest_documents = list(
             filter(lambda r: r["model"] == "documents.document", self.manifest),
         )
-        for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
-            document = Document.objects.get(pk=record["pk"])
-
-            doc_file = record[EXPORTER_FILE_NAME]
-            document_path = self.source / doc_file
-
-            if EXPORTER_THUMBNAIL_NAME in record:
-                thumb_file = record[EXPORTER_THUMBNAIL_NAME]
-                thumbnail_path = (self.source / thumb_file).resolve()
-            else:
-                thumbnail_path = None
-
-            if EXPORTER_ARCHIVE_NAME in record:
-                archive_file = record[EXPORTER_ARCHIVE_NAME]
-                archive_path = self.source / archive_file
-            else:
-                archive_path = None
-
-            with FileLock(settings.MEDIA_LOCK):
-                if Path(document.source_path).is_file():
-                    raise FileExistsError(document.source_path)
-
-                create_source_path_directory(document.source_path)
-
-                copy_file_with_basic_stats(document_path, document.source_path)
-
-                if thumbnail_path:
-                    if thumbnail_path.suffix in {".png", ".PNG"}:
-                        run_convert(
-                            density=300,
-                            scale="500x5000>",
-                            alpha="remove",
-                            strip=True,
-                            trim=False,
-                            auto_orient=True,
-                            input_file=f"{thumbnail_path}[0]",
-                            output_file=str(document.thumbnail_path),
-                        )
-                    else:
-                        copy_file_with_basic_stats(
-                            thumbnail_path,
-                            document.thumbnail_path,
-                        )
-
-                if archive_path:
-                    create_source_path_directory(document.archive_path)
-                    # TODO: this assumes that the export is valid and
-                    # archive_filename is present on all documents with
-                    # archived files
-                    copy_file_with_basic_stats(archive_path, document.archive_path)
-
-            document.save()
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task(
+                "Importing documents",
+                total=len(manifest_documents),
+            )
+            for record in manifest_documents:
+                document = Document.objects.get(pk=record["pk"])

+                doc_file = record[EXPORTER_FILE_NAME]
+                document_path = self.source / doc_file
+
+                if EXPORTER_THUMBNAIL_NAME in record:
+                    thumb_file = record[EXPORTER_THUMBNAIL_NAME]
+                    thumbnail_path = (self.source / thumb_file).resolve()
+                else:
+                    thumbnail_path = None
+
+                if EXPORTER_ARCHIVE_NAME in record:
+                    archive_file = record[EXPORTER_ARCHIVE_NAME]
+                    archive_path = self.source / archive_file
+                else:
+                    archive_path = None
+
+                with FileLock(settings.MEDIA_LOCK):
+                    if Path(document.source_path).is_file():
+                        raise FileExistsError(document.source_path)
+
+                    create_source_path_directory(document.source_path)
+
+                    copy_file_with_basic_stats(document_path, document.source_path)
+
+                    if thumbnail_path:
+                        if thumbnail_path.suffix in {".png", ".PNG"}:
+                            run_convert(
+                                density=300,
+                                scale="500x5000>",
+                                alpha="remove",
+                                strip=True,
+                                trim=False,
+                                auto_orient=True,
+                                input_file=f"{thumbnail_path}[0]",
+                                output_file=str(document.thumbnail_path),
+                            )
+                        else:
+                            copy_file_with_basic_stats(
+                                thumbnail_path,
+                                document.thumbnail_path,
+                            )
+
+                    if archive_path:
+                        if TYPE_CHECKING:
+                            assert document.archive_path is not None
+                        create_source_path_directory(document.archive_path)
+                        # TODO: this assumes that the export is valid and
+                        # archive_filename is present on all documents with
+                        # archived files
+                        copy_file_with_basic_stats(archive_path, document.archive_path)
+
+                document.save()
+                progress.update(task, advance=1)

     def decrypt_secret_fields(self) -> None:
         """

View File

@@ -1,8 +1,12 @@
 import logging

-import tqdm
 from django.core.management.base import BaseCommand
 from django.db.models.signals import post_save
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import ProgressBarMixin
 from documents.models import Document
@@ -18,8 +22,15 @@ class Command(ProgressBarMixin, BaseCommand):
         self.handle_progress_bar_mixin(**options)
         logging.getLogger().handlers[0].level = logging.ERROR

-        for document in tqdm.tqdm(
-            Document.objects.all(),
-            disable=self.no_progress_bar,
-        ):
-            post_save.send(Document, instance=document, created=False)
+        documents = Document.objects.all()
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Renaming documents", total=documents.count())
+            for document in documents:
+                post_save.send(Document, instance=document, created=False)
+                progress.update(task, advance=1)

View File

@@ -1,7 +1,11 @@
 import logging

-import tqdm
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.classifier import load_classifier
 from documents.management.commands.mixins import ProgressBarMixin
@@ -84,53 +88,62 @@ class Command(ProgressBarMixin, BaseCommand):
         classifier = load_classifier()

-        for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
-            if options["correspondent"]:
-                set_correspondent(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
-
-            if options["document_type"]:
-                set_document_type(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
-
-            if options["tags"]:
-                set_tags(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
-
-            if options["storage_path"]:
-                set_storage_path(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Retagging documents", total=documents.count())
+            for document in documents:
+                if options["correspondent"]:
+                    set_correspondent(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+
+                if options["document_type"]:
+                    set_document_type(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+
+                if options["tags"]:
+                    set_tags(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+
+                if options["storage_path"]:
+                    set_storage_path(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+                progress.update(task, advance=1)

View File

@@ -2,9 +2,13 @@ import logging
 import multiprocessing
 import shutil

-import tqdm
 from django import db
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -70,15 +74,19 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         # with postgres.
         db.connections.close_all()

-        if self.process_count == 1:
-            for doc_id in ids:
-                _process_document(doc_id)
-        else:  # pragma: no cover
-            with multiprocessing.Pool(processes=self.process_count) as pool:
-                list(
-                    tqdm.tqdm(
-                        pool.imap_unordered(_process_document, ids),
-                        total=len(ids),
-                        disable=self.no_progress_bar,
-                    ),
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Generating thumbnails", total=len(ids))
+            if self.process_count == 1:
+                for doc_id in ids:
+                    _process_document(doc_id)
+                    progress.update(task, advance=1)
+            else:  # pragma: no cover
+                with multiprocessing.Pool(processes=self.process_count) as pool:
+                    for _ in pool.imap_unordered(_process_document, ids):
+                        progress.update(task, advance=1)

View File

@@ -1,7 +1,12 @@
 from auditlog.models import LogEntry
 from django.core.management.base import BaseCommand
 from django.db import transaction
-from tqdm import tqdm
+from rich.console import Console
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.management.commands.mixins import ProgressBarMixin

@@ -18,22 +23,37 @@ class Command(BaseCommand, ProgressBarMixin):
     def handle(self, **options):
         self.handle_progress_bar_mixin(**options)

+        console = Console()
+        log_entries = LogEntry.objects.all()
         with transaction.atomic():
-            for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
-                model_class = log_entry.content_type.model_class()
-                # use global_objects for SoftDeleteModel
-                objects = (
-                    model_class.global_objects
-                    if hasattr(model_class, "global_objects")
-                    else model_class.objects
-                )
-                if (
-                    log_entry.object_id
-                    and not objects.filter(pk=log_entry.object_id).exists()
-                ):
-                    log_entry.delete()
-                    tqdm.write(
-                        self.style.NOTICE(
-                            f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
-                        ),
-                    )
+            with Progress(
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                TimeRemainingColumn(),
+                console=console,
+                disable=self.no_progress_bar,
+            ) as progress:
+                task = progress.add_task(
+                    "Pruning audit logs",
+                    total=log_entries.count(),
+                )
+                for log_entry in log_entries:
+                    model_class = log_entry.content_type.model_class()
+                    # use global_objects for SoftDeleteModel
+                    objects = (
+                        model_class.global_objects
+                        if hasattr(model_class, "global_objects")
+                        else model_class.objects
+                    )
+                    if (
+                        log_entry.object_id
+                        and not objects.filter(pk=log_entry.object_id).exists()
+                    ):
+                        log_entry.delete()
+                        console.print(
+                            self.style.NOTICE(
+                                f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
+                            ),
+                        )
+                    progress.update(task, advance=1)
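
This command is the only one that printed from inside the loop; tqdm.write() exists precisely to print without corrupting a live tqdm bar. The rich equivalent is to build one Console, hand it to Progress via console=, and print through it, so the live display moves the bar out of the way of each message. A small sketch of that interaction:

from rich.console import Console
from rich.progress import Progress

console = Console()

with Progress(console=console) as progress:  # share one Console with the bar
    task = progress.add_task("Pruning audit logs", total=3)
    for entry_id in range(3):
        # printing through the shared console renders the message above
        # the live progress bar instead of through it
        console.print(f"Deleted audit log entry #{entry_id}")
        progress.update(task, advance=1)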

View File

@@ -8,7 +8,11 @@ from typing import Final
 from celery import states
 from django.conf import settings
 from django.utils import timezone
-from tqdm import tqdm
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.models import Document
 from documents.models import PaperlessTask
@@ -92,76 +96,99 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
     if logo_file in present_files:
         present_files.remove(logo_file)

-    for doc in tqdm(Document.global_objects.all(), disable=not progress):
-        # Check sanity of the thumbnail
-        thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
-        if not thumbnail_path.exists() or not thumbnail_path.is_file():
-            messages.error(doc.pk, "Thumbnail of document does not exist.")
-        else:
-            if thumbnail_path in present_files:
-                present_files.remove(thumbnail_path)
-            try:
-                _ = thumbnail_path.read_bytes()
-            except OSError as e:
-                messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
-
-        # Check sanity of the original file
-        # TODO: extract method
-        source_path: Final[Path] = Path(doc.source_path).resolve()
-        if not source_path.exists() or not source_path.is_file():
-            messages.error(doc.pk, "Original of document does not exist.")
-        else:
-            if source_path in present_files:
-                present_files.remove(source_path)
-            try:
-                checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
-            except OSError as e:
-                messages.error(doc.pk, f"Cannot read original file of document: {e}")
-            else:
-                if checksum != doc.checksum:
-                    messages.error(
-                        doc.pk,
-                        "Checksum mismatch. "
-                        f"Stored: {doc.checksum}, actual: {checksum}.",
-                    )
-
-        # Check sanity of the archive file.
-        if doc.archive_checksum is not None and doc.archive_filename is None:
-            messages.error(
-                doc.pk,
-                "Document has an archive file checksum, but no archive filename.",
-            )
-        elif doc.archive_checksum is None and doc.archive_filename is not None:
-            messages.error(
-                doc.pk,
-                "Document has an archive file, but its checksum is missing.",
-            )
-        elif doc.has_archive_version:
-            archive_path: Final[Path] = Path(doc.archive_path).resolve()
-            if not archive_path.exists() or not archive_path.is_file():
-                messages.error(doc.pk, "Archived version of document does not exist.")
-            else:
-                if archive_path in present_files:
-                    present_files.remove(archive_path)
-                try:
-                    checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
-                except OSError as e:
-                    messages.error(
-                        doc.pk,
-                        f"Cannot read archive file of document : {e}",
-                    )
-                else:
-                    if checksum != doc.archive_checksum:
-                        messages.error(
-                            doc.pk,
-                            "Checksum mismatch of archived document. "
-                            f"Stored: {doc.archive_checksum}, "
-                            f"actual: {checksum}.",
-                        )
-
-        # other document checks
-        if not doc.content:
-            messages.info(doc.pk, "Document contains no OCR data")
+    documents = Document.global_objects.all()
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+        disable=not progress,
+    ) as progress_bar:
+        task = progress_bar.add_task(
+            "Checking document sanity",
+            total=documents.count(),
+        )
+        for doc in documents:
+            # Check sanity of the thumbnail
+            thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
+            if not thumbnail_path.exists() or not thumbnail_path.is_file():
+                messages.error(doc.pk, "Thumbnail of document does not exist.")
+            else:
+                if thumbnail_path in present_files:
+                    present_files.remove(thumbnail_path)
+                try:
+                    _ = thumbnail_path.read_bytes()
+                except OSError as e:
+                    messages.error(
+                        doc.pk,
+                        f"Cannot read thumbnail file of document: {e}",
+                    )
+
+            # Check sanity of the original file
+            # TODO: extract method
+            source_path: Final[Path] = Path(doc.source_path).resolve()
+            if not source_path.exists() or not source_path.is_file():
+                messages.error(doc.pk, "Original of document does not exist.")
+            else:
+                if source_path in present_files:
+                    present_files.remove(source_path)
+                try:
+                    checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
+                except OSError as e:
+                    messages.error(
+                        doc.pk,
+                        f"Cannot read original file of document: {e}",
+                    )
+                else:
+                    if checksum != doc.checksum:
+                        messages.error(
+                            doc.pk,
+                            "Checksum mismatch. "
+                            f"Stored: {doc.checksum}, actual: {checksum}.",
+                        )
+
+            # Check sanity of the archive file.
+            if doc.archive_checksum is not None and doc.archive_filename is None:
+                messages.error(
+                    doc.pk,
+                    "Document has an archive file checksum, but no archive filename.",
+                )
+            elif doc.archive_checksum is None and doc.archive_filename is not None:
+                messages.error(
+                    doc.pk,
+                    "Document has an archive file, but its checksum is missing.",
+                )
+            elif doc.has_archive_version:
+                archive_path: Final[Path] = Path(doc.archive_path).resolve()
+                if not archive_path.exists() or not archive_path.is_file():
+                    messages.error(
+                        doc.pk,
+                        "Archived version of document does not exist.",
+                    )
+                else:
+                    if archive_path in present_files:
+                        present_files.remove(archive_path)
+                    try:
+                        checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
+                    except OSError as e:
+                        messages.error(
+                            doc.pk,
+                            f"Cannot read archive file of document : {e}",
+                        )
+                    else:
+                        if checksum != doc.archive_checksum:
+                            messages.error(
+                                doc.pk,
+                                "Checksum mismatch of archived document. "
+                                f"Stored: {doc.archive_checksum}, "
+                                f"actual: {checksum}.",
+                            )
+
+            # other document checks
+            if not doc.content:
+                messages.info(doc.pk, "Document contains no OCR data")
+            progress_bar.update(task, advance=1)

     for extra_file in present_files:
         messages.warning(None, f"Orphaned file in media dir: {extra_file}")

View File

@@ -8,7 +8,6 @@ from pathlib import Path
 from tempfile import TemporaryDirectory
 from tempfile import mkstemp

-import tqdm
 from celery import Task
 from celery import shared_task
 from celery import states
@@ -19,6 +18,11 @@ from django.db import transaction
 from django.db.models.signals import post_save
 from django.utils import timezone
 from filelock import FileLock
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 from whoosh.writing import AsyncWriter

 from documents import index
@@ -83,9 +87,20 @@ def index_reindex(*, progress_bar_disable=False) -> None:
     ix = index.open_index(recreate=True)

-    with AsyncWriter(ix) as writer:
-        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
+    with (
+        AsyncWriter(ix) as writer,
+        Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=progress_bar_disable,
+        ) as progress,
+    ):
+        task = progress.add_task("Reindexing documents", total=documents.count())
+        for document in documents:
             index.update_document(writer, document)
+            progress.update(task, advance=1)


 @shared_task
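
The reindex task stacks AsyncWriter and Progress in a single parenthesized with statement, a form that requires Python 3.10 or later; on older interpreters the same pairing is written without the parentheses or as nested with blocks. A sketch of the parenthesized shape with generic context managers (TemporaryDirectory stands in for AsyncWriter(ix)):

import tempfile

from rich.progress import Progress

# Parenthesized multi-item with statement (Python 3.10+), as in the diff:
with (
    tempfile.TemporaryDirectory() as scratch,  # stand-in for AsyncWriter(ix)
    Progress() as progress,
):
    task = progress.add_task("Reindexing documents", total=2)
    for _ in range(2):
        progress.update(task, advance=1)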

View File

@@ -5,7 +5,6 @@ from pathlib import Path
 import faiss
 import llama_index.core.settings as llama_settings
-import tqdm
 from celery import states
 from django.conf import settings
 from django.utils import timezone
@@ -22,6 +21,11 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
 from llama_index.core.storage.index_store import SimpleIndexStore
 from llama_index.core.text_splitter import TokenTextSplitter
 from llama_index.vector_stores.faiss import FaissVectorStore
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn

 from documents.models import Document
 from documents.models import PaperlessTask
@@ -176,9 +180,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
         embed_model = get_embedding_model()
         llama_settings.Settings.embed_model = embed_model
         storage_context = get_or_create_storage_context(rebuild=True)

-        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
-            document_nodes = build_document_node(document)
-            nodes.extend(document_nodes)
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=progress_bar_disable,
+        ) as progress:
+            task = progress.add_task("Building document nodes", total=documents.count())
+            for document in documents:
+                document_nodes = build_document_node(document)
+                nodes.extend(document_nodes)
+                progress.update(task, advance=1)

         index = VectorStoreIndex(
             nodes=nodes,
@@ -196,23 +209,33 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
             for node in index.docstore.get_nodes(all_node_ids)
         }

-        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
-            doc_id = str(document.id)
-            document_modified = document.modified.isoformat()
-
-            if doc_id in existing_nodes:
-                node = existing_nodes[doc_id]
-                node_modified = node.metadata.get("modified")
-
-                if node_modified == document_modified:
-                    continue
-
-                # Again, delete from docstore, FAISS IndexFlatL2 are append-only
-                index.docstore.delete_document(node.node_id)
-                nodes.extend(build_document_node(document))
-            else:
-                # New document, add it
-                nodes.extend(build_document_node(document))
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=progress_bar_disable,
+        ) as progress:
+            task = progress.add_task("Updating index nodes", total=documents.count())
+            for document in documents:
+                doc_id = str(document.id)
+                document_modified = document.modified.isoformat()
+
+                if doc_id in existing_nodes:
+                    node = existing_nodes[doc_id]
+                    node_modified = node.metadata.get("modified")
+
+                    if node_modified == document_modified:
+                        progress.update(task, advance=1)
+                        continue
+
+                    # Again, delete from docstore, FAISS IndexFlatL2 are append-only
+                    index.docstore.delete_document(node.node_id)
+                    nodes.extend(build_document_node(document))
+                else:
+                    # New document, add it
+                    nodes.extend(build_document_node(document))
+                progress.update(task, advance=1)

         if nodes:
             msg = "LLM index updated successfully."
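
The partial-update loop shows the one behavioral trap in this migration: tqdm advanced the bar by wrapping the iterator, so a continue still counted, but with manual updates the early-exit path for unchanged documents must advance the task itself or the bar ends short of 100%. The diff therefore updates before the continue; a condensed sketch with placeholder data:

from rich.progress import Progress

documents = [("1", "2026-01-01"), ("2", "2026-01-02")]  # (id, modified) stand-ins
existing_nodes = {"1": "2026-01-01"}  # id -> modified stamp already indexed

with Progress() as progress:
    task = progress.add_task("Updating index nodes", total=len(documents))
    for doc_id, modified in documents:
        if existing_nodes.get(doc_id) == modified:
            # advance BEFORE continue, or skipped documents never count
            progress.update(task, advance=1)
            continue
        # ...rebuild nodes for changed or new documents...
        progress.update(task, advance=1)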

View File

@@ -76,6 +76,7 @@ def test_update_llm_index(
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([real_document])
+    mock_queryset.count.return_value = 1
     mock_all.return_value = mock_queryset

     indexing.update_llm_index(rebuild=True)
@@ -97,6 +98,7 @@ def test_update_llm_index_removes_meta(
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([real_document])
+    mock_queryset.count.return_value = 1
     mock_all.return_value = mock_queryset

     indexing.update_llm_index(rebuild=True)
@@ -129,6 +131,7 @@ def test_update_llm_index_partial_update(
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([real_document, doc2])
+    mock_queryset.count.return_value = 2
     mock_all.return_value = mock_queryset

     indexing.update_llm_index(rebuild=True)
@@ -149,6 +152,7 @@
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
+    mock_queryset.count.return_value = 3
     mock_all.return_value = mock_queryset

     # assert logs "Updating LLM index with %d new nodes and removing %d old nodes."
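
The test changes follow from totals now coming from QuerySet.count() rather than iteration: calling count() on a bare MagicMock returns another MagicMock, which presumably breaks rich's percentage math, so each mocked queryset pins count() to the real document count. In isolation:

from unittest.mock import MagicMock

mock_queryset = MagicMock()
mock_queryset.exists.return_value = True
mock_queryset.__iter__.return_value = iter([object()])
# Progress totals are computed via documents.count(), so the mock must
# return a real int instead of a MagicMock
mock_queryset.count.return_value = 1

assert mock_queryset.count() == 1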

uv.lock (generated)
View File

@@ -3083,7 +3083,6 @@ dependencies = [
     { name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" },
     { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'linux'" },
-    { name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "watchfiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "whitenoise", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "whoosh-reloaded", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3166,7 +3165,6 @@ typing = [
     { name = "types-pytz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "types-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "types-setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "types-tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 ]

 [package.metadata]
@@ -3237,7 +3235,6 @@ requires-dist = [
     { name = "setproctitle", specifier = "~=1.3.4" },
     { name = "tika-client", specifier = "~=0.10.0" },
     { name = "torch", specifier = "~=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
-    { name = "tqdm", specifier = "~=4.67.1" },
     { name = "watchfiles", specifier = ">=1.1.1" },
     { name = "whitenoise", specifier = "~=6.11" },
     { name = "whoosh-reloaded", specifier = ">=2.7.5" },
@@ -3304,7 +3301,6 @@ typing = [
     { name = "types-pytz" },
     { name = "types-redis" },
     { name = "types-setuptools" },
-    { name = "types-tqdm" },
 ]

 [[package]]
@@ -5584,18 +5580,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2b/7f/016dc5cc718ec6ccaa84fb73ed409ef1c261793fd5e637cdfaa18beb40a9/types_setuptools-80.10.0.20260124-py3-none-any.whl", hash = "sha256:efed7e044f01adb9c2806c7a8e1b6aa3656b8e382379b53d5f26ee3db24d4c01", size = 64333, upload-time = "2026-01-24T03:18:38.344Z" },
 ]

-[[package]]
-name = "types-tqdm"
-version = "4.67.3.20260205"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "types-requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/53/46/790b9872523a48163bdda87d47849b4466017640e5259d06eed539340afd/types_tqdm-4.67.3.20260205.tar.gz", hash = "sha256:f3023682d4aa3bbbf908c8c6bb35f35692d319460d9bbd3e646e8852f3dd9f85", size = 17597, upload-time = "2026-02-05T04:03:19.721Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/da/7f761868dbaa328392356fab30c18ab90d14cce86b269e7e63328f29d4a3/types_tqdm-4.67.3.20260205-py3-none-any.whl", hash = "sha256:85c31731e81dc3c5cecc34c6c8b2e5166fafa722468f58840c2b5ac6a8c5c173", size = 23894, upload-time = "2026-02-05T04:03:18.48Z" },
-]
-
 [[package]]
 name = "types-webencodings"
 version = "0.5.0.20251108"