Mirror of https://github.com/paperless-ngx/paperless-ngx.git · synced 2026-02-09 23:49:29 -06:00

Compare commits: dev...chore/swit (5 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 59a25f8601 | |
| | 5950f22e87 | |
| | 40ff58ad39 | |
| | 914362224c | |
| | e0b45539a6 | |
@@ -277,8 +277,6 @@ src/documents/management/commands/document_exporter.py:0: error: Skipping analyz
 src/documents/management/commands/document_exporter.py:0: error: Skipping analyzing "auditlog.models": module is installed, but missing library stubs or py.typed marker [import-untyped]
 src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
 src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
-src/documents/management/commands/document_importer.py:0: error: Argument 1 to "create_source_path_directory" has incompatible type "Path | None"; expected "Path" [arg-type]
-src/documents/management/commands/document_importer.py:0: error: Argument 2 to "copy_file_with_basic_stats" has incompatible type "Path | None"; expected "Path | str" [arg-type]
 src/documents/management/commands/document_importer.py:0: error: Attribute "version" already defined on line 0 [no-redef]
 src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
 src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
@@ -77,7 +77,6 @@ dependencies = [
   "setproctitle~=1.3.4",
   "tika-client~=0.10.0",
   "torch~=2.10.0",
-  "tqdm~=4.67.1",
   "watchfiles>=1.1.1",
   "whitenoise~=6.11",
   "whoosh-reloaded>=2.7.5",
@@ -150,7 +149,6 @@ typing = [
   "types-pytz",
   "types-redis",
   "types-setuptools",
-  "types-tqdm",
 ]
 
 [tool.uv]
@@ -1,10 +1,14 @@
 import logging
 import multiprocessing
 
-import tqdm
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -75,20 +79,24 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         try:
             logging.getLogger().handlers[0].level = logging.ERROR
 
-            if self.process_count == 1:
-                for doc_id in document_ids:
-                    update_document_content_maybe_archive_file(doc_id)
-            else:  # pragma: no cover
-                with multiprocessing.Pool(self.process_count) as pool:
-                    list(
-                        tqdm.tqdm(
-                            pool.imap_unordered(
-                                update_document_content_maybe_archive_file,
-                                document_ids,
-                            ),
-                            total=len(document_ids),
-                            disable=self.no_progress_bar,
-                        ),
-                    )
+            with Progress(
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                TimeRemainingColumn(),
+                disable=self.no_progress_bar,
+            ) as progress:
+                task = progress.add_task("Archiving documents", total=len(document_ids))
+                if self.process_count == 1:
+                    for doc_id in document_ids:
+                        update_document_content_maybe_archive_file(doc_id)
+                        progress.update(task, advance=1)
+                else:  # pragma: no cover
+                    with multiprocessing.Pool(self.process_count) as pool:
+                        for _ in pool.imap_unordered(
+                            update_document_content_maybe_archive_file,
+                            document_ids,
+                        ):
+                            progress.update(task, advance=1)
         except KeyboardInterrupt:
             self.stdout.write(self.style.NOTICE("Aborting..."))
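The same pattern recurs throughout this branch: a `tqdm.tqdm(...)` wrapper around the iterable is replaced by an explicit `rich.progress.Progress` context manager with a named task that is advanced manually, for both the single-process and the multiprocessing branch. A minimal, self-contained sketch of that pattern (the `process` function and item list are placeholders, not code from this diff):

```python
import multiprocessing

from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskProgressColumn
from rich.progress import TextColumn
from rich.progress import TimeRemainingColumn


def process(item: int) -> int:  # placeholder for the real work function
    return item * item


def run(items: list[int], *, workers: int = 1, no_progress_bar: bool = False) -> None:
    with Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TimeRemainingColumn(),
        disable=no_progress_bar,
    ) as progress:
        task = progress.add_task("Processing items", total=len(items))
        if workers == 1:
            for item in items:
                process(item)
                progress.update(task, advance=1)
        else:
            # imap_unordered yields results as they complete, so the bar can
            # advance per result instead of materializing a list around tqdm.
            with multiprocessing.Pool(workers) as pool:
                for _ in pool.imap_unordered(process, items):
                    progress.update(task, advance=1)


if __name__ == "__main__":
    run(list(range(100)))
```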
@@ -6,7 +6,6 @@ import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-import tqdm
 from allauth.mfa.models import Authenticator
 from allauth.socialaccount.models import SocialAccount
 from allauth.socialaccount.models import SocialApp
@@ -24,6 +23,11 @@ from django.utils import timezone
 from filelock import FileLock
 from guardian.models import GroupObjectPermission
 from guardian.models import UserObjectPermission
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -309,54 +313,64 @@ class Command(CryptMixin, BaseCommand):
         document_manifest = manifest_dict["documents"]
 
         # 3. Export files from each document
-        for index, document_dict in tqdm.tqdm(
-            enumerate(document_manifest),
-            total=len(document_manifest),
-            disable=self.no_progress_bar,
-        ):
-            document = document_map[document_dict["pk"]]
-
-            # 3.1. generate a unique filename
-            base_name = self.generate_base_name(document)
-
-            # 3.2. write filenames into manifest
-            original_target, thumbnail_target, archive_target = (
-                self.generate_document_targets(document, base_name, document_dict)
-            )
-
-            # 3.3. write files to target folder
-            if not self.data_only:
-                self.copy_document_files(
-                    document,
-                    original_target,
-                    thumbnail_target,
-                    archive_target,
-                )
-
-            if self.split_manifest:
-                manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
-                if self.use_folder_prefix:
-                    manifest_name = Path("json") / manifest_name
-                manifest_name = (self.target / manifest_name).resolve()
-                manifest_name.parent.mkdir(parents=True, exist_ok=True)
-                content = [document_manifest[index]]
-                content += list(
-                    filter(
-                        lambda d: d["fields"]["document"] == document_dict["pk"],
-                        manifest_dict["notes"],
-                    ),
-                )
-                content += list(
-                    filter(
-                        lambda d: d["fields"]["document"] == document_dict["pk"],
-                        manifest_dict["custom_field_instances"],
-                    ),
-                )
-
-                self.check_and_write_json(
-                    content,
-                    manifest_name,
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task(
+                "Exporting documents",
+                total=len(document_manifest),
+            )
+            for index, document_dict in enumerate(document_manifest):
+                document = document_map[document_dict["pk"]]
+
+                # 3.1. generate a unique filename
+                base_name = self.generate_base_name(document)
+
+                # 3.2. write filenames into manifest
+                original_target, thumbnail_target, archive_target = (
+                    self.generate_document_targets(document, base_name, document_dict)
+                )
+
+                # 3.3. write files to target folder
+                if not self.data_only:
+                    self.copy_document_files(
+                        document,
+                        original_target,
+                        thumbnail_target,
+                        archive_target,
+                    )
+
+                if self.split_manifest:
+                    manifest_name = base_name.with_name(
+                        f"{base_name.stem}-manifest.json",
+                    )
+                    if self.use_folder_prefix:
+                        manifest_name = Path("json") / manifest_name
+                    manifest_name = (self.target / manifest_name).resolve()
+                    manifest_name.parent.mkdir(parents=True, exist_ok=True)
+                    content = [document_manifest[index]]
+                    content += list(
+                        filter(
+                            lambda d: d["fields"]["document"] == document_dict["pk"],
+                            manifest_dict["notes"],
+                        ),
+                    )
+                    content += list(
+                        filter(
+                            lambda d: d["fields"]["document"] == document_dict["pk"],
+                            manifest_dict["custom_field_instances"],
+                        ),
+                    )
+
+                    self.check_and_write_json(
+                        content,
+                        manifest_name,
+                    )
+                progress.update(task, advance=1)
 
         # These were exported already
         if self.split_manifest:
@@ -3,9 +3,13 @@ import multiprocessing
 from typing import Final
 
 import rapidfuzz
-import tqdm
 from django.core.management import BaseCommand
 from django.core.management import CommandError
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -106,19 +110,25 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
                 work_pkgs.append(_WorkPackage(first_doc, second_doc))
 
         # Don't spin up a pool of 1 process
-        if self.process_count == 1:
-            results = []
-            for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
-                results.append(_process_and_match(work))
-        else:  # pragma: no cover
-            with multiprocessing.Pool(processes=self.process_count) as pool:
-                results = list(
-                    tqdm.tqdm(
-                        pool.imap_unordered(_process_and_match, work_pkgs),
-                        total=len(work_pkgs),
-                        disable=self.no_progress_bar,
-                    ),
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Fuzzy matching documents", total=len(work_pkgs))
+            if self.process_count == 1:
+                results = []
+                for work in work_pkgs:
+                    results.append(_process_and_match(work))
+                    progress.update(task, advance=1)
+            else:  # pragma: no cover
+                with multiprocessing.Pool(processes=self.process_count) as pool:
+                    results = []
+                    for result in pool.imap_unordered(_process_and_match, work_pkgs):
+                        results.append(result)
+                        progress.update(task, advance=1)
 
         # Check results
         messages = []
@@ -5,10 +5,10 @@ import tempfile
 from collections.abc import Generator
 from contextlib import contextmanager
 from pathlib import Path
+from typing import TYPE_CHECKING
 from zipfile import ZipFile
 from zipfile import is_zipfile
 
-import tqdm
 from django.conf import settings
 from django.contrib.auth.models import Permission
 from django.contrib.auth.models import User
@@ -23,6 +23,11 @@ from django.db import transaction
 from django.db.models.signals import m2m_changed
 from django.db.models.signals import post_save
 from filelock import FileLock
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.file_handling import create_source_path_directory
 from documents.management.commands.mixins import CryptMixin
@@ -365,58 +370,72 @@ class Command(CryptMixin, BaseCommand):
             filter(lambda r: r["model"] == "documents.document", self.manifest),
         )
 
-        for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
-            document = Document.objects.get(pk=record["pk"])
-
-            doc_file = record[EXPORTER_FILE_NAME]
-            document_path = self.source / doc_file
-
-            if EXPORTER_THUMBNAIL_NAME in record:
-                thumb_file = record[EXPORTER_THUMBNAIL_NAME]
-                thumbnail_path = (self.source / thumb_file).resolve()
-            else:
-                thumbnail_path = None
-
-            if EXPORTER_ARCHIVE_NAME in record:
-                archive_file = record[EXPORTER_ARCHIVE_NAME]
-                archive_path = self.source / archive_file
-            else:
-                archive_path = None
-
-            with FileLock(settings.MEDIA_LOCK):
-                if Path(document.source_path).is_file():
-                    raise FileExistsError(document.source_path)
-
-                create_source_path_directory(document.source_path)
-
-                copy_file_with_basic_stats(document_path, document.source_path)
-
-                if thumbnail_path:
-                    if thumbnail_path.suffix in {".png", ".PNG"}:
-                        run_convert(
-                            density=300,
-                            scale="500x5000>",
-                            alpha="remove",
-                            strip=True,
-                            trim=False,
-                            auto_orient=True,
-                            input_file=f"{thumbnail_path}[0]",
-                            output_file=str(document.thumbnail_path),
-                        )
-                    else:
-                        copy_file_with_basic_stats(
-                            thumbnail_path,
-                            document.thumbnail_path,
-                        )
-
-                if archive_path:
-                    create_source_path_directory(document.archive_path)
-                    # TODO: this assumes that the export is valid and
-                    # archive_filename is present on all documents with
-                    # archived files
-                    copy_file_with_basic_stats(archive_path, document.archive_path)
-
-            document.save()
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task(
+                "Importing documents",
+                total=len(manifest_documents),
+            )
+            for record in manifest_documents:
+                document = Document.objects.get(pk=record["pk"])
+
+                doc_file = record[EXPORTER_FILE_NAME]
+                document_path = self.source / doc_file
+
+                if EXPORTER_THUMBNAIL_NAME in record:
+                    thumb_file = record[EXPORTER_THUMBNAIL_NAME]
+                    thumbnail_path = (self.source / thumb_file).resolve()
+                else:
+                    thumbnail_path = None
+
+                if EXPORTER_ARCHIVE_NAME in record:
+                    archive_file = record[EXPORTER_ARCHIVE_NAME]
+                    archive_path = self.source / archive_file
+                else:
+                    archive_path = None
+
+                with FileLock(settings.MEDIA_LOCK):
+                    if Path(document.source_path).is_file():
+                        raise FileExistsError(document.source_path)
+
+                    create_source_path_directory(document.source_path)
+
+                    copy_file_with_basic_stats(document_path, document.source_path)
+
+                    if thumbnail_path:
+                        if thumbnail_path.suffix in {".png", ".PNG"}:
+                            run_convert(
+                                density=300,
+                                scale="500x5000>",
+                                alpha="remove",
+                                strip=True,
+                                trim=False,
+                                auto_orient=True,
+                                input_file=f"{thumbnail_path}[0]",
+                                output_file=str(document.thumbnail_path),
+                            )
+                        else:
+                            copy_file_with_basic_stats(
+                                thumbnail_path,
+                                document.thumbnail_path,
+                            )
+
+                    if archive_path:
+                        if TYPE_CHECKING:
+                            assert document.archive_path is not None
+                        create_source_path_directory(document.archive_path)
+                        # TODO: this assumes that the export is valid and
+                        # archive_filename is present on all documents with
+                        # archived files
+                        copy_file_with_basic_stats(archive_path, document.archive_path)
+
+                document.save()
+                progress.update(task, advance=1)
 
     def decrypt_secret_fields(self) -> None:
         """
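The added `if TYPE_CHECKING: assert document.archive_path is not None` is a zero-runtime-cost narrowing hint: mypy treats `TYPE_CHECKING` as always true, so the assert narrows `Path | None` to `Path` for the calls below it, which is presumably what drops the two importer `[arg-type]` entries from the mypy baseline above. A standalone sketch of the idiom (the names here are illustrative, not taken from the diff):

```python
from pathlib import Path
from typing import TYPE_CHECKING


class Document:
    # Illustrative stand-in for the model attribute with an Optional type.
    archive_path: Path | None = None


def restore_archive(doc: Document, archive_file: Path | None) -> None:
    if archive_file:
        if TYPE_CHECKING:
            # mypy analyzes this block (TYPE_CHECKING is True for it) and
            # narrows doc.archive_path from "Path | None" to "Path"; at
            # runtime TYPE_CHECKING is False, so the assert never executes.
            assert doc.archive_path is not None
        print(doc.archive_path.resolve())
```

The trade-off mirrors the existing TODO comment: the code still trusts the export to be valid, but the type checker is now told so explicitly instead of being silenced in a baseline file.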
@@ -1,8 +1,12 @@
 import logging
 
-import tqdm
 from django.core.management.base import BaseCommand
 from django.db.models.signals import post_save
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.management.commands.mixins import ProgressBarMixin
 from documents.models import Document
@@ -18,8 +22,15 @@ class Command(ProgressBarMixin, BaseCommand):
         self.handle_progress_bar_mixin(**options)
         logging.getLogger().handlers[0].level = logging.ERROR
 
-        for document in tqdm.tqdm(
-            Document.objects.all(),
-            disable=self.no_progress_bar,
-        ):
-            post_save.send(Document, instance=document, created=False)
+        documents = Document.objects.all()
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Renaming documents", total=documents.count())
+            for document in documents:
+                post_save.send(Document, instance=document, created=False)
+                progress.update(task, advance=1)
@@ -1,7 +1,11 @@
 import logging
 
-import tqdm
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.classifier import load_classifier
 from documents.management.commands.mixins import ProgressBarMixin
@@ -84,53 +88,62 @@ class Command(ProgressBarMixin, BaseCommand):
 
         classifier = load_classifier()
 
-        for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
-            if options["correspondent"]:
-                set_correspondent(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
-
-            if options["document_type"]:
-                set_document_type(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
-
-            if options["tags"]:
-                set_tags(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
-            if options["storage_path"]:
-                set_storage_path(
-                    sender=None,
-                    document=document,
-                    classifier=classifier,
-                    replace=options["overwrite"],
-                    use_first=options["use_first"],
-                    suggest=options["suggest"],
-                    base_url=options["base_url"],
-                    stdout=self.stdout,
-                    style_func=self.style,
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Retagging documents", total=documents.count())
+            for document in documents:
+                if options["correspondent"]:
+                    set_correspondent(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+
+                if options["document_type"]:
+                    set_document_type(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+
+                if options["tags"]:
+                    set_tags(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+                if options["storage_path"]:
+                    set_storage_path(
+                        sender=None,
+                        document=document,
+                        classifier=classifier,
+                        replace=options["overwrite"],
+                        use_first=options["use_first"],
+                        suggest=options["suggest"],
+                        base_url=options["base_url"],
+                        stdout=self.stdout,
+                        style_func=self.style,
+                    )
+                progress.update(task, advance=1)
@@ -2,9 +2,13 @@ import logging
 import multiprocessing
 import shutil
 
-import tqdm
 from django import db
 from django.core.management.base import BaseCommand
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.management.commands.mixins import MultiProcessMixin
 from documents.management.commands.mixins import ProgressBarMixin
@@ -70,15 +74,19 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
         # with postgres.
         db.connections.close_all()
 
-        if self.process_count == 1:
-            for doc_id in ids:
-                _process_document(doc_id)
-        else:  # pragma: no cover
-            with multiprocessing.Pool(processes=self.process_count) as pool:
-                list(
-                    tqdm.tqdm(
-                        pool.imap_unordered(_process_document, ids),
-                        total=len(ids),
-                        disable=self.no_progress_bar,
-                    ),
-                )
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=self.no_progress_bar,
+        ) as progress:
+            task = progress.add_task("Generating thumbnails", total=len(ids))
+            if self.process_count == 1:
+                for doc_id in ids:
+                    _process_document(doc_id)
+                    progress.update(task, advance=1)
+            else:  # pragma: no cover
+                with multiprocessing.Pool(processes=self.process_count) as pool:
+                    for _ in pool.imap_unordered(_process_document, ids):
+                        progress.update(task, advance=1)
@@ -1,7 +1,12 @@
 from auditlog.models import LogEntry
 from django.core.management.base import BaseCommand
 from django.db import transaction
-from tqdm import tqdm
+from rich.console import Console
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.management.commands.mixins import ProgressBarMixin
 
@@ -18,22 +23,37 @@ class Command(BaseCommand, ProgressBarMixin):
 
     def handle(self, **options):
         self.handle_progress_bar_mixin(**options)
+        console = Console()
         with transaction.atomic():
-            for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
-                model_class = log_entry.content_type.model_class()
-                # use global_objects for SoftDeleteModel
-                objects = (
-                    model_class.global_objects
-                    if hasattr(model_class, "global_objects")
-                    else model_class.objects
-                )
-                if (
-                    log_entry.object_id
-                    and not objects.filter(pk=log_entry.object_id).exists()
-                ):
-                    log_entry.delete()
-                    tqdm.write(
-                        self.style.NOTICE(
-                            f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
-                        ),
-                    )
+            log_entries = LogEntry.objects.all()
+            with Progress(
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                TimeRemainingColumn(),
+                console=console,
+                disable=self.no_progress_bar,
+            ) as progress:
+                task = progress.add_task(
+                    "Pruning audit logs",
+                    total=log_entries.count(),
+                )
+                for log_entry in log_entries:
+                    model_class = log_entry.content_type.model_class()
+                    # use global_objects for SoftDeleteModel
+                    objects = (
+                        model_class.global_objects
+                        if hasattr(model_class, "global_objects")
+                        else model_class.objects
+                    )
+                    if (
+                        log_entry.object_id
+                        and not objects.filter(pk=log_entry.object_id).exists()
+                    ):
+                        log_entry.delete()
+                        console.print(
+                            self.style.NOTICE(
+                                f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
+                            ),
+                        )
+                    progress.update(task, advance=1)
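One detail worth noting in this hunk: the same `Console` instance is passed to `Progress(console=console)`, and deletions are reported through `console.print` instead of `tqdm.write`. When the messages go through the console that `Progress` renders on, rich can place them above the live bar rather than corrupting it, which is the rich counterpart of `tqdm.write`. A hedged, runnable sketch of that interaction (the loop body is a placeholder):

```python
import time

from rich.console import Console
from rich.progress import Progress

console = Console()
# Default columns are fine for a demo; the diff uses an explicit column set.
with Progress(console=console) as progress:
    task = progress.add_task("Pruning", total=5)
    for i in range(5):
        time.sleep(0.1)  # placeholder work
        # Printing through the console attached to Progress keeps the
        # message above the live bar instead of interleaving with it.
        console.print(f"Deleted entry #{i}")
        progress.update(task, advance=1)
```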
@@ -8,7 +8,11 @@ from typing import Final
 from celery import states
 from django.conf import settings
 from django.utils import timezone
-from tqdm import tqdm
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.models import Document
 from documents.models import PaperlessTask
@@ -92,76 +96,99 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
     if logo_file in present_files:
         present_files.remove(logo_file)
 
-    for doc in tqdm(Document.global_objects.all(), disable=not progress):
-        # Check sanity of the thumbnail
-        thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
-        if not thumbnail_path.exists() or not thumbnail_path.is_file():
-            messages.error(doc.pk, "Thumbnail of document does not exist.")
-        else:
-            if thumbnail_path in present_files:
-                present_files.remove(thumbnail_path)
-            try:
-                _ = thumbnail_path.read_bytes()
-            except OSError as e:
-                messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
-
-        # Check sanity of the original file
-        # TODO: extract method
-        source_path: Final[Path] = Path(doc.source_path).resolve()
-        if not source_path.exists() or not source_path.is_file():
-            messages.error(doc.pk, "Original of document does not exist.")
-        else:
-            if source_path in present_files:
-                present_files.remove(source_path)
-            try:
-                checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
-            except OSError as e:
-                messages.error(doc.pk, f"Cannot read original file of document: {e}")
-            else:
-                if checksum != doc.checksum:
-                    messages.error(
-                        doc.pk,
-                        "Checksum mismatch. "
-                        f"Stored: {doc.checksum}, actual: {checksum}.",
-                    )
-
-        # Check sanity of the archive file.
-        if doc.archive_checksum is not None and doc.archive_filename is None:
-            messages.error(
-                doc.pk,
-                "Document has an archive file checksum, but no archive filename.",
-            )
-        elif doc.archive_checksum is None and doc.archive_filename is not None:
-            messages.error(
-                doc.pk,
-                "Document has an archive file, but its checksum is missing.",
-            )
-        elif doc.has_archive_version:
-            archive_path: Final[Path] = Path(doc.archive_path).resolve()
-            if not archive_path.exists() or not archive_path.is_file():
-                messages.error(doc.pk, "Archived version of document does not exist.")
-            else:
-                if archive_path in present_files:
-                    present_files.remove(archive_path)
-                try:
-                    checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
-                except OSError as e:
-                    messages.error(
-                        doc.pk,
-                        f"Cannot read archive file of document : {e}",
-                    )
-                else:
-                    if checksum != doc.archive_checksum:
-                        messages.error(
-                            doc.pk,
-                            "Checksum mismatch of archived document. "
-                            f"Stored: {doc.archive_checksum}, "
-                            f"actual: {checksum}.",
-                        )
-
-        # other document checks
-        if not doc.content:
-            messages.info(doc.pk, "Document contains no OCR data")
+    documents = Document.global_objects.all()
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+        disable=not progress,
+    ) as progress_bar:
+        task = progress_bar.add_task(
+            "Checking document sanity",
+            total=documents.count(),
+        )
+        for doc in documents:
+            # Check sanity of the thumbnail
+            thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
+            if not thumbnail_path.exists() or not thumbnail_path.is_file():
+                messages.error(doc.pk, "Thumbnail of document does not exist.")
+            else:
+                if thumbnail_path in present_files:
+                    present_files.remove(thumbnail_path)
+                try:
+                    _ = thumbnail_path.read_bytes()
+                except OSError as e:
+                    messages.error(
+                        doc.pk,
+                        f"Cannot read thumbnail file of document: {e}",
+                    )
+
+            # Check sanity of the original file
+            # TODO: extract method
+            source_path: Final[Path] = Path(doc.source_path).resolve()
+            if not source_path.exists() or not source_path.is_file():
+                messages.error(doc.pk, "Original of document does not exist.")
+            else:
+                if source_path in present_files:
+                    present_files.remove(source_path)
+                try:
+                    checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
+                except OSError as e:
+                    messages.error(
+                        doc.pk,
+                        f"Cannot read original file of document: {e}",
+                    )
+                else:
+                    if checksum != doc.checksum:
+                        messages.error(
+                            doc.pk,
+                            "Checksum mismatch. "
+                            f"Stored: {doc.checksum}, actual: {checksum}.",
+                        )
+
+            # Check sanity of the archive file.
+            if doc.archive_checksum is not None and doc.archive_filename is None:
+                messages.error(
+                    doc.pk,
+                    "Document has an archive file checksum, but no archive filename.",
+                )
+            elif doc.archive_checksum is None and doc.archive_filename is not None:
+                messages.error(
+                    doc.pk,
+                    "Document has an archive file, but its checksum is missing.",
+                )
+            elif doc.has_archive_version:
+                archive_path: Final[Path] = Path(doc.archive_path).resolve()
+                if not archive_path.exists() or not archive_path.is_file():
+                    messages.error(
+                        doc.pk,
+                        "Archived version of document does not exist.",
+                    )
+                else:
+                    if archive_path in present_files:
+                        present_files.remove(archive_path)
+                    try:
+                        checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
+                    except OSError as e:
+                        messages.error(
+                            doc.pk,
+                            f"Cannot read archive file of document : {e}",
+                        )
+                    else:
+                        if checksum != doc.archive_checksum:
+                            messages.error(
+                                doc.pk,
+                                "Checksum mismatch of archived document. "
+                                f"Stored: {doc.archive_checksum}, "
+                                f"actual: {checksum}.",
+                            )
+
+            # other document checks
+            if not doc.content:
+                messages.info(doc.pk, "Document contains no OCR data")
+
+            progress_bar.update(task, advance=1)
 
     for extra_file in present_files:
         messages.warning(None, f"Orphaned file in media dir: {extra_file}")
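Note the naming in this hunk: `check_sanity` already takes a boolean keyword named `progress`, so the rich context manager is bound as `progress_bar` and the bar is disabled with `disable=not progress`. A condensed sketch of the collision this presumably avoids (illustrative only, not the project's signature):

```python
from rich.progress import Progress


def check_sanity(*, progress: bool = False) -> None:
    # `progress` is the caller-facing flag; the rich object gets a distinct
    # name so the flag stays readable (and usable) inside the block.
    with Progress(disable=not progress) as progress_bar:
        task = progress_bar.add_task("Checking document sanity", total=3)
        for _ in range(3):
            progress_bar.update(task, advance=1)


check_sanity(progress=True)
```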
@@ -8,7 +8,6 @@ from pathlib import Path
 from tempfile import TemporaryDirectory
 from tempfile import mkstemp
 
-import tqdm
 from celery import Task
 from celery import shared_task
 from celery import states
@@ -19,6 +18,11 @@ from django.db import transaction
 from django.db.models.signals import post_save
 from django.utils import timezone
 from filelock import FileLock
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 from whoosh.writing import AsyncWriter
 
 from documents import index
@@ -83,9 +87,20 @@ def index_reindex(*, progress_bar_disable=False) -> None:
 
     ix = index.open_index(recreate=True)
 
-    with AsyncWriter(ix) as writer:
-        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
-            index.update_document(writer, document)
+    with (
+        AsyncWriter(ix) as writer,
+        Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=progress_bar_disable,
+        ) as progress,
+    ):
+        task = progress.add_task("Reindexing documents", total=documents.count())
+        for document in documents:
+            index.update_document(writer, document)
+            progress.update(task, advance=1)
 
 
 @shared_task
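Stacking `AsyncWriter` and `Progress` in one parenthesized `with` statement relies on the multi-context-manager syntax added in Python 3.10, and is equivalent to nesting the two blocks. A minimal illustration with stand-in context managers:

```python
from contextlib import contextmanager


@contextmanager
def managed(name: str):
    print(f"enter {name}")
    try:
        yield name
    finally:
        print(f"exit {name}")


# Python 3.10+: parentheses let multiple context managers span lines...
with (
    managed("writer") as writer,
    managed("progress") as progress,
):
    print(writer, progress)

# ...and behave exactly like the nested form:
with managed("writer") as writer:
    with managed("progress") as progress:
        print(writer, progress)
```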
@@ -5,7 +5,6 @@ from pathlib import Path
 
 import faiss
 import llama_index.core.settings as llama_settings
-import tqdm
 from celery import states
 from django.conf import settings
 from django.utils import timezone
@@ -22,6 +21,11 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
 from llama_index.core.storage.index_store import SimpleIndexStore
 from llama_index.core.text_splitter import TokenTextSplitter
 from llama_index.vector_stores.faiss import FaissVectorStore
+from rich.progress import BarColumn
+from rich.progress import Progress
+from rich.progress import TaskProgressColumn
+from rich.progress import TextColumn
+from rich.progress import TimeRemainingColumn
 
 from documents.models import Document
 from documents.models import PaperlessTask
@@ -176,9 +180,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
         embed_model = get_embedding_model()
         llama_settings.Settings.embed_model = embed_model
         storage_context = get_or_create_storage_context(rebuild=True)
-        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
-            document_nodes = build_document_node(document)
-            nodes.extend(document_nodes)
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            TimeRemainingColumn(),
+            disable=progress_bar_disable,
+        ) as progress:
+            task = progress.add_task("Building document nodes", total=documents.count())
+            for document in documents:
+                document_nodes = build_document_node(document)
+                nodes.extend(document_nodes)
+                progress.update(task, advance=1)
 
         index = VectorStoreIndex(
             nodes=nodes,
@@ -196,23 +209,33 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
         for node in index.docstore.get_nodes(all_node_ids)
     }
 
-    for document in tqdm.tqdm(documents, disable=progress_bar_disable):
-        doc_id = str(document.id)
-        document_modified = document.modified.isoformat()
-
-        if doc_id in existing_nodes:
-            node = existing_nodes[doc_id]
-            node_modified = node.metadata.get("modified")
-
-            if node_modified == document_modified:
-                continue
-
-            # Again, delete from docstore, FAISS IndexFlatL2 are append-only
-            index.docstore.delete_document(node.node_id)
-            nodes.extend(build_document_node(document))
-        else:
-            # New document, add it
-            nodes.extend(build_document_node(document))
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        TimeRemainingColumn(),
+        disable=progress_bar_disable,
+    ) as progress:
+        task = progress.add_task("Updating index nodes", total=documents.count())
+        for document in documents:
+            doc_id = str(document.id)
+            document_modified = document.modified.isoformat()
+
+            if doc_id in existing_nodes:
+                node = existing_nodes[doc_id]
+                node_modified = node.metadata.get("modified")
+
+                if node_modified == document_modified:
+                    progress.update(task, advance=1)
+                    continue
+
+                # Again, delete from docstore, FAISS IndexFlatL2 are append-only
+                index.docstore.delete_document(node.node_id)
+                nodes.extend(build_document_node(document))
+            else:
+                # New document, add it
+                nodes.extend(build_document_node(document))
+            progress.update(task, advance=1)
 
     if nodes:
         msg = "LLM index updated successfully."
@@ -76,6 +76,7 @@ def test_update_llm_index(
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([real_document])
+    mock_queryset.count.return_value = 1
     mock_all.return_value = mock_queryset
     indexing.update_llm_index(rebuild=True)

@@ -97,6 +98,7 @@ def test_update_llm_index_removes_meta(
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([real_document])
+    mock_queryset.count.return_value = 1
     mock_all.return_value = mock_queryset
     indexing.update_llm_index(rebuild=True)

@@ -129,6 +131,7 @@ def test_update_llm_index_partial_update(
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([real_document, doc2])
+    mock_queryset.count.return_value = 2
     mock_all.return_value = mock_queryset
 
     indexing.update_llm_index(rebuild=True)

@@ -149,6 +152,7 @@ def test_update_llm_index_partial_update(
     mock_queryset = MagicMock()
     mock_queryset.exists.return_value = True
     mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
+    mock_queryset.count.return_value = 3
     mock_all.return_value = mock_queryset
 
     # assert logs "Updating LLM index with %d new nodes and removing %d old nodes."
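The test changes follow directly from the loop rewrite: each `Progress` task's total now comes from `QuerySet.count()` rather than from tqdm consuming the iterator, so the mocked querysets must stub `count` alongside `__iter__`. A simplified sketch of the shape these mocks take (not the actual fixtures):

```python
from unittest.mock import MagicMock

mock_queryset = MagicMock()
mock_queryset.exists.return_value = True
mock_queryset.__iter__.return_value = iter(["doc1", "doc2"])
# add_task(total=...) is now fed by .count(); without this stub the total
# would be a MagicMock and the bar could not compute a percentage.
mock_queryset.count.return_value = 2

assert mock_queryset.count() == 2
assert list(mock_queryset) == ["doc1", "doc2"]
```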
uv.lock (generated)
@@ -3083,7 +3083,6 @@ dependencies = [
     { name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" },
     { name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'linux'" },
-    { name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "watchfiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "whitenoise", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "whoosh-reloaded", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -3166,7 +3165,6 @@ typing = [
     { name = "types-pytz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "types-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
     { name = "types-setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-    { name = "types-tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 ]
 
 [package.metadata]
@@ -3237,7 +3235,6 @@ requires-dist = [
     { name = "setproctitle", specifier = "~=1.3.4" },
     { name = "tika-client", specifier = "~=0.10.0" },
     { name = "torch", specifier = "~=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
-    { name = "tqdm", specifier = "~=4.67.1" },
     { name = "watchfiles", specifier = ">=1.1.1" },
     { name = "whitenoise", specifier = "~=6.11" },
     { name = "whoosh-reloaded", specifier = ">=2.7.5" },
@@ -3304,7 +3301,6 @@ typing = [
     { name = "types-pytz" },
     { name = "types-redis" },
     { name = "types-setuptools" },
-    { name = "types-tqdm" },
 ]
 
 [[package]]
@@ -5584,18 +5580,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2b/7f/016dc5cc718ec6ccaa84fb73ed409ef1c261793fd5e637cdfaa18beb40a9/types_setuptools-80.10.0.20260124-py3-none-any.whl", hash = "sha256:efed7e044f01adb9c2806c7a8e1b6aa3656b8e382379b53d5f26ee3db24d4c01", size = 64333, upload-time = "2026-01-24T03:18:38.344Z" },
 ]
 
-[[package]]
-name = "types-tqdm"
-version = "4.67.3.20260205"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "types-requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/53/46/790b9872523a48163bdda87d47849b4466017640e5259d06eed539340afd/types_tqdm-4.67.3.20260205.tar.gz", hash = "sha256:f3023682d4aa3bbbf908c8c6bb35f35692d319460d9bbd3e646e8852f3dd9f85", size = 17597, upload-time = "2026-02-05T04:03:19.721Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/da/7f761868dbaa328392356fab30c18ab90d14cce86b269e7e63328f29d4a3/types_tqdm-4.67.3.20260205-py3-none-any.whl", hash = "sha256:85c31731e81dc3c5cecc34c6c8b2e5166fafa722468f58840c2b5ac6a8c5c173", size = 23894, upload-time = "2026-02-05T04:03:18.48Z" },
-]
-
 [[package]]
 name = "types-webencodings"
 version = "0.5.0.20251108"