mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-02-09 23:49:29 -06:00
Compare commits
5 Commits
l10n_dev
...
chore/swit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
59a25f8601 | ||
|
|
5950f22e87 | ||
|
|
40ff58ad39 | ||
|
|
914362224c | ||
|
|
e0b45539a6 |
@@ -277,8 +277,6 @@ src/documents/management/commands/document_exporter.py:0: error: Skipping analyz
|
|||||||
src/documents/management/commands/document_exporter.py:0: error: Skipping analyzing "auditlog.models": module is installed, but missing library stubs or py.typed marker [import-untyped]
|
src/documents/management/commands/document_exporter.py:0: error: Skipping analyzing "auditlog.models": module is installed, but missing library stubs or py.typed marker [import-untyped]
|
||||||
src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
|
src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||||
src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
|
src/documents/management/commands/document_fuzzy_match.py:0: error: Function is missing a type annotation [no-untyped-def]
|
||||||
src/documents/management/commands/document_importer.py:0: error: Argument 1 to "create_source_path_directory" has incompatible type "Path | None"; expected "Path" [arg-type]
|
|
||||||
src/documents/management/commands/document_importer.py:0: error: Argument 2 to "copy_file_with_basic_stats" has incompatible type "Path | None"; expected "Path | str" [arg-type]
|
|
||||||
src/documents/management/commands/document_importer.py:0: error: Attribute "version" already defined on line 0 [no-redef]
|
src/documents/management/commands/document_importer.py:0: error: Attribute "version" already defined on line 0 [no-redef]
|
||||||
src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||||
src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
src/documents/management/commands/document_importer.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
|
||||||
|
|||||||
@@ -77,7 +77,6 @@ dependencies = [
|
|||||||
"setproctitle~=1.3.4",
|
"setproctitle~=1.3.4",
|
||||||
"tika-client~=0.10.0",
|
"tika-client~=0.10.0",
|
||||||
"torch~=2.10.0",
|
"torch~=2.10.0",
|
||||||
"tqdm~=4.67.1",
|
|
||||||
"watchfiles>=1.1.1",
|
"watchfiles>=1.1.1",
|
||||||
"whitenoise~=6.11",
|
"whitenoise~=6.11",
|
||||||
"whoosh-reloaded>=2.7.5",
|
"whoosh-reloaded>=2.7.5",
|
||||||
@@ -150,7 +149,6 @@ typing = [
|
|||||||
"types-pytz",
|
"types-pytz",
|
||||||
"types-redis",
|
"types-redis",
|
||||||
"types-setuptools",
|
"types-setuptools",
|
||||||
"types-tqdm",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.uv]
|
[tool.uv]
|
||||||
|
|||||||
@@ -1,10 +1,14 @@
|
|||||||
import logging
|
import logging
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
|
||||||
import tqdm
|
|
||||||
from django import db
|
from django import db
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.management.commands.mixins import MultiProcessMixin
|
from documents.management.commands.mixins import MultiProcessMixin
|
||||||
from documents.management.commands.mixins import ProgressBarMixin
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
@@ -75,20 +79,24 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
try:
|
try:
|
||||||
logging.getLogger().handlers[0].level = logging.ERROR
|
logging.getLogger().handlers[0].level = logging.ERROR
|
||||||
|
|
||||||
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=self.no_progress_bar,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task("Archiving documents", total=len(document_ids))
|
||||||
if self.process_count == 1:
|
if self.process_count == 1:
|
||||||
for doc_id in document_ids:
|
for doc_id in document_ids:
|
||||||
update_document_content_maybe_archive_file(doc_id)
|
update_document_content_maybe_archive_file(doc_id)
|
||||||
|
progress.update(task, advance=1)
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
with multiprocessing.Pool(self.process_count) as pool:
|
with multiprocessing.Pool(self.process_count) as pool:
|
||||||
list(
|
for _ in pool.imap_unordered(
|
||||||
tqdm.tqdm(
|
|
||||||
pool.imap_unordered(
|
|
||||||
update_document_content_maybe_archive_file,
|
update_document_content_maybe_archive_file,
|
||||||
document_ids,
|
document_ids,
|
||||||
),
|
):
|
||||||
total=len(document_ids),
|
progress.update(task, advance=1)
|
||||||
disable=self.no_progress_bar,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
self.stdout.write(self.style.NOTICE("Aborting..."))
|
self.stdout.write(self.style.NOTICE("Aborting..."))
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ import tempfile
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import tqdm
|
|
||||||
from allauth.mfa.models import Authenticator
|
from allauth.mfa.models import Authenticator
|
||||||
from allauth.socialaccount.models import SocialAccount
|
from allauth.socialaccount.models import SocialAccount
|
||||||
from allauth.socialaccount.models import SocialApp
|
from allauth.socialaccount.models import SocialApp
|
||||||
@@ -24,6 +23,11 @@ from django.utils import timezone
|
|||||||
from filelock import FileLock
|
from filelock import FileLock
|
||||||
from guardian.models import GroupObjectPermission
|
from guardian.models import GroupObjectPermission
|
||||||
from guardian.models import UserObjectPermission
|
from guardian.models import UserObjectPermission
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from django.db.models import QuerySet
|
from django.db.models import QuerySet
|
||||||
@@ -309,11 +313,18 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
document_manifest = manifest_dict["documents"]
|
document_manifest = manifest_dict["documents"]
|
||||||
|
|
||||||
# 3. Export files from each document
|
# 3. Export files from each document
|
||||||
for index, document_dict in tqdm.tqdm(
|
with Progress(
|
||||||
enumerate(document_manifest),
|
TextColumn("[progress.description]{task.description}"),
|
||||||
total=len(document_manifest),
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
disable=self.no_progress_bar,
|
disable=self.no_progress_bar,
|
||||||
):
|
) as progress:
|
||||||
|
task = progress.add_task(
|
||||||
|
"Exporting documents",
|
||||||
|
total=len(document_manifest),
|
||||||
|
)
|
||||||
|
for index, document_dict in enumerate(document_manifest):
|
||||||
document = document_map[document_dict["pk"]]
|
document = document_map[document_dict["pk"]]
|
||||||
|
|
||||||
# 3.1. generate a unique filename
|
# 3.1. generate a unique filename
|
||||||
@@ -334,7 +345,9 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if self.split_manifest:
|
if self.split_manifest:
|
||||||
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
|
manifest_name = base_name.with_name(
|
||||||
|
f"{base_name.stem}-manifest.json",
|
||||||
|
)
|
||||||
if self.use_folder_prefix:
|
if self.use_folder_prefix:
|
||||||
manifest_name = Path("json") / manifest_name
|
manifest_name = Path("json") / manifest_name
|
||||||
manifest_name = (self.target / manifest_name).resolve()
|
manifest_name = (self.target / manifest_name).resolve()
|
||||||
@@ -357,6 +370,7 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
content,
|
content,
|
||||||
manifest_name,
|
manifest_name,
|
||||||
)
|
)
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|
||||||
# These were exported already
|
# These were exported already
|
||||||
if self.split_manifest:
|
if self.split_manifest:
|
||||||
|
|||||||
@@ -3,9 +3,13 @@ import multiprocessing
|
|||||||
from typing import Final
|
from typing import Final
|
||||||
|
|
||||||
import rapidfuzz
|
import rapidfuzz
|
||||||
import tqdm
|
|
||||||
from django.core.management import BaseCommand
|
from django.core.management import BaseCommand
|
||||||
from django.core.management import CommandError
|
from django.core.management import CommandError
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.management.commands.mixins import MultiProcessMixin
|
from documents.management.commands.mixins import MultiProcessMixin
|
||||||
from documents.management.commands.mixins import ProgressBarMixin
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
@@ -106,19 +110,25 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
work_pkgs.append(_WorkPackage(first_doc, second_doc))
|
work_pkgs.append(_WorkPackage(first_doc, second_doc))
|
||||||
|
|
||||||
# Don't spin up a pool of 1 process
|
# Don't spin up a pool of 1 process
|
||||||
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=self.no_progress_bar,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task("Fuzzy matching documents", total=len(work_pkgs))
|
||||||
if self.process_count == 1:
|
if self.process_count == 1:
|
||||||
results = []
|
results = []
|
||||||
for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
|
for work in work_pkgs:
|
||||||
results.append(_process_and_match(work))
|
results.append(_process_and_match(work))
|
||||||
|
progress.update(task, advance=1)
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
with multiprocessing.Pool(processes=self.process_count) as pool:
|
with multiprocessing.Pool(processes=self.process_count) as pool:
|
||||||
results = list(
|
results = []
|
||||||
tqdm.tqdm(
|
for result in pool.imap_unordered(_process_and_match, work_pkgs):
|
||||||
pool.imap_unordered(_process_and_match, work_pkgs),
|
results.append(result)
|
||||||
total=len(work_pkgs),
|
progress.update(task, advance=1)
|
||||||
disable=self.no_progress_bar,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check results
|
# Check results
|
||||||
messages = []
|
messages = []
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ import tempfile
|
|||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
from zipfile import is_zipfile
|
from zipfile import is_zipfile
|
||||||
|
|
||||||
import tqdm
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.contrib.auth.models import Permission
|
from django.contrib.auth.models import Permission
|
||||||
from django.contrib.auth.models import User
|
from django.contrib.auth.models import User
|
||||||
@@ -23,6 +23,11 @@ from django.db import transaction
|
|||||||
from django.db.models.signals import m2m_changed
|
from django.db.models.signals import m2m_changed
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
from filelock import FileLock
|
from filelock import FileLock
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.file_handling import create_source_path_directory
|
from documents.file_handling import create_source_path_directory
|
||||||
from documents.management.commands.mixins import CryptMixin
|
from documents.management.commands.mixins import CryptMixin
|
||||||
@@ -365,7 +370,18 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
filter(lambda r: r["model"] == "documents.document", self.manifest),
|
filter(lambda r: r["model"] == "documents.document", self.manifest),
|
||||||
)
|
)
|
||||||
|
|
||||||
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=self.no_progress_bar,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task(
|
||||||
|
"Importing documents",
|
||||||
|
total=len(manifest_documents),
|
||||||
|
)
|
||||||
|
for record in manifest_documents:
|
||||||
document = Document.objects.get(pk=record["pk"])
|
document = Document.objects.get(pk=record["pk"])
|
||||||
|
|
||||||
doc_file = record[EXPORTER_FILE_NAME]
|
doc_file = record[EXPORTER_FILE_NAME]
|
||||||
@@ -410,6 +426,8 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if archive_path:
|
if archive_path:
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
assert document.archive_path is not None
|
||||||
create_source_path_directory(document.archive_path)
|
create_source_path_directory(document.archive_path)
|
||||||
# TODO: this assumes that the export is valid and
|
# TODO: this assumes that the export is valid and
|
||||||
# archive_filename is present on all documents with
|
# archive_filename is present on all documents with
|
||||||
@@ -417,6 +435,7 @@ class Command(CryptMixin, BaseCommand):
|
|||||||
copy_file_with_basic_stats(archive_path, document.archive_path)
|
copy_file_with_basic_stats(archive_path, document.archive_path)
|
||||||
|
|
||||||
document.save()
|
document.save()
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|
||||||
def decrypt_secret_fields(self) -> None:
|
def decrypt_secret_fields(self) -> None:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import tqdm
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.management.commands.mixins import ProgressBarMixin
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
@@ -18,8 +22,15 @@ class Command(ProgressBarMixin, BaseCommand):
|
|||||||
self.handle_progress_bar_mixin(**options)
|
self.handle_progress_bar_mixin(**options)
|
||||||
logging.getLogger().handlers[0].level = logging.ERROR
|
logging.getLogger().handlers[0].level = logging.ERROR
|
||||||
|
|
||||||
for document in tqdm.tqdm(
|
documents = Document.objects.all()
|
||||||
Document.objects.all(),
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
disable=self.no_progress_bar,
|
disable=self.no_progress_bar,
|
||||||
):
|
) as progress:
|
||||||
|
task = progress.add_task("Renaming documents", total=documents.count())
|
||||||
|
for document in documents:
|
||||||
post_save.send(Document, instance=document, created=False)
|
post_save.send(Document, instance=document, created=False)
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|||||||
@@ -1,7 +1,11 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import tqdm
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.classifier import load_classifier
|
from documents.classifier import load_classifier
|
||||||
from documents.management.commands.mixins import ProgressBarMixin
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
@@ -84,7 +88,15 @@ class Command(ProgressBarMixin, BaseCommand):
|
|||||||
|
|
||||||
classifier = load_classifier()
|
classifier = load_classifier()
|
||||||
|
|
||||||
for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=self.no_progress_bar,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task("Retagging documents", total=documents.count())
|
||||||
|
for document in documents:
|
||||||
if options["correspondent"]:
|
if options["correspondent"]:
|
||||||
set_correspondent(
|
set_correspondent(
|
||||||
sender=None,
|
sender=None,
|
||||||
@@ -134,3 +146,4 @@ class Command(ProgressBarMixin, BaseCommand):
|
|||||||
stdout=self.stdout,
|
stdout=self.stdout,
|
||||||
style_func=self.style,
|
style_func=self.style,
|
||||||
)
|
)
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|||||||
@@ -2,9 +2,13 @@ import logging
|
|||||||
import multiprocessing
|
import multiprocessing
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
import tqdm
|
|
||||||
from django import db
|
from django import db
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.management.commands.mixins import MultiProcessMixin
|
from documents.management.commands.mixins import MultiProcessMixin
|
||||||
from documents.management.commands.mixins import ProgressBarMixin
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
@@ -70,15 +74,19 @@ class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
|
|||||||
# with postgres.
|
# with postgres.
|
||||||
db.connections.close_all()
|
db.connections.close_all()
|
||||||
|
|
||||||
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=self.no_progress_bar,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task("Generating thumbnails", total=len(ids))
|
||||||
if self.process_count == 1:
|
if self.process_count == 1:
|
||||||
for doc_id in ids:
|
for doc_id in ids:
|
||||||
_process_document(doc_id)
|
_process_document(doc_id)
|
||||||
|
progress.update(task, advance=1)
|
||||||
else: # pragma: no cover
|
else: # pragma: no cover
|
||||||
with multiprocessing.Pool(processes=self.process_count) as pool:
|
with multiprocessing.Pool(processes=self.process_count) as pool:
|
||||||
list(
|
for _ in pool.imap_unordered(_process_document, ids):
|
||||||
tqdm.tqdm(
|
progress.update(task, advance=1)
|
||||||
pool.imap_unordered(_process_document, ids),
|
|
||||||
total=len(ids),
|
|
||||||
disable=self.no_progress_bar,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
from auditlog.models import LogEntry
|
from auditlog.models import LogEntry
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
from tqdm import tqdm
|
from rich.console import Console
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.management.commands.mixins import ProgressBarMixin
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
|
|
||||||
@@ -18,8 +23,22 @@ class Command(BaseCommand, ProgressBarMixin):
|
|||||||
|
|
||||||
def handle(self, **options):
|
def handle(self, **options):
|
||||||
self.handle_progress_bar_mixin(**options)
|
self.handle_progress_bar_mixin(**options)
|
||||||
|
console = Console()
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
for log_entry in tqdm(LogEntry.objects.all(), disable=self.no_progress_bar):
|
log_entries = LogEntry.objects.all()
|
||||||
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
console=console,
|
||||||
|
disable=self.no_progress_bar,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task(
|
||||||
|
"Pruning audit logs",
|
||||||
|
total=log_entries.count(),
|
||||||
|
)
|
||||||
|
for log_entry in log_entries:
|
||||||
model_class = log_entry.content_type.model_class()
|
model_class = log_entry.content_type.model_class()
|
||||||
# use global_objects for SoftDeleteModel
|
# use global_objects for SoftDeleteModel
|
||||||
objects = (
|
objects = (
|
||||||
@@ -32,8 +51,9 @@ class Command(BaseCommand, ProgressBarMixin):
|
|||||||
and not objects.filter(pk=log_entry.object_id).exists()
|
and not objects.filter(pk=log_entry.object_id).exists()
|
||||||
):
|
):
|
||||||
log_entry.delete()
|
log_entry.delete()
|
||||||
tqdm.write(
|
console.print(
|
||||||
self.style.NOTICE(
|
self.style.NOTICE(
|
||||||
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
|
f"Deleted audit log entry for {model_class.__name__} #{log_entry.object_id}",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|||||||
@@ -8,7 +8,11 @@ from typing import Final
|
|||||||
from celery import states
|
from celery import states
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from tqdm import tqdm
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import PaperlessTask
|
from documents.models import PaperlessTask
|
||||||
@@ -92,7 +96,19 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
|
|||||||
if logo_file in present_files:
|
if logo_file in present_files:
|
||||||
present_files.remove(logo_file)
|
present_files.remove(logo_file)
|
||||||
|
|
||||||
for doc in tqdm(Document.global_objects.all(), disable=not progress):
|
documents = Document.global_objects.all()
|
||||||
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=not progress,
|
||||||
|
) as progress_bar:
|
||||||
|
task = progress_bar.add_task(
|
||||||
|
"Checking document sanity",
|
||||||
|
total=documents.count(),
|
||||||
|
)
|
||||||
|
for doc in documents:
|
||||||
# Check sanity of the thumbnail
|
# Check sanity of the thumbnail
|
||||||
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
|
thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
|
||||||
if not thumbnail_path.exists() or not thumbnail_path.is_file():
|
if not thumbnail_path.exists() or not thumbnail_path.is_file():
|
||||||
@@ -103,7 +119,10 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
|
|||||||
try:
|
try:
|
||||||
_ = thumbnail_path.read_bytes()
|
_ = thumbnail_path.read_bytes()
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")
|
messages.error(
|
||||||
|
doc.pk,
|
||||||
|
f"Cannot read thumbnail file of document: {e}",
|
||||||
|
)
|
||||||
|
|
||||||
# Check sanity of the original file
|
# Check sanity of the original file
|
||||||
# TODO: extract method
|
# TODO: extract method
|
||||||
@@ -116,7 +135,10 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
|
|||||||
try:
|
try:
|
||||||
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
messages.error(doc.pk, f"Cannot read original file of document: {e}")
|
messages.error(
|
||||||
|
doc.pk,
|
||||||
|
f"Cannot read original file of document: {e}",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if checksum != doc.checksum:
|
if checksum != doc.checksum:
|
||||||
messages.error(
|
messages.error(
|
||||||
@@ -139,7 +161,10 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
|
|||||||
elif doc.has_archive_version:
|
elif doc.has_archive_version:
|
||||||
archive_path: Final[Path] = Path(doc.archive_path).resolve()
|
archive_path: Final[Path] = Path(doc.archive_path).resolve()
|
||||||
if not archive_path.exists() or not archive_path.is_file():
|
if not archive_path.exists() or not archive_path.is_file():
|
||||||
messages.error(doc.pk, "Archived version of document does not exist.")
|
messages.error(
|
||||||
|
doc.pk,
|
||||||
|
"Archived version of document does not exist.",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if archive_path in present_files:
|
if archive_path in present_files:
|
||||||
present_files.remove(archive_path)
|
present_files.remove(archive_path)
|
||||||
@@ -163,6 +188,8 @@ def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
|
|||||||
if not doc.content:
|
if not doc.content:
|
||||||
messages.info(doc.pk, "Document contains no OCR data")
|
messages.info(doc.pk, "Document contains no OCR data")
|
||||||
|
|
||||||
|
progress_bar.update(task, advance=1)
|
||||||
|
|
||||||
for extra_file in present_files:
|
for extra_file in present_files:
|
||||||
messages.warning(None, f"Orphaned file in media dir: {extra_file}")
|
messages.warning(None, f"Orphaned file in media dir: {extra_file}")
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ from pathlib import Path
|
|||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
from tempfile import mkstemp
|
from tempfile import mkstemp
|
||||||
|
|
||||||
import tqdm
|
|
||||||
from celery import Task
|
from celery import Task
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
from celery import states
|
from celery import states
|
||||||
@@ -19,6 +18,11 @@ from django.db import transaction
|
|||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from filelock import FileLock
|
from filelock import FileLock
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
from whoosh.writing import AsyncWriter
|
from whoosh.writing import AsyncWriter
|
||||||
|
|
||||||
from documents import index
|
from documents import index
|
||||||
@@ -83,9 +87,20 @@ def index_reindex(*, progress_bar_disable=False) -> None:
|
|||||||
|
|
||||||
ix = index.open_index(recreate=True)
|
ix = index.open_index(recreate=True)
|
||||||
|
|
||||||
with AsyncWriter(ix) as writer:
|
with (
|
||||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
AsyncWriter(ix) as writer,
|
||||||
|
Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=progress_bar_disable,
|
||||||
|
) as progress,
|
||||||
|
):
|
||||||
|
task = progress.add_task("Reindexing documents", total=documents.count())
|
||||||
|
for document in documents:
|
||||||
index.update_document(writer, document)
|
index.update_document(writer, document)
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
import faiss
|
import faiss
|
||||||
import llama_index.core.settings as llama_settings
|
import llama_index.core.settings as llama_settings
|
||||||
import tqdm
|
|
||||||
from celery import states
|
from celery import states
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
@@ -22,6 +21,11 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
|
|||||||
from llama_index.core.storage.index_store import SimpleIndexStore
|
from llama_index.core.storage.index_store import SimpleIndexStore
|
||||||
from llama_index.core.text_splitter import TokenTextSplitter
|
from llama_index.core.text_splitter import TokenTextSplitter
|
||||||
from llama_index.vector_stores.faiss import FaissVectorStore
|
from llama_index.vector_stores.faiss import FaissVectorStore
|
||||||
|
from rich.progress import BarColumn
|
||||||
|
from rich.progress import Progress
|
||||||
|
from rich.progress import TaskProgressColumn
|
||||||
|
from rich.progress import TextColumn
|
||||||
|
from rich.progress import TimeRemainingColumn
|
||||||
|
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
from documents.models import PaperlessTask
|
from documents.models import PaperlessTask
|
||||||
@@ -176,9 +180,18 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
|||||||
embed_model = get_embedding_model()
|
embed_model = get_embedding_model()
|
||||||
llama_settings.Settings.embed_model = embed_model
|
llama_settings.Settings.embed_model = embed_model
|
||||||
storage_context = get_or_create_storage_context(rebuild=True)
|
storage_context = get_or_create_storage_context(rebuild=True)
|
||||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=progress_bar_disable,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task("Building document nodes", total=documents.count())
|
||||||
|
for document in documents:
|
||||||
document_nodes = build_document_node(document)
|
document_nodes = build_document_node(document)
|
||||||
nodes.extend(document_nodes)
|
nodes.extend(document_nodes)
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|
||||||
index = VectorStoreIndex(
|
index = VectorStoreIndex(
|
||||||
nodes=nodes,
|
nodes=nodes,
|
||||||
@@ -196,7 +209,15 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
|||||||
for node in index.docstore.get_nodes(all_node_ids)
|
for node in index.docstore.get_nodes(all_node_ids)
|
||||||
}
|
}
|
||||||
|
|
||||||
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
|
with Progress(
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
BarColumn(),
|
||||||
|
TaskProgressColumn(),
|
||||||
|
TimeRemainingColumn(),
|
||||||
|
disable=progress_bar_disable,
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task("Updating index nodes", total=documents.count())
|
||||||
|
for document in documents:
|
||||||
doc_id = str(document.id)
|
doc_id = str(document.id)
|
||||||
document_modified = document.modified.isoformat()
|
document_modified = document.modified.isoformat()
|
||||||
|
|
||||||
@@ -205,6 +226,7 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
|||||||
node_modified = node.metadata.get("modified")
|
node_modified = node.metadata.get("modified")
|
||||||
|
|
||||||
if node_modified == document_modified:
|
if node_modified == document_modified:
|
||||||
|
progress.update(task, advance=1)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Again, delete from docstore, FAISS IndexFlatL2 are append-only
|
# Again, delete from docstore, FAISS IndexFlatL2 are append-only
|
||||||
@@ -213,6 +235,7 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False) -> str:
|
|||||||
else:
|
else:
|
||||||
# New document, add it
|
# New document, add it
|
||||||
nodes.extend(build_document_node(document))
|
nodes.extend(build_document_node(document))
|
||||||
|
progress.update(task, advance=1)
|
||||||
|
|
||||||
if nodes:
|
if nodes:
|
||||||
msg = "LLM index updated successfully."
|
msg = "LLM index updated successfully."
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ def test_update_llm_index(
|
|||||||
mock_queryset = MagicMock()
|
mock_queryset = MagicMock()
|
||||||
mock_queryset.exists.return_value = True
|
mock_queryset.exists.return_value = True
|
||||||
mock_queryset.__iter__.return_value = iter([real_document])
|
mock_queryset.__iter__.return_value = iter([real_document])
|
||||||
|
mock_queryset.count.return_value = 1
|
||||||
mock_all.return_value = mock_queryset
|
mock_all.return_value = mock_queryset
|
||||||
indexing.update_llm_index(rebuild=True)
|
indexing.update_llm_index(rebuild=True)
|
||||||
|
|
||||||
@@ -97,6 +98,7 @@ def test_update_llm_index_removes_meta(
|
|||||||
mock_queryset = MagicMock()
|
mock_queryset = MagicMock()
|
||||||
mock_queryset.exists.return_value = True
|
mock_queryset.exists.return_value = True
|
||||||
mock_queryset.__iter__.return_value = iter([real_document])
|
mock_queryset.__iter__.return_value = iter([real_document])
|
||||||
|
mock_queryset.count.return_value = 1
|
||||||
mock_all.return_value = mock_queryset
|
mock_all.return_value = mock_queryset
|
||||||
indexing.update_llm_index(rebuild=True)
|
indexing.update_llm_index(rebuild=True)
|
||||||
|
|
||||||
@@ -129,6 +131,7 @@ def test_update_llm_index_partial_update(
|
|||||||
mock_queryset = MagicMock()
|
mock_queryset = MagicMock()
|
||||||
mock_queryset.exists.return_value = True
|
mock_queryset.exists.return_value = True
|
||||||
mock_queryset.__iter__.return_value = iter([real_document, doc2])
|
mock_queryset.__iter__.return_value = iter([real_document, doc2])
|
||||||
|
mock_queryset.count.return_value = 2
|
||||||
mock_all.return_value = mock_queryset
|
mock_all.return_value = mock_queryset
|
||||||
|
|
||||||
indexing.update_llm_index(rebuild=True)
|
indexing.update_llm_index(rebuild=True)
|
||||||
@@ -149,6 +152,7 @@ def test_update_llm_index_partial_update(
|
|||||||
mock_queryset = MagicMock()
|
mock_queryset = MagicMock()
|
||||||
mock_queryset.exists.return_value = True
|
mock_queryset.exists.return_value = True
|
||||||
mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
|
mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
|
||||||
|
mock_queryset.count.return_value = 3
|
||||||
mock_all.return_value = mock_queryset
|
mock_all.return_value = mock_queryset
|
||||||
|
|
||||||
# assert logs "Updating LLM index with %d new nodes and removing %d old nodes."
|
# assert logs "Updating LLM index with %d new nodes and removing %d old nodes."
|
||||||
|
|||||||
16
uv.lock
generated
16
uv.lock
generated
@@ -3083,7 +3083,6 @@ dependencies = [
|
|||||||
{ name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" },
|
{ name = "torch", version = "2.10.0", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'darwin'" },
|
||||||
{ name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'linux'" },
|
{ name = "torch", version = "2.10.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'linux'" },
|
||||||
{ name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
{ name = "watchfiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "watchfiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "whitenoise", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "whitenoise", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "whoosh-reloaded", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "whoosh-reloaded", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
@@ -3166,7 +3165,6 @@ typing = [
|
|||||||
{ name = "types-pytz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "types-pytz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "types-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "types-redis", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "types-setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "types-setuptools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "types-tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
@@ -3237,7 +3235,6 @@ requires-dist = [
|
|||||||
{ name = "setproctitle", specifier = "~=1.3.4" },
|
{ name = "setproctitle", specifier = "~=1.3.4" },
|
||||||
{ name = "tika-client", specifier = "~=0.10.0" },
|
{ name = "tika-client", specifier = "~=0.10.0" },
|
||||||
{ name = "torch", specifier = "~=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
|
{ name = "torch", specifier = "~=2.10.0", index = "https://download.pytorch.org/whl/cpu" },
|
||||||
{ name = "tqdm", specifier = "~=4.67.1" },
|
|
||||||
{ name = "watchfiles", specifier = ">=1.1.1" },
|
{ name = "watchfiles", specifier = ">=1.1.1" },
|
||||||
{ name = "whitenoise", specifier = "~=6.11" },
|
{ name = "whitenoise", specifier = "~=6.11" },
|
||||||
{ name = "whoosh-reloaded", specifier = ">=2.7.5" },
|
{ name = "whoosh-reloaded", specifier = ">=2.7.5" },
|
||||||
@@ -3304,7 +3301,6 @@ typing = [
|
|||||||
{ name = "types-pytz" },
|
{ name = "types-pytz" },
|
||||||
{ name = "types-redis" },
|
{ name = "types-redis" },
|
||||||
{ name = "types-setuptools" },
|
{ name = "types-setuptools" },
|
||||||
{ name = "types-tqdm" },
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -5584,18 +5580,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/2b/7f/016dc5cc718ec6ccaa84fb73ed409ef1c261793fd5e637cdfaa18beb40a9/types_setuptools-80.10.0.20260124-py3-none-any.whl", hash = "sha256:efed7e044f01adb9c2806c7a8e1b6aa3656b8e382379b53d5f26ee3db24d4c01", size = 64333, upload-time = "2026-01-24T03:18:38.344Z" },
|
{ url = "https://files.pythonhosted.org/packages/2b/7f/016dc5cc718ec6ccaa84fb73ed409ef1c261793fd5e637cdfaa18beb40a9/types_setuptools-80.10.0.20260124-py3-none-any.whl", hash = "sha256:efed7e044f01adb9c2806c7a8e1b6aa3656b8e382379b53d5f26ee3db24d4c01", size = 64333, upload-time = "2026-01-24T03:18:38.344Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "types-tqdm"
|
|
||||||
version = "4.67.3.20260205"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "types-requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/53/46/790b9872523a48163bdda87d47849b4466017640e5259d06eed539340afd/types_tqdm-4.67.3.20260205.tar.gz", hash = "sha256:f3023682d4aa3bbbf908c8c6bb35f35692d319460d9bbd3e646e8852f3dd9f85", size = 17597, upload-time = "2026-02-05T04:03:19.721Z" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/cc/da/7f761868dbaa328392356fab30c18ab90d14cce86b269e7e63328f29d4a3/types_tqdm-4.67.3.20260205-py3-none-any.whl", hash = "sha256:85c31731e81dc3c5cecc34c6c8b2e5166fafa722468f58840c2b5ac6a8c5c173", size = 23894, upload-time = "2026-02-05T04:03:18.48Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "types-webencodings"
|
name = "types-webencodings"
|
||||||
version = "0.5.0.20251108"
|
version = "0.5.0.20251108"
|
||||||
|
|||||||
Reference in New Issue
Block a user