Chore: Cleanup command arguments and standardize process count handling (#4541)

Cleans up some command help text and adds more control over the process count for commands that use a Pool.
This commit is contained in:
Trenton H 2023-11-09 11:46:37 -08:00 committed by GitHub
parent 577b49df9d
commit e8527ba723
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 229 additions and 198 deletions

View File

@ -414,6 +414,9 @@ This command takes no arguments.
Use this command to re-create document thumbnails. Optionally include the `--document {id}` option to generate thumbnails for a specific document only.
You may also specify `--processes` to control the number of processes used to generate new thumbnails. The default is to utilize
a quarter of the available processors, with a minimum of one process.
```
document_thumbnails
```
@ -591,7 +594,7 @@ take into account by the detection.
document_fuzzy_match [--ratio] [--processes N]
```
| Option | Required | Default | Description |
| ----------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
| --processes | No | 4 | Number of processes to use for matching. Setting 1 disables multiple processes |
| Option | Required | Default | Description |
| ----------- | -------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| --ratio | No | 85.0 | a number between 0 and 100, setting how similar a document must be for it to be reported. Higher numbers mean more similarity. |
| --processes | No | 1/4 of system cores | Number of processes to use for matching. Setting 1 disables multiple processes |

View File

@ -17,19 +17,27 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument(
"--passphrase",
help="If PAPERLESS_PASSPHRASE isn't set already, you need to "
"specify it here",
help=(
"If PAPERLESS_PASSPHRASE isn't set already, you need to "
"specify it here"
),
)
def handle(self, *args, **options):
try:
print(
"\n\nWARNING: This script is going to work directly on your "
"document originals, so\nWARNING: you probably shouldn't run "
"this unless you've got a recent backup\nWARNING: handy. It "
"*should* work without a hitch, but be safe and backup your\n"
"WARNING: stuff first.\n\nHit Ctrl+C to exit now, or Enter to "
"continue.\n\n",
self.stdout.write(
self.style.WARNING(
"\n\n"
"WARNING: This script is going to work directly on your "
"document originals, so\n"
"WARNING: you probably shouldn't run "
"this unless you've got a recent backup\n"
"WARNING: handy. It "
"*should* work without a hitch, but be safe and backup your\n"
"WARNING: stuff first.\n\n"
"Hit Ctrl+C to exit now, or Enter to "
"continue.\n\n",
),
)
_ = input()
except KeyboardInterrupt:
@ -44,14 +52,13 @@ class Command(BaseCommand):
self.__gpg_to_unencrypted(passphrase)
@staticmethod
def __gpg_to_unencrypted(passphrase):
def __gpg_to_unencrypted(self, passphrase: str):
encrypted_files = Document.objects.filter(
storage_type=Document.STORAGE_TYPE_GPG,
)
for document in encrypted_files:
print(f"Decrypting {document}".encode())
self.stdout.write(f"Decrypting {document}")
old_paths = [document.source_path, document.thumbnail_path]

View File

@ -7,21 +7,20 @@ from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.management.archiver")
class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
and document types to all documents, effectively allowing you to
back-tag all previously indexed documents with metadata created (or
modified) after their initial import.
""".replace(
" ",
"",
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = (
"Using the current classification model, assigns correspondents, tags "
"and document types to all documents, effectively allowing you to "
"back-tag all previously indexed documents with metadata created (or "
"modified) after their initial import."
)
def add_arguments(self, parser):
@ -30,8 +29,10 @@ class Command(BaseCommand):
"--overwrite",
default=False,
action="store_true",
help="Recreates the archived document for documents that already "
"have an archived version.",
help=(
"Recreates the archived document for documents that already "
"have an archived version."
),
)
parser.add_argument(
"-d",
@ -39,17 +40,18 @@ class Command(BaseCommand):
default=None,
type=int,
required=False,
help="Specify the ID of a document, and this command will only "
"run on this specific document.",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
help=(
"Specify the ID of a document, and this command will only "
"run on this specific document."
),
)
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
overwrite = options["overwrite"]
@ -67,19 +69,27 @@ class Command(BaseCommand):
)
# Note to future self: this prevents django from reusing database
# conncetions between processes, which is bad and does not work
# connections between processes, which is bad and does not work
# with postgres.
db.connections.close_all()
try:
logging.getLogger().handlers[0].level = logging.ERROR
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(update_document_archive_file, document_ids),
total=len(document_ids),
disable=options["no_progress_bar"],
),
)
if self.process_count == 1:
for doc_id in document_ids:
update_document_archive_file(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(
update_document_archive_file,
document_ids,
),
total=len(document_ids),
disable=self.no_progress_bar,
),
)
except KeyboardInterrupt:
self.stdout.write(self.style.NOTICE("Aborting..."))

View File

@ -4,16 +4,10 @@ from documents.tasks import train_classifier
class Command(BaseCommand):
help = """
Trains the classifier on your data and saves the resulting models to a
file. The document consumer will then automatically use this new model.
""".replace(
" ",
"",
help = (
"Trains the classifier on your data and saves the resulting models to a "
"file. The document consumer will then automatically use this new model."
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options):
train_classifier()

View File

@ -43,13 +43,10 @@ from paperless_mail.models import MailRule
class Command(BaseCommand):
help = """
Decrypt and rename all files in our collection into a given target
directory. And include a manifest file containing document data for
easy import.
""".replace(
" ",
"",
help = (
"Decrypt and rename all files in our collection into a given target "
"directory. And include a manifest file containing document data for "
"easy import."
)
def add_arguments(self, parser):
@ -60,9 +57,11 @@ class Command(BaseCommand):
"--compare-checksums",
default=False,
action="store_true",
help="Compare file checksums when determining whether to export "
"a file or not. If not specified, file size and time "
"modified is used instead.",
help=(
"Compare file checksums when determining whether to export "
"a file or not. If not specified, file size and time "
"modified is used instead."
),
)
parser.add_argument(
@ -70,9 +69,11 @@ class Command(BaseCommand):
"--delete",
default=False,
action="store_true",
help="After exporting, delete files in the export directory that "
"do not belong to the current export, such as files from "
"deleted documents.",
help=(
"After exporting, delete files in the export directory that "
"do not belong to the current export, such as files from "
"deleted documents."
),
)
parser.add_argument(
@ -80,8 +81,10 @@ class Command(BaseCommand):
"--use-filename-format",
default=False,
action="store_true",
help="Use PAPERLESS_FILENAME_FORMAT for storing files in the "
"export directory, if configured.",
help=(
"Use PAPERLESS_FILENAME_FORMAT for storing files in the "
"export directory, if configured."
),
)
parser.add_argument(
@ -105,8 +108,10 @@ class Command(BaseCommand):
"--use-folder-prefix",
default=False,
action="store_true",
help="Export files in dedicated folders according to their nature: "
"archive, originals or thumbnails",
help=(
"Export files in dedicated folders according to their nature: "
"archive, originals or thumbnails"
),
)
parser.add_argument(

View File

@ -7,6 +7,8 @@ import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
@ -41,7 +43,7 @@ def _process_and_match(work: _WorkPackage) -> _WorkResult:
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
class Command(BaseCommand):
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "Searches for documents where the content almost matches"
def add_arguments(self, parser):
@ -51,23 +53,16 @@ class Command(BaseCommand):
type=float,
help="Ratio to consider documents a match",
)
parser.add_argument(
"--processes",
default=4,
type=int,
help="Number of processes to distribute work amongst",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
@ -76,9 +71,6 @@ class Command(BaseCommand):
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
if options["processes"] < 1:
raise CommandError("There must be at least 1 process")
all_docs = Document.objects.all().order_by("id")
# Build work packages for processing
@ -101,17 +93,17 @@ class Command(BaseCommand):
work_pkgs.append(_WorkPackage(first_doc, second_doc))
# Don't spin up a pool of 1 process
if options["processes"] == 1:
if self.process_count == 1:
results = []
for work in tqdm.tqdm(work_pkgs, disable=options["no_progress_bar"]):
for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
results.append(_process_and_match(work))
else:
with multiprocessing.Pool(processes=options["processes"]) as pool:
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
results = list(
tqdm.tqdm(
pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs),
disable=options["no_progress_bar"],
disable=self.no_progress_bar,
),
)

View File

@ -40,12 +40,9 @@ def disable_signal(sig, receiver, sender):
class Command(BaseCommand):
help = """
Using a manifest.json file, load the data from there, and import the
documents it refers to.
""".replace(
" ",
"",
help = (
"Using a manifest.json file, load the data from there, and import the "
"documents it refers to."
)
def add_arguments(self, parser):

View File

@ -1,25 +1,22 @@
from django.core.management import BaseCommand
from django.db import transaction
from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import index_optimize
from documents.tasks import index_reindex
class Command(BaseCommand):
class Command(ProgressBarMixin, BaseCommand):
help = "Manages the document index."
def add_arguments(self, parser):
parser.add_argument("command", choices=["reindex", "optimize"])
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
with transaction.atomic():
if options["command"] == "reindex":
index_reindex(progress_bar_disable=options["no_progress_bar"])
index_reindex(progress_bar_disable=self.no_progress_bar)
elif options["command"] == "optimize":
index_optimize()

View File

@ -4,30 +4,22 @@ import tqdm
from django.core.management.base import BaseCommand
from django.db.models.signals import post_save
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
class Command(BaseCommand):
help = """
This will rename all documents to match the latest filename format.
""".replace(
" ",
"",
)
class Command(ProgressBarMixin, BaseCommand):
help = "This will rename all documents to match the latest filename format."
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm(
Document.objects.all(),
disable=options["no_progress_bar"],
disable=self.no_progress_bar,
):
post_save.send(Document, instance=document)

View File

@ -4,6 +4,7 @@ import tqdm
from django.core.management.base import BaseCommand
from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type
@ -13,15 +14,12 @@ from documents.signals.handlers import set_tags
logger = logging.getLogger("paperless.management.retagger")
class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
and document types to all documents, effectively allowing you to
back-tag all previously indexed documents with metadata created (or
modified) after their initial import.
""".replace(
" ",
"",
class Command(ProgressBarMixin, BaseCommand):
help = (
"Using the current classification model, assigns correspondents, tags "
"and document types to all documents, effectively allowing you to "
"back-tag all previously indexed documents with metadata created (or "
"modified) after their initial import."
)
def add_arguments(self, parser):
@ -34,25 +32,24 @@ class Command(BaseCommand):
"--use-first",
default=False,
action="store_true",
help="By default this command won't try to assign a correspondent "
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds.",
help=(
"By default this command won't try to assign a correspondent "
"if more than one matches the document. Use this flag if "
"you'd rather it just pick the first one it finds."
),
)
parser.add_argument(
"-f",
"--overwrite",
default=False,
action="store_true",
help="If set, the document retagger will overwrite any previously"
"set correspondent, document and remove correspondents, types"
"and tags that do not match anymore due to changed rules.",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
help=(
"If set, the document retagger will overwrite any previously"
"set correspondent, document and remove correspondents, types"
"and tags that do not match anymore due to changed rules."
),
)
self.add_argument_progress_bar_mixin(parser)
parser.add_argument(
"--suggest",
default=False,
@ -71,6 +68,7 @@ class Command(BaseCommand):
)
def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
# Detect if we support color
color = self.style.ERROR("test") != "test"
@ -88,7 +86,7 @@ class Command(BaseCommand):
classifier = load_classifier()
for document in tqdm.tqdm(documents, disable=options["no_progress_bar"]):
for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
if options["correspondent"]:
set_correspondent(
sender=None,

View File

@ -1,25 +1,17 @@
from django.core.management.base import BaseCommand
from documents.management.commands.mixins import ProgressBarMixin
from documents.sanity_checker import check_sanity
class Command(BaseCommand):
help = """
This command checks your document archive for issues.
""".replace(
" ",
"",
)
class Command(ProgressBarMixin, BaseCommand):
help = "This command checks your document archive for issues."
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
self.add_argument_progress_bar_mixin(parser)
def handle(self, *args, **options):
messages = check_sanity(progress=not options["no_progress_bar"])
self.handle_progress_bar_mixin(**options)
messages = check_sanity(progress=self.use_progress_bar)
messages.log_messages()

View File

@ -6,6 +6,8 @@ import tqdm
from django import db
from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.parsers import get_parser_class_for_mime_type
@ -32,13 +34,8 @@ def _process_document(doc_id):
parser.cleanup()
class Command(BaseCommand):
help = """
This will regenerate the thumbnails for all documents.
""".replace(
" ",
"",
)
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "This will regenerate the thumbnails for all documents."
def add_arguments(self, parser):
parser.add_argument(
@ -47,19 +44,20 @@ class Command(BaseCommand):
default=None,
type=int,
required=False,
help="Specify the ID of a document, and this command will only "
"run on this specific document.",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
help=(
"Specify the ID of a document, and this command will only "
"run on this specific document."
),
)
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
logging.getLogger().handlers[0].level = logging.ERROR
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
if options["document"]:
documents = Document.objects.filter(pk=options["document"])
else:
@ -72,11 +70,15 @@ class Command(BaseCommand):
# with postgres.
db.connections.close_all()
with multiprocessing.Pool() as pool:
list(
tqdm.tqdm(
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=options["no_progress_bar"],
),
)
if self.process_count == 1:
for doc_id in ids:
_process_document(doc_id)
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(_process_document, ids),
total=len(ids),
disable=self.no_progress_bar,
),
)

View File

@ -1,5 +1,6 @@
import logging
import os
from argparse import RawTextHelpFormatter
from django.contrib.auth.models import User
from django.core.management.base import BaseCommand
@ -8,20 +9,22 @@ logger = logging.getLogger("paperless.management.superuser")
class Command(BaseCommand):
help = """
Creates a Django superuser:
User named: admin
Email: root@localhost
with password based on env variable.
No superuser will be created, when:
- The username is taken already exists
- A superuser already exists
- PAPERLESS_ADMIN_PASSWORD is not set
""".replace(
" ",
"",
help = (
"Creates a Django superuser:\n"
" User named: admin\n"
" Email: root@localhost\n"
" Password: based on env variable PAPERLESS_ADMIN_PASSWORD\n"
"No superuser will be created, when:\n"
" - The username is taken already exists\n"
" - A superuser already exists\n"
" - PAPERLESS_ADMIN_PASSWORD is not set"
)
def create_parser(self, *args, **kwargs):
parser = super().create_parser(*args, **kwargs)
parser.formatter_class = RawTextHelpFormatter
return parser
def handle(self, *args, **options):
username = os.getenv("PAPERLESS_ADMIN_USER", "admin")
mail = os.getenv("PAPERLESS_ADMIN_MAIL", "root@localhost")

View File

@ -0,0 +1,43 @@
import os
from argparse import ArgumentParser
from django.core.management import CommandError
class MultiProcessMixin:
    """
    Mixin for management commands that distribute work over several processes.

    It registers a ``--processes`` argument and, once the options have been
    handled, exposes the validated value as ``self.process_count``.
    """

    @staticmethod
    def _default_process_count(cpu_count) -> int:
        """
        Return the default number of processes: a quarter of ``cpu_count``,
        but never fewer than 1.

        ``cpu_count`` is expected to be the result of ``os.cpu_count()``,
        which is ``None`` when the number of CPUs cannot be determined. That
        case is treated as a single CPU rather than raising ``TypeError``
        from the integer division.
        """
        return max(1, (cpu_count or 1) // 4)

    def add_argument_processes_mixin(self, parser: ArgumentParser):
        """
        Add the ``--processes`` option to ``parser``.
        """
        parser.add_argument(
            "--processes",
            default=self._default_process_count(os.cpu_count()),
            type=int,
            help="Number of processes to distribute work amongst",
        )

    def handle_processes_mixin(self, *args, **options):
        """
        Store ``options["processes"]`` as ``self.process_count``.

        Raises:
            CommandError: if the requested process count is below 1.
        """
        self.process_count = options["processes"]
        if self.process_count < 1:
            raise CommandError("There must be at least 1 process")
class ProgressBarMixin:
    """
    Mixin for management commands that display a progress bar.

    It registers a ``--no-progress-bar`` flag and, once the options have been
    handled, exposes ``self.no_progress_bar`` and its negation
    ``self.use_progress_bar``.
    """

    def add_argument_progress_bar_mixin(self, parser: ArgumentParser):
        """
        Add the ``--no-progress-bar`` flag to ``parser``.
        """
        flag_settings = {
            "default": False,
            "action": "store_true",
            "help": "If set, the progress bar will not be shown",
        }
        parser.add_argument("--no-progress-bar", **flag_settings)

    def handle_progress_bar_mixin(self, *args, **options):
        """
        Record the progress bar choice from ``options`` on the instance.
        """
        disabled = options["no_progress_bar"]
        self.no_progress_bar = disabled
        self.use_progress_bar = not disabled

View File

@ -36,7 +36,7 @@ class TestArchiver(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
call_command("document_archiver")
call_command("document_archiver", "--processes", "1")
def test_handle_document(self):
doc = self.make_models()

View File

@ -83,13 +83,13 @@ class TestMakeThumbnails(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_command(self):
self.assertIsNotFile(self.d1.thumbnail_path)
self.assertIsNotFile(self.d2.thumbnail_path)
call_command("document_thumbnails")
call_command("document_thumbnails", "--processes", "1")
self.assertIsFile(self.d1.thumbnail_path)
self.assertIsFile(self.d2.thumbnail_path)
def test_command_documentid(self):
self.assertIsNotFile(self.d1.thumbnail_path)
self.assertIsNotFile(self.d2.thumbnail_path)
call_command("document_thumbnails", "-d", f"{self.d1.id}")
call_command("document_thumbnails", "--processes", "1", "-d", f"{self.d1.id}")
self.assertIsFile(self.d1.thumbnail_path)
self.assertIsNotFile(self.d2.thumbnail_path)

View File

@ -4,11 +4,7 @@ from paperless_mail import tasks
class Command(BaseCommand):
help = """
""".replace(
" ",
"",
)
help = "Manually triggers a fetching and processing of all mail accounts"
def handle(self, *args, **options):
tasks.process_mail_accounts()