Chore: Clean up command arguments and standardize process count handling (#4541)

Cleans up some command help text and adds more control over the process count for commands that use a multiprocessing Pool.
This commit is contained in:
Trenton H
2023-11-09 11:46:37 -08:00
committed by GitHub
parent 577b49df9d
commit e8527ba723
17 changed files with 229 additions and 198 deletions

View File

@@ -7,6 +7,8 @@ import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
@@ -41,7 +43,7 @@ def _process_and_match(work: _WorkPackage) -> _WorkResult:
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
class Command(BaseCommand):
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "Searches for documents where the content almost matches"
def add_arguments(self, parser):
@@ -51,23 +53,16 @@ class Command(BaseCommand):
type=float,
help="Ratio to consider documents a match",
)
parser.add_argument(
"--processes",
default=4,
type=int,
help="Number of processes to distribute work amongst",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
self.add_argument_progress_bar_mixin(parser)
self.add_argument_processes_mixin(parser)
def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
@@ -76,9 +71,6 @@ class Command(BaseCommand):
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
if options["processes"] < 1:
raise CommandError("There must be at least 1 process")
all_docs = Document.objects.all().order_by("id")
# Build work packages for processing
@@ -101,17 +93,17 @@ class Command(BaseCommand):
work_pkgs.append(_WorkPackage(first_doc, second_doc))
# Don't spin up a pool of 1 process
if options["processes"] == 1:
if self.process_count == 1:
results = []
for work in tqdm.tqdm(work_pkgs, disable=options["no_progress_bar"]):
for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
results.append(_process_and_match(work))
else:
with multiprocessing.Pool(processes=options["processes"]) as pool:
else: # pragma: no cover
with multiprocessing.Pool(processes=self.process_count) as pool:
results = list(
tqdm.tqdm(
pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs),
disable=options["no_progress_bar"],
disable=self.no_progress_bar,
),
)