From b238ba054d644efb510f948b468820f4ee9e1fee Mon Sep 17 00:00:00 2001 From: Kamil Kosek Date: Fri, 8 Sep 2023 19:33:24 +0200 Subject: [PATCH] Enhancement: add --id-range for document_retagger (#4080) --------- Co-authored-by: Trenton H <797416+stumpylog@users.noreply.github.com> --- docs/administration.md | 8 ++++- .../management/commands/document_retagger.py | 12 +++++++ .../tests/test_management_retagger.py | 36 +++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/docs/administration.md b/docs/administration.md index 627aa4136..2003edec9 100644 --- a/docs/administration.md +++ b/docs/administration.md @@ -351,7 +351,7 @@ currently-imported docs. This problem is common enough that there are tools for it. ``` -document_retagger [-h] [-c] [-T] [-t] [-i] [--use-first] [-f] +document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f] optional arguments: -c, --correspondent @@ -359,6 +359,7 @@ optional arguments: -t, --document_type -s, --storage_path -i, --inbox-only +--id-range --use-first -f, --overwrite ``` @@ -375,6 +376,11 @@ Specify `-i` to have the document retagger work on documents tagged with inbox tags only. This is useful when you don't want to mess with your already processed documents. +Specify `--id-range 1 100` to have the document retagger work only on a +specific range of document IDs. This can be useful if you have a lot of +documents and want to test the matching rules only on a subset of +documents. + When multiple document types or correspondents match a single document, the retagger won't assign these to the document. 
Specify `--use-first` to override this behavior and just use the first correspondent or type
diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py
index e67d6aed0..385cbf608 100644
--- a/src/documents/management/commands/document_retagger.py
+++ b/src/documents/management/commands/document_retagger.py
@@ -63,6 +63,12 @@ class Command(BaseCommand):
             "--base-url",
             help="The base URL to use to build the link to the documents.",
         )
+        parser.add_argument(
+            "--id-range",
+            help="A range of document ids on which the retagging should be applied.",
+            nargs=2,
+            type=int,
+        )
 
     def handle(self, *args, **options):
         # Detect if we support color
@@ -72,6 +78,12 @@ class Command(BaseCommand):
             queryset = Document.objects.filter(tags__is_inbox_tag=True)
         else:
             queryset = Document.objects.all()
+
+        if options["id_range"]:
+            queryset = queryset.filter(
+                id__range=(options["id_range"][0], options["id_range"][1]),
+            )
+
         documents = queryset.distinct()
 
         classifier = load_classifier()
diff --git a/src/documents/tests/test_management_retagger.py b/src/documents/tests/test_management_retagger.py
index 75e5f7dee..eb65afb42 100644
--- a/src/documents/tests/test_management_retagger.py
+++ b/src/documents/tests/test_management_retagger.py
@@ -1,4 +1,5 @@
 from django.core.management import call_command
+from django.core.management.base import CommandError
 from django.test import TestCase
 
 from documents.models import Correspondent
@@ -258,3 +259,38 @@ class TestRetagger(DirectoriesMixin, TestCase):
         self.assertEqual(d_auto.storage_path, self.sp1)
         self.assertIsNone(d_second.storage_path)
         self.assertEqual(d_unrelated.storage_path, self.sp2)
+
+    def test_id_range_parameter(self):
+        """Retag only documents whose id lies in --id-range; reject bad ranges."""
+        Document.objects.create(
+            checksum="E",
+            title="E",
+            content="NOT the first document",
+        )
+        call_command("document_retagger", "--tags", "--id-range", "1", "2")
+        # The retagger shouldn't apply the 'first' tag to our new document
+        self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 1)
+
+        # Under call_command() an argparse failure is raised as CommandError, so
+        # check the raised message; the call never returns a value to inspect.
+        with self.assertRaisesMessage(
+            CommandError,
+            "argument --id-range: expected 2 arguments",
+        ):
+            call_command("document_retagger", "--tags", "--id-range")
+
+        with self.assertRaisesMessage(
+            CommandError,
+            "argument --id-range: invalid int value:",
+        ):
+            call_command(
+                "document_retagger",
+                "--tags",
+                "--id-range",
+                "a",
+                "b",
+            )
+
+        call_command("document_retagger", "--tags", "--id-range", "1", "9999")
+        # Now we should have 2 documents
+        self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 2)