Enhancement: add --id-range for document_retagger (#4080)

---------

Co-authored-by: Trenton H <797416+stumpylog@users.noreply.github.com>
This commit is contained in:
Kamil Kosek 2023-09-08 19:33:24 +02:00 committed by GitHub
parent a8e13df249
commit b238ba054d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 1 deletions

View File

@ -351,7 +351,7 @@ currently-imported docs. This problem is common enough that there are
tools for it. tools for it.
``` ```
document_retagger [-h] [-c] [-T] [-t] [-i] [--use-first] [-f] document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f]
optional arguments: optional arguments:
-c, --correspondent -c, --correspondent
@ -359,6 +359,7 @@ optional arguments:
-t, --document_type -t, --document_type
-s, --storage_path -s, --storage_path
-i, --inbox-only -i, --inbox-only
--id-range
--use-first --use-first
-f, --overwrite -f, --overwrite
``` ```
@ -375,6 +376,11 @@ Specify `-i` to have the document retagger work on documents tagged with
inbox tags only. This is useful when you don't want to mess with your inbox tags only. This is useful when you don't want to mess with your
already processed documents. already processed documents.
Specify `--id-range 1 100` to have the document retagger work only on
documents whose IDs fall within the given range (both ends inclusive).
This can be useful if you have a lot of documents and want to test the
matching rules on only a subset of them.
When multiple document types or correspondents match a single document, When multiple document types or correspondents match a single document,
the retagger won't assign these to the document. Specify `--use-first` the retagger won't assign these to the document. Specify `--use-first`
to override this behavior and just use the first correspondent or type to override this behavior and just use the first correspondent or type

View File

@ -63,6 +63,12 @@ class Command(BaseCommand):
"--base-url", "--base-url",
help="The base URL to use to build the link to the documents.", help="The base URL to use to build the link to the documents.",
) )
parser.add_argument(
"--id-range",
help="A range of document ids on which the retagging should be applied.",
nargs=2,
type=int,
)
def handle(self, *args, **options): def handle(self, *args, **options):
# Detect if we support color # Detect if we support color
@ -72,6 +78,12 @@ class Command(BaseCommand):
queryset = Document.objects.filter(tags__is_inbox_tag=True) queryset = Document.objects.filter(tags__is_inbox_tag=True)
else: else:
queryset = Document.objects.all() queryset = Document.objects.all()
if options["id_range"]:
queryset = queryset.filter(
id__range=(options["id_range"][0], options["id_range"][1]),
)
documents = queryset.distinct() documents = queryset.distinct()
classifier = load_classifier() classifier = load_classifier()

View File

@ -1,4 +1,5 @@
from django.core.management import call_command from django.core.management import call_command
from django.core.management.base import CommandError
from django.test import TestCase from django.test import TestCase
from documents.models import Correspondent from documents.models import Correspondent
@ -258,3 +259,38 @@ class TestRetagger(DirectoriesMixin, TestCase):
self.assertEqual(d_auto.storage_path, self.sp1) self.assertEqual(d_auto.storage_path, self.sp1)
self.assertIsNone(d_second.storage_path) self.assertIsNone(d_second.storage_path)
self.assertEqual(d_unrelated.storage_path, self.sp2) self.assertEqual(d_unrelated.storage_path, self.sp2)
def test_id_range_parameter(self):
    """
    Check that ``--id-range`` limits retagging to the given document ids
    and that malformed ranges are rejected by the argument parser.
    """
    Document.objects.create(
        checksum="E",
        title="E",
        content="NOT the first document",
    )

    call_command("document_retagger", "--tags", "--id-range", "1", "2")
    # The retagger shouldn't apply the 'first' tag to our new document
    self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 1)

    # call_command() raises on a parse error instead of returning output,
    # so the message has to be read from the exception itself.
    with self.assertRaises(CommandError) as missing_values:
        call_command("document_retagger", "--tags", "--id-range")
    self.assertIn(
        "argument --id-range: expected 2 arguments",
        str(missing_values.exception),
    )

    with self.assertRaises(CommandError) as non_integer_values:
        call_command("document_retagger", "--tags", "--id-range", "a", "b")
    self.assertIn(
        "argument --id-range: invalid int value:",
        str(non_integer_values.exception),
    )

    call_command("document_retagger", "--tags", "--id-range", "1", "9999")
    # Now we should have 2 documents
    self.assertEqual(Document.objects.filter(tags__id=self.tag_first.id).count(), 2)