Add to handler, matching, retagger

This commit is contained in:
shamoon 2024-12-13 13:54:20 -08:00
parent a632b6b711
commit 6dc6c6c7bb
No known key found for this signature in database
5 changed files with 110 additions and 4 deletions

View File

@ -372,17 +372,19 @@ currently-imported docs. This problem is common enough that there are
tools for it. tools for it.
``` ```
document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f] document_retagger [-h] [-c] [-T] [-t] [-s] [-cf] [-i] [--id-range] [--use-first] [-f] [--suggest]
optional arguments: optional arguments:
-c, --correspondent -c, --correspondent
-T, --tags -T, --tags
-t, --document_type -t, --document_type
-s, --storage_path -s, --storage_path
-cf, --custom_fields
-i, --inbox-only -i, --inbox-only
--id-range --id-range
--use-first --use-first
-f, --overwrite -f, --overwrite
--suggest
``` ```
Run this after changing or adding matching rules. It'll loop over all Run this after changing or adding matching rules. It'll loop over all
@ -408,6 +410,8 @@ to override this behavior and just use the first correspondent or type
it finds. This option does not apply to tags, since any amount of tags it finds. This option does not apply to tags, since any amount of tags
can be applied to a document. can be applied to a document.
If you want to suggest changes but not apply them, specify `--suggest`.
Finally, `-f` specifies that you wish to overwrite already assigned Finally, `-f` specifies that you wish to overwrite already assigned
correspondents, types and/or tags. The default behavior is to not assign correspondents, types and/or tags. The default behavior is to not assign
correspondents and types to documents that have this data already correspondents and types to documents that have this data already

View File

@ -15,6 +15,7 @@ class DocumentsConfig(AppConfig):
from documents.signals.handlers import run_workflows_added from documents.signals.handlers import run_workflows_added
from documents.signals.handlers import run_workflows_updated from documents.signals.handlers import run_workflows_updated
from documents.signals.handlers import set_correspondent from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_custom_fields
from documents.signals.handlers import set_document_type from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags from documents.signals.handlers import set_tags
@ -24,6 +25,7 @@ class DocumentsConfig(AppConfig):
document_consumption_finished.connect(set_document_type) document_consumption_finished.connect(set_document_type)
document_consumption_finished.connect(set_tags) document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_storage_path) document_consumption_finished.connect(set_storage_path)
document_consumption_finished.connect(set_custom_fields)
document_consumption_finished.connect(add_to_index) document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_workflows_added) document_consumption_finished.connect(run_workflows_added)
document_updated.connect(run_workflows_updated) document_updated.connect(run_workflows_updated)

View File

@ -7,6 +7,7 @@ from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document from documents.models import Document
from documents.signals.handlers import set_correspondent from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_custom_fields
from documents.signals.handlers import set_document_type from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags from documents.signals.handlers import set_tags
@ -17,9 +18,9 @@ logger = logging.getLogger("paperless.management.retagger")
class Command(ProgressBarMixin, BaseCommand): class Command(ProgressBarMixin, BaseCommand):
help = ( help = (
"Using the current classification model, assigns correspondents, tags " "Using the current classification model, assigns correspondents, tags "
"and document types to all documents, effectively allowing you to " "document types, storage paths and custom fields to all documents, effectively "
"back-tag all previously indexed documents with metadata created (or " "allowing you to back-tag all previously indexed documents with metadata created "
"modified) after their initial import." "(or modified) after their initial import."
) )
def add_arguments(self, parser): def add_arguments(self, parser):
@ -27,6 +28,12 @@ class Command(ProgressBarMixin, BaseCommand):
parser.add_argument("-T", "--tags", default=False, action="store_true") parser.add_argument("-T", "--tags", default=False, action="store_true")
parser.add_argument("-t", "--document_type", default=False, action="store_true") parser.add_argument("-t", "--document_type", default=False, action="store_true")
parser.add_argument("-s", "--storage_path", default=False, action="store_true") parser.add_argument("-s", "--storage_path", default=False, action="store_true")
parser.add_argument(
"-cf",
"--custom_fields",
default=False,
action="store_true",
)
parser.add_argument("-i", "--inbox-only", default=False, action="store_true") parser.add_argument("-i", "--inbox-only", default=False, action="store_true")
parser.add_argument( parser.add_argument(
"--use-first", "--use-first",
@ -134,3 +141,16 @@ class Command(ProgressBarMixin, BaseCommand):
stdout=self.stdout, stdout=self.stdout,
style_func=self.style, style_func=self.style,
) )
if options["custom_fields"]:
set_custom_fields(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)

View File

@ -132,6 +132,25 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
) )
def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None):
    """Return the custom fields on *document* whose matching rules apply.

    A field is kept when its match pattern hits the document, or when it is
    auto-matched (MATCH_AUTO) and the classifier predicted its primary key
    from the document content.

    NOTE(review): only fields already assigned to the document
    (``document.custom_fields``) are considered, so a matched-but-unassigned
    field is never returned — confirm this is intended.
    """
    if classifier:
        predicted_ids = classifier.predict_custom_fields(document.content)
    else:
        # No classifier loaded: auto-matching cannot predict anything.
        predicted_ids = []

    candidates = [instance.field for instance in document.custom_fields.all()]
    return [
        field
        for field in candidates
        if matches(field, document)
        or (
            field.matching_algorithm == MatchingModel.MATCH_AUTO
            and field.pk in predicted_ids
        )
    ]
def matches(matching_model: MatchingModel, document: Document): def matches(matching_model: MatchingModel, document: Document):
search_kwargs = {} search_kwargs = {}

View File

@ -318,6 +318,67 @@ def set_storage_path(
document.save(update_fields=("storage_path",)) document.save(update_fields=("storage_path",))
def set_custom_fields(
    document: Document,
    logging_group=None,
    classifier: DocumentClassifier | None = None,
    replace=False,
    suggest=False,
    base_url=None,
    stdout=None,
    style_func=None,
    **kwargs,
):
    """
    Signal handler: assign matching custom fields to ``document``.

    Mirrors the other consumption handlers (``set_correspondent`` etc.):
    computes which custom fields match the document and creates a
    ``CustomFieldInstance`` for each matched field not already present.

    Args:
        document: The document to (re)tag.
        logging_group: Opaque value attached to the log record's ``group``.
        classifier: Optional classifier used for MATCH_AUTO fields.
        replace: When True, first delete existing field instances except
            manually assigned ones (empty match pattern, not auto-matched).
        suggest: When True, only print the suggested changes instead of
            writing them to the database.
        base_url: When set (suggest mode), print a link to the document.
        stdout: Output stream used in suggest mode.
        style_func: Management-command style helper for suggest output.
        **kwargs: Ignored; absorbs extra signal/caller arguments
            (e.g. ``sender``, ``use_first``).
    """
    if replace:
        # Keep only manually assigned fields (no match pattern and not
        # auto-matched); everything else is recomputed below.
        CustomFieldInstance.objects.filter(document=document).exclude(
            Q(field__match="") & ~Q(field__matching_algorithm=CustomField.MATCH_AUTO),
        ).delete()

    current_fields = {instance.field for instance in document.custom_fields.all()}

    matched_fields = matching.match_custom_fields(document, classifier)

    # Fields that matched but are not yet on the document.
    relevant_fields = set(matched_fields) - current_fields

    if suggest:
        # Auto-matched fields currently on the document that no longer match.
        extra_fields = current_fields - set(matched_fields)
        extra_fields = [
            f for f in extra_fields if f.matching_algorithm == MatchingModel.MATCH_AUTO
        ]
        if not relevant_fields and not extra_fields:
            return
        doc_str = style_func.SUCCESS(str(document))
        if base_url:
            stdout.write(doc_str)
            stdout.write(f"{base_url}/documents/{document.pk}")
        else:
            stdout.write(doc_str + style_func.SUCCESS(f" [{document.pk}]"))
        if relevant_fields:
            stdout.write(
                "Suggest custom fields: "
                + ", ".join([f.name for f in relevant_fields]),
            )
        if extra_fields:
            stdout.write(
                "Extra custom fields: " + ", ".join([f.name for f in extra_fields]),
            )
    else:
        if not relevant_fields:
            return

        message = 'Assigning custom fields "{}" to "{}"'
        # Bug fix: the field names fill the first placeholder and the document
        # the second, matching the message text (arguments were reversed).
        logger.info(
            message.format(", ".join([f.name for f in relevant_fields]), document),
            extra={"group": logging_group},
        )

        for field in relevant_fields:
            CustomFieldInstance.objects.create(
                field=field,
                document=document,
            )
# see empty_trash in documents/tasks.py for signal handling # see empty_trash in documents/tasks.py for signal handling
def cleanup_document_deletion(sender, instance, **kwargs): def cleanup_document_deletion(sender, instance, **kwargs):
with FileLock(settings.MEDIA_LOCK): with FileLock(settings.MEDIA_LOCK):