From 6dc6c6c7bbf598d1f4763b91192d7900c74191aa Mon Sep 17 00:00:00 2001 From: shamoon <4887959+shamoon@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:54:20 -0800 Subject: [PATCH] Add to handler, matching, retagger --- docs/administration.md | 6 +- src/documents/apps.py | 2 + .../management/commands/document_retagger.py | 26 +++++++- src/documents/matching.py | 19 ++++++ src/documents/signals/handlers.py | 61 +++++++++++++++++++ 5 files changed, 110 insertions(+), 4 deletions(-) diff --git a/docs/administration.md b/docs/administration.md index 8e646b326..63333dee9 100644 --- a/docs/administration.md +++ b/docs/administration.md @@ -372,17 +372,19 @@ currently-imported docs. This problem is common enough that there are tools for it. ``` -document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f] +document_retagger [-h] [-c] [-T] [-t] [-s] [-cf] [-i] [--id-range] [--use-first] [-f] [--suggest] optional arguments: -c, --correspondent -T, --tags -t, --document_type -s, --storage_path +-cf, --custom_fields -i, --inbox-only --id-range --use-first -f, --overwrite +--suggest ``` Run this after changing or adding matching rules. It'll loop over all @@ -408,6 +410,8 @@ to override this behavior and just use the first correspondent or type it finds. This option does not apply to tags, since any amount of tags can be applied to a document. +If you want to suggest changes but not apply them, specify `--suggest`. + Finally, `-f` specifies that you wish to overwrite already assigned correspondents, types and/or tags. 
The default behavior is to not assign correspondents and types to documents that have this data already diff --git a/src/documents/apps.py b/src/documents/apps.py index f3b798c0b..812c5d2a4 100644 --- a/src/documents/apps.py +++ b/src/documents/apps.py @@ -15,6 +15,7 @@ class DocumentsConfig(AppConfig): from documents.signals.handlers import run_workflows_added from documents.signals.handlers import run_workflows_updated from documents.signals.handlers import set_correspondent + from documents.signals.handlers import set_custom_fields from documents.signals.handlers import set_document_type from documents.signals.handlers import set_storage_path from documents.signals.handlers import set_tags @@ -24,6 +25,7 @@ class DocumentsConfig(AppConfig): document_consumption_finished.connect(set_document_type) document_consumption_finished.connect(set_tags) document_consumption_finished.connect(set_storage_path) + document_consumption_finished.connect(set_custom_fields) document_consumption_finished.connect(add_to_index) document_consumption_finished.connect(run_workflows_added) document_updated.connect(run_workflows_updated) diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py index 10bb54b71..7f73366d4 100644 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -7,6 +7,7 @@ from documents.classifier import load_classifier from documents.management.commands.mixins import ProgressBarMixin from documents.models import Document from documents.signals.handlers import set_correspondent +from documents.signals.handlers import set_custom_fields from documents.signals.handlers import set_document_type from documents.signals.handlers import set_storage_path from documents.signals.handlers import set_tags @@ -17,9 +18,9 @@ logger = logging.getLogger("paperless.management.retagger") class Command(ProgressBarMixin, BaseCommand): help = ( "Using 
the current classification model, assigns correspondents, tags " - "and document types to all documents, effectively allowing you to " - "back-tag all previously indexed documents with metadata created (or " - "modified) after their initial import." + "document types, storage paths and custom fields to all documents, effectively " + "allowing you to back-tag all previously indexed documents with metadata created " + "(or modified) after their initial import." ) def add_arguments(self, parser): @@ -27,6 +28,12 @@ class Command(ProgressBarMixin, BaseCommand): parser.add_argument("-T", "--tags", default=False, action="store_true") parser.add_argument("-t", "--document_type", default=False, action="store_true") parser.add_argument("-s", "--storage_path", default=False, action="store_true") + parser.add_argument( + "-cf", + "--custom_fields", + default=False, + action="store_true", + ) parser.add_argument("-i", "--inbox-only", default=False, action="store_true") parser.add_argument( "--use-first", @@ -134,3 +141,16 @@ class Command(ProgressBarMixin, BaseCommand): stdout=self.stdout, style_func=self.style, ) + + if options["custom_fields"]: + set_custom_fields( + sender=None, + document=document, + classifier=classifier, + replace=options["overwrite"], + use_first=options["use_first"], + suggest=options["suggest"], + base_url=options["base_url"], + stdout=self.stdout, + style_func=self.style, + ) diff --git a/src/documents/matching.py b/src/documents/matching.py index ab3866518..08cb5da77 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -132,6 +132,25 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user ) +def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None): + predicted_custom_field_ids = ( + classifier.predict_custom_fields(document.content) if classifier else [] + ) + + fields = [instance.field for instance in document.custom_fields.all()] + + return list( + filter( + lambda o: 
matches(o, document) + or ( + o.matching_algorithm == MatchingModel.MATCH_AUTO + and o.pk in predicted_custom_field_ids + ), + fields, + ), + ) + + def matches(matching_model: MatchingModel, document: Document): search_kwargs = {} diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 407735375..da54f456e 100644 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -318,6 +318,67 @@ def set_storage_path( document.save(update_fields=("storage_path",)) +def set_custom_fields( + document: Document, + logging_group=None, + classifier: DocumentClassifier | None = None, + replace=False, + suggest=False, + base_url=None, + stdout=None, + style_func=None, + **kwargs, +): + if replace: + CustomFieldInstance.objects.filter(document=document).exclude( + Q(field__match="") & ~Q(field__matching_algorithm=CustomField.MATCH_AUTO), + ).delete() + + current_fields = set([instance.field for instance in document.custom_fields.all()]) + + matched_fields = matching.match_custom_fields(document, classifier) + + relevant_fields = set(matched_fields) - current_fields + + if suggest: + extra_fields = current_fields - set(matched_fields) + extra_fields = [ + f for f in extra_fields if f.matching_algorithm == MatchingModel.MATCH_AUTO + ] + if not relevant_fields and not extra_fields: + return + doc_str = style_func.SUCCESS(str(document)) + if base_url: + stdout.write(doc_str) + stdout.write(f"{base_url}/documents/{document.pk}") + else: + stdout.write(doc_str + style_func.SUCCESS(f" [{document.pk}]")) + if relevant_fields: + stdout.write( + "Suggest custom fields: " + + ", ".join([f.name for f in relevant_fields]), + ) + if extra_fields: + stdout.write( + "Extra custom fields: " + ", ".join([f.name for f in extra_fields]), + ) + else: + if not relevant_fields: + return + + message = 'Assigning custom fields "{}" to "{}"' + logger.info( + message.format(", ".join([f.name for f in relevant_fields]), document), + extra={"group": 
logging_group}, + ) + + for field in relevant_fields: + CustomFieldInstance.objects.create( + field=field, + document=document, + ) + + # see empty_trash in documents/tasks.py for signal handling def cleanup_document_deletion(sender, instance, **kwargs): with FileLock(settings.MEDIA_LOCK):