From 7b753338198c7d8ce386844c634713a69baaaf06 Mon Sep 17 00:00:00 2001
From: shamoon <4887959+shamoon@users.noreply.github.com>
Date: Thu, 20 Mar 2025 23:04:07 -0700
Subject: [PATCH] custom field regex matching [ci skip]

---
 src/documents/classifier.py       |  7 +++-
 src/documents/matching.py         | 54 ++++++++++++++++++++++++-------
 src/documents/signals/handlers.py | 22 ++++++++++---
 3 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 58c2058b5..c577790f2 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -526,7 +526,12 @@ class DocumentClassifier:
         else:
             return None
 
-    def predict_custom_fields(self, content: str) -> list[int]:
+    def predict_custom_fields(self, content: str) -> dict:
+        """
+        Custom fields are a bit different from the other classifiers, as we
+        need to predict the values for the fields, not just the field itself.
+        """
+        # TODO: can this return the value?
         from sklearn.utils.multiclass import type_of_target
 
         if self.custom_fields_classifier:
diff --git a/src/documents/matching.py b/src/documents/matching.py
index 08cb5da77..d09ae8074 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -132,23 +132,55 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
     )
 
 
-def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None):
+def match_custom_fields(
+    document: Document,
+    classifier: DocumentClassifier,
+    user=None,
+) -> dict:
+    """
+    Custom fields work differently, we need the values for the match as well.
+
+    Returns a dict keyed by the matched field; the value is the text matched
+    by the field's regular expression, or None when no value was extracted.
+    """
+    # TODO: auto (classifier) matches do not provide a value yet
     predicted_custom_field_ids = (
         classifier.predict_custom_fields(document.content) if classifier else []
     )
 
     fields = [instance.field for instance in document.custom_fields.all()]
 
-    return list(
-        filter(
-            lambda o: matches(o, document)
-            or (
-                o.matching_algorithm == MatchingModel.MATCH_AUTO
-                and o.pk in predicted_custom_field_ids
-            ),
-            fields,
-        ),
-    )
+    matched_fields = {}
+    for field in fields:
+        if field.matching_algorithm == MatchingModel.MATCH_AUTO:
+            if field.pk in predicted_custom_field_ids:
+                matched_fields[field] = None
+        elif field.matching_algorithm == MatchingModel.MATCH_REGEX:
+            try:
+                match = re.search(
+                    re.compile(field.matching_model.match),
+                    document.content,
+                )
+            except re.error:
+                logger.error(
+                    f"Error while processing regular expression {field.matching_model.match}",
+                )
+                # A broken pattern only disqualifies this field; callers
+                # expect a dict, so skip it instead of returning False.
+                continue
+            if match:
+                matched_fields[field] = match.group()
+                log_reason(
+                    field.matching_model,
+                    document,
+                    f"the string {match.group()} matches the regular expression "
+                    f"{field.matching_model.match}",
+                )
+        elif matches(field, document):
+            # Other algorithms still match as before, just without a value.
+            matched_fields[field] = None
+
+    return matched_fields
 
 
 def matches(matching_model: MatchingModel, document: Document):
diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py
index da54f456e..c9fc90650 100644
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -322,11 +322,12 @@ def set_custom_fields(
     document: Document,
     logging_group=None,
     classifier: DocumentClassifier | None = None,
-    replace=False,
-    suggest=False,
     base_url=None,
     stdout=None,
     style_func=None,
+    *,
+    replace=False,
+    suggest=False,
     **kwargs,
 ):
     if replace:
@@ -336,7 +337,8 @@ def set_custom_fields(
 
     current_fields = set([instance.field for instance in document.custom_fields.all()])
 
-    matched_fields = matching.match_custom_fields(document, classifier)
+    matched_fields_w_values: dict = matching.match_custom_fields(document, classifier)
+    matched_fields = matched_fields_w_values.keys()
 
     relevant_fields = set(matched_fields) - current_fields
 
@@ -373,9 +375,19 @@ def set_custom_fields(
         )
 
         for field in relevant_fields:
+            args = {
+                "field": field,
+                "document": document,
+            }
+            # The matches are keyed by the field itself, not by its pk.
+            value = matched_fields_w_values.get(field)
+            if value is not None:
+                value_field_name = CustomFieldInstance.get_value_field_name(
+                    data_type=field.data_type,
+                )
+                args[value_field_name] = value
             CustomFieldInstance.objects.create(
-                field=field,
-                document=document,
+                **args,
             )
 
 