custom field regex matching

[ci skip]
This commit is contained in:
shamoon 2025-03-20 23:04:07 -07:00
parent 71fdc2a36d
commit 7b75333819
No known key found for this signature in database
3 changed files with 57 additions and 17 deletions

View File

@ -526,7 +526,12 @@ class DocumentClassifier:
else:
return None
def predict_custom_fields(self, content: str) -> list[int]:
def predict_custom_fields(self, content: str) -> dict:
"""
Custom fields are a bit different from the other classifiers, as we
need to predict the values for the fields, not just the field itself.
"""
# TODO: can this return the value?
from sklearn.utils.multiclass import type_of_target
if self.custom_fields_classifier:

View File

@ -132,23 +132,48 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
)
def match_custom_fields(
    document: Document,
    classifier: DocumentClassifier,
    user=None,
) -> dict:
    """
    Match the custom fields already attached to a document and collect a
    value for each match.

    Custom fields work differently from the other matchers: we need the
    value for the match as well, not just the matched object.

    Only fields that already have an instance on the document
    (``document.custom_fields``) are considered.

    :param document: the document whose content is inspected.
    :param classifier: optional classifier; when falsy, no field is treated
        as predicted.
    :param user: unused here; kept so the signature stays compatible with
        existing callers.
    :return: a dict keyed by the matched field *objects* (not their primary
        keys). The value is the text matched by the field's regular
        expression, or ``None`` for fields matched through the classifier.
        A dict is always returned, even when a pattern fails to compile.
    """
    predicted_custom_field_ids = (
        classifier.predict_custom_fields(document.content) if classifier else []
    )

    fields = [instance.field for instance in document.custom_fields.all()]

    matched_fields = {}
    for field in fields:
        if field.matching_algorithm == MatchingModel.MATCH_AUTO:
            # TODO: classifier matches carry no value yet, only the field.
            if field.pk in predicted_custom_field_ids:
                matched_fields[field] = None
        elif field.matching_algorithm == MatchingModel.MATCH_REGEX:
            # NOTE(review): the algorithm is read from `field` directly but
            # the pattern from `field.matching_model` -- confirm that
            # attribute exists on custom fields.
            pattern = field.matching_model.match
            try:
                match = re.search(pattern, document.content)
            except re.error:
                logger.error(
                    f"Error while processing regular expression {pattern}",
                )
                # A broken pattern on one field must not abort matching for
                # the remaining fields, nor change the return type: callers
                # use the result as a dict.
                continue
            if match:
                matched_fields[field] = match.group()
                log_reason(
                    field.matching_model,
                    document,
                    f"the string {match.group()} matches the regular expression "
                    f"{pattern}",
                )

    return matched_fields
def matches(matching_model: MatchingModel, document: Document):

View File

@ -322,11 +322,12 @@ def set_custom_fields(
document: Document,
logging_group=None,
classifier: DocumentClassifier | None = None,
replace=False,
suggest=False,
base_url=None,
stdout=None,
style_func=None,
*,
replace=False,
suggest=False,
**kwargs,
):
if replace:
@ -336,7 +337,8 @@ def set_custom_fields(
current_fields = set([instance.field for instance in document.custom_fields.all()])
matched_fields = matching.match_custom_fields(document, classifier)
matched_fields_w_values: dict = matching.match_custom_fields(document, classifier)
matched_fields = matched_fields_w_values.keys()
relevant_fields = set(matched_fields) - current_fields
@ -373,9 +375,17 @@ def set_custom_fields(
)
for field in relevant_fields:
args = {
"field": field,
"document": document,
}
if field.pk in matched_fields_w_values:
value_field_name = CustomFieldInstance.get_value_field_name(
data_type=field.data_type,
)
args[value_field_name] = matched_fields_w_values[field.pk]
CustomFieldInstance.objects.create(
field=field,
document=document,
**args,
)