mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-03-31 13:35:08 -05:00
custom field regex matching
[ci skip]
This commit is contained in:
parent
71fdc2a36d
commit
7b75333819
@ -526,7 +526,12 @@ class DocumentClassifier:
|
||||
else:
|
||||
return None
|
||||
|
||||
def predict_custom_fields(self, content: str) -> list[int]:
|
||||
def predict_custom_fields(self, content: str) -> dict:
|
||||
"""
|
||||
Custom fields are a bit different from the other classifiers, as we
|
||||
need to predict the values for the fields, not just the field itself.
|
||||
"""
|
||||
# TODO: can this return the value?
|
||||
from sklearn.utils.multiclass import type_of_target
|
||||
|
||||
if self.custom_fields_classifier:
|
||||
|
@ -132,23 +132,48 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
|
||||
)
|
||||
|
||||
|
||||
def match_custom_fields(
    document: Document,
    classifier: DocumentClassifier,
    user=None,
) -> dict:
    """
    Custom fields work differently, we need the values for the match as well.

    Returns a dict keyed by custom field object. For a field using
    MATCH_REGEX the value is the text matched in ``document.content``; for a
    field using MATCH_AUTO that the classifier predicted, the value is None.
    Fields that do not match are absent from the dict.

    A field whose regular expression fails to compile is logged and skipped,
    so the function always returns a dict and matches already collected for
    other fields are kept.

    ``user`` is currently unused by this function.
    """
    # TODO: classifier (MATCH_AUTO) matches carry no value yet; they map to None
    predicted_custom_field_ids = (
        classifier.predict_custom_fields(document.content) if classifier else []
    )

    # NOTE(review): only fields that already have an instance on the document
    # are considered here -- confirm this is the intended candidate set.
    fields = [instance.field for instance in document.custom_fields.all()]

    matched_fields = {}
    for field in fields:
        if field.matching_algorithm == MatchingModel.MATCH_AUTO:
            if field.pk in predicted_custom_field_ids:
                matched_fields[field] = None
        elif field.matching_algorithm == MatchingModel.MATCH_REGEX:
            pattern = field.matching_model.match
            try:
                match = re.search(pattern, document.content)
            except re.error:
                logger.error(
                    f"Error while processing regular expression {pattern}",
                )
                # One broken pattern must not abort matching for the other
                # fields, nor change the return type callers rely on.
                continue

            if match:
                matched_fields[field] = match.group()
                log_reason(
                    field.matching_model,
                    document,
                    f"the string {match.group()} matches the regular expression "
                    f"{pattern}",
                )

    return matched_fields
|
||||
|
||||
|
||||
def matches(matching_model: MatchingModel, document: Document):
|
||||
|
@ -322,11 +322,12 @@ def set_custom_fields(
|
||||
document: Document,
|
||||
logging_group=None,
|
||||
classifier: DocumentClassifier | None = None,
|
||||
replace=False,
|
||||
suggest=False,
|
||||
base_url=None,
|
||||
stdout=None,
|
||||
style_func=None,
|
||||
*,
|
||||
replace=False,
|
||||
suggest=False,
|
||||
**kwargs,
|
||||
):
|
||||
if replace:
|
||||
@ -336,7 +337,8 @@ def set_custom_fields(
|
||||
|
||||
current_fields = set([instance.field for instance in document.custom_fields.all()])
|
||||
|
||||
matched_fields = matching.match_custom_fields(document, classifier)
|
||||
matched_fields_w_values: dict = matching.match_custom_fields(document, classifier)
|
||||
matched_fields = matched_fields_w_values.keys()
|
||||
|
||||
relevant_fields = set(matched_fields) - current_fields
|
||||
|
||||
@ -373,9 +375,17 @@ def set_custom_fields(
|
||||
)
|
||||
|
||||
for field in relevant_fields:
|
||||
args = {
|
||||
"field": field,
|
||||
"document": document,
|
||||
}
|
||||
if field.pk in matched_fields_w_values:
|
||||
value_field_name = CustomFieldInstance.get_value_field_name(
|
||||
data_type=field.data_type,
|
||||
)
|
||||
args[value_field_name] = matched_fields_w_values[field.pk]
|
||||
CustomFieldInstance.objects.create(
|
||||
field=field,
|
||||
document=document,
|
||||
**args,
|
||||
)
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user