mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
custom field regex matching
[ci skip]
This commit is contained in:
parent
71fdc2a36d
commit
7b75333819
@ -526,7 +526,12 @@ class DocumentClassifier:
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def predict_custom_fields(self, content: str) -> list[int]:
|
def predict_custom_fields(self, content: str) -> dict:
|
||||||
|
"""
|
||||||
|
Custom fields are a bit different from the other classifiers, as we
|
||||||
|
need to predict the values for the fields, not just the field itself.
|
||||||
|
"""
|
||||||
|
# TODO: can this return the value?
|
||||||
from sklearn.utils.multiclass import type_of_target
|
from sklearn.utils.multiclass import type_of_target
|
||||||
|
|
||||||
if self.custom_fields_classifier:
|
if self.custom_fields_classifier:
|
||||||
|
@ -132,23 +132,48 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None):
|
def match_custom_fields(
|
||||||
|
document: Document,
|
||||||
|
classifier: DocumentClassifier,
|
||||||
|
user=None,
|
||||||
|
) -> dict:
|
||||||
|
"""
|
||||||
|
Custom fields work differently, we need the values for the match as well.
|
||||||
|
"""
|
||||||
|
# TODO: this needs to return values as well
|
||||||
predicted_custom_field_ids = (
|
predicted_custom_field_ids = (
|
||||||
classifier.predict_custom_fields(document.content) if classifier else []
|
classifier.predict_custom_fields(document.content) if classifier else []
|
||||||
)
|
)
|
||||||
|
|
||||||
fields = [instance.field for instance in document.custom_fields.all()]
|
fields = [instance.field for instance in document.custom_fields.all()]
|
||||||
|
|
||||||
return list(
|
matched_fields = {}
|
||||||
filter(
|
for field in fields:
|
||||||
lambda o: matches(o, document)
|
if field.matching_algorithm == MatchingModel.MATCH_AUTO:
|
||||||
or (
|
if field.pk in predicted_custom_field_ids:
|
||||||
o.matching_algorithm == MatchingModel.MATCH_AUTO
|
matched_fields[field] = None
|
||||||
and o.pk in predicted_custom_field_ids
|
elif field.matching_algorithm == MatchingModel.MATCH_REGEX:
|
||||||
),
|
try:
|
||||||
fields,
|
match = re.search(
|
||||||
),
|
re.compile(field.matching_model.match),
|
||||||
)
|
document.content,
|
||||||
|
)
|
||||||
|
if match:
|
||||||
|
matched_fields[field] = match.group()
|
||||||
|
except re.error:
|
||||||
|
logger.error(
|
||||||
|
f"Error while processing regular expression {field.matching_model.match}",
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
if match:
|
||||||
|
log_reason(
|
||||||
|
field.matching_model,
|
||||||
|
document,
|
||||||
|
f"the string {match.group()} matches the regular expression "
|
||||||
|
f"{field.matching_model.match}",
|
||||||
|
)
|
||||||
|
|
||||||
|
return matched_fields
|
||||||
|
|
||||||
|
|
||||||
def matches(matching_model: MatchingModel, document: Document):
|
def matches(matching_model: MatchingModel, document: Document):
|
||||||
|
@ -322,11 +322,12 @@ def set_custom_fields(
|
|||||||
document: Document,
|
document: Document,
|
||||||
logging_group=None,
|
logging_group=None,
|
||||||
classifier: DocumentClassifier | None = None,
|
classifier: DocumentClassifier | None = None,
|
||||||
replace=False,
|
|
||||||
suggest=False,
|
|
||||||
base_url=None,
|
base_url=None,
|
||||||
stdout=None,
|
stdout=None,
|
||||||
style_func=None,
|
style_func=None,
|
||||||
|
*,
|
||||||
|
replace=False,
|
||||||
|
suggest=False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if replace:
|
if replace:
|
||||||
@ -336,7 +337,8 @@ def set_custom_fields(
|
|||||||
|
|
||||||
current_fields = set([instance.field for instance in document.custom_fields.all()])
|
current_fields = set([instance.field for instance in document.custom_fields.all()])
|
||||||
|
|
||||||
matched_fields = matching.match_custom_fields(document, classifier)
|
matched_fields_w_values: dict = matching.match_custom_fields(document, classifier)
|
||||||
|
matched_fields = matched_fields_w_values.keys()
|
||||||
|
|
||||||
relevant_fields = set(matched_fields) - current_fields
|
relevant_fields = set(matched_fields) - current_fields
|
||||||
|
|
||||||
@ -373,9 +375,17 @@ def set_custom_fields(
|
|||||||
)
|
)
|
||||||
|
|
||||||
for field in relevant_fields:
|
for field in relevant_fields:
|
||||||
|
args = {
|
||||||
|
"field": field,
|
||||||
|
"document": document,
|
||||||
|
}
|
||||||
|
if field.pk in matched_fields_w_values:
|
||||||
|
value_field_name = CustomFieldInstance.get_value_field_name(
|
||||||
|
data_type=field.data_type,
|
||||||
|
)
|
||||||
|
args[value_field_name] = matched_fields_w_values[field.pk]
|
||||||
CustomFieldInstance.objects.create(
|
CustomFieldInstance.objects.create(
|
||||||
field=field,
|
**args,
|
||||||
document=document,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user