Feature: custom fields queries (#7761)

This commit is contained in:
shamoon
2024-10-02 17:15:42 -07:00
committed by GitHub
parent 2e3637d712
commit f8d79b012f
26 changed files with 2130 additions and 599 deletions

View File

@@ -29,13 +29,15 @@ from documents.models import Log
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
from paperless import settings
CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"]
ID_KWARGS = ["in", "exact"]
INT_KWARGS = ["exact", "gt", "gte", "lt", "lte", "isnull"]
DATE_KWARGS = ["year", "month", "day", "date__gt", "gt", "date__lt", "lt"]
CUSTOM_FIELD_QUERY_MAX_DEPTH = 10
CUSTOM_FIELD_QUERY_MAX_ATOMS = 20
class CorrespondentFilterSet(FilterSet):
class Meta:
@@ -234,19 +236,13 @@ def handle_validation_prefix(func: Callable):
return wrapper
class CustomFieldLookupParser:
class CustomFieldQueryParser:
EXPR_BY_CATEGORY = {
"basic": ["exact", "in", "isnull", "exists"],
"string": [
"iexact",
"contains",
"icontains",
"startswith",
"istartswith",
"endswith",
"iendswith",
"regex",
"iregex",
],
"arithmetic": [
"gt",
@@ -258,23 +254,6 @@ class CustomFieldLookupParser:
"containment": ["contains"],
}
# These string lookup expressions are problematic. We shall disable
# them by default unless the user explicitly opts in.
STR_EXPR_DISABLED_BY_DEFAULT = [
# SQLite: is case-sensitive outside the ASCII range
"iexact",
# SQLite: behaves the same as icontains
"contains",
# SQLite: behaves the same as istartswith
"startswith",
# SQLite: behaves the same as iendswith
"endswith",
# Syntax depends on database backends, can be exploited for ReDoS
"regex",
# Syntax depends on database backends, can be exploited for ReDoS
"iregex",
]
SUPPORTED_EXPR_CATEGORIES = {
CustomField.FieldDataType.STRING: ("basic", "string"),
CustomField.FieldDataType.URL: ("basic", "string"),
@@ -282,7 +261,7 @@ class CustomFieldLookupParser:
CustomField.FieldDataType.BOOL: ("basic",),
CustomField.FieldDataType.INT: ("basic", "arithmetic"),
CustomField.FieldDataType.FLOAT: ("basic", "arithmetic"),
CustomField.FieldDataType.MONETARY: ("basic", "string"),
CustomField.FieldDataType.MONETARY: ("basic", "string", "arithmetic"),
CustomField.FieldDataType.DOCUMENTLINK: ("basic", "containment"),
CustomField.FieldDataType.SELECT: ("basic",),
}
@@ -371,7 +350,7 @@ class CustomFieldLookupParser:
elif len(expr) == 3:
return self._parse_atom(*expr)
raise serializers.ValidationError(
[_("Invalid custom field lookup expression")],
[_("Invalid custom field query expression")],
)
@handle_validation_prefix
@@ -416,13 +395,7 @@ class CustomFieldLookupParser:
self._atom_count += 1
if self._atom_count > self._max_atom_count:
raise serializers.ValidationError(
[
_(
"Maximum number of query conditions exceeded. You can raise "
"the limit by setting PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_ATOMS "
"in your configuration file.",
),
],
[_("Maximum number of query conditions exceeded.")],
)
custom_field = self._get_custom_field(id_or_name, validation_prefix="0")
@@ -444,6 +417,11 @@ class CustomFieldLookupParser:
value_field_name = CustomFieldInstance.get_value_field_name(
custom_field.data_type,
)
if (
custom_field.data_type == CustomField.FieldDataType.MONETARY
and op in self.EXPR_BY_CATEGORY["arithmetic"]
):
value_field_name = "value_monetary_amount"
has_field = Q(custom_fields__field=custom_field)
# Our special exists operator.
@@ -494,22 +472,6 @@ class CustomFieldLookupParser:
# Check if the operator is supported for the current data_type.
supported = False
for category in self.SUPPORTED_EXPR_CATEGORIES[custom_field.data_type]:
if (
category == "string"
and op in self.STR_EXPR_DISABLED_BY_DEFAULT
and op not in settings.CUSTOM_FIELD_LOOKUP_OPT_IN
):
raise serializers.ValidationError(
[
_(
"{expr!r} is disabled by default because it does not "
"behave consistently across database backends, or can "
"cause security risks. If you understand the implications "
"you may enabled it by adding it to "
"`PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN`.",
).format(expr=op),
],
)
if op in self.EXPR_BY_CATEGORY[category]:
supported = True
break
@@ -527,7 +489,7 @@ class CustomFieldLookupParser:
if not supported:
raise serializers.ValidationError(
[
_("{data_type} does not support lookup expr {expr!r}.").format(
_("{data_type} does not support query expr {expr!r}.").format(
data_type=custom_field.data_type,
expr=raw_op,
),
@@ -548,7 +510,7 @@ class CustomFieldLookupParser:
custom_field.data_type == CustomField.FieldDataType.DATE
and prefix in self.DATE_COMPONENTS
):
# DateField admits lookups in the form of `year__exact`, etc. These take integers.
# DateField admits queries in the form of `year__exact`, etc. These take integers.
field = serializers.IntegerField()
elif custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
# We can be more specific here and make sure the value is a list.
@@ -610,7 +572,7 @@ class CustomFieldLookupParser:
custom_fields__value_document_ids__isnull=False,
)
# First we lookup reverse links from the requested documents.
# First we look up reverse links from the requested documents.
links = CustomFieldInstance.objects.filter(
document_id__in=value,
field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
@@ -635,22 +597,14 @@ class CustomFieldLookupParser:
# guard against queries that are too deeply nested
self._current_depth += 1
if self._current_depth > self._max_query_depth:
raise serializers.ValidationError(
[
_(
"Maximum nesting depth exceeded. You can raise the limit "
"by setting PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_DEPTH in "
"your configuration file.",
),
],
)
raise serializers.ValidationError([_("Maximum nesting depth exceeded.")])
try:
yield
finally:
self._current_depth -= 1
class CustomFieldLookupFilter(Filter):
class CustomFieldQueryFilter(Filter):
def __init__(self, validation_prefix):
"""
A filter that filters documents based on custom field name and value.
@@ -665,10 +619,10 @@ class CustomFieldLookupFilter(Filter):
if not value:
return qs
parser = CustomFieldLookupParser(
parser = CustomFieldQueryParser(
self._validation_prefix,
max_query_depth=settings.CUSTOM_FIELD_LOOKUP_MAX_DEPTH,
max_atom_count=settings.CUSTOM_FIELD_LOOKUP_MAX_ATOMS,
max_query_depth=CUSTOM_FIELD_QUERY_MAX_DEPTH,
max_atom_count=CUSTOM_FIELD_QUERY_MAX_ATOMS,
)
q, annotations = parser.parse(value)
@@ -722,7 +676,7 @@ class DocumentFilterSet(FilterSet):
exclude=True,
)
custom_field_lookup = CustomFieldLookupFilter("custom_field_lookup")
custom_field_query = CustomFieldQueryFilter("custom_field_query")
shared_by__id = SharedByUser()

View File

@@ -0,0 +1,95 @@
# Generated by Django 5.1.1 on 2024-09-29 16:26
import django.db.models.functions.comparison
import django.db.models.functions.text
from django.db import migrations
from django.db import models
# Schema migration accompanying the custom fields query feature. It adds a
# persisted, database-computed numeric column for monetary custom field values
# and re-declares the saved view rule type choices so that they include
# "custom fields query".
class Migration(migrations.Migration):
# Must be applied after the migration that added the document page count.
dependencies = [
("documents", "1053_document_page_count"),
]
operations = [
# Add `value_monetary_amount`, a generated column derived from the text
# column `value_monetary`, so that amounts can be compared numerically.
migrations.AddField(
model_name="customfieldinstance",
name="value_monetary_amount",
field=models.GeneratedField(
# db_persist=True: the computed value is stored as a real column.
db_persist=True,
expression=models.Case(
# The value starts with a digit, i.e. it has no currency prefix:
# cast the whole string (Substr positions are 1-indexed).
models.When(
then=django.db.models.functions.comparison.Cast(
django.db.models.functions.text.Substr("value_monetary", 1),
output_field=models.DecimalField(
decimal_places=2,
max_digits=65,
),
),
value_monetary__regex="^\\d+",
),
# Otherwise drop the first three characters (presumably a
# 3-character currency code such as "USD") and cast the rest.
default=django.db.models.functions.comparison.Cast(
django.db.models.functions.text.Substr("value_monetary", 4),
output_field=models.DecimalField(
decimal_places=2,
max_digits=65,
),
),
output_field=models.DecimalField(decimal_places=2, max_digits=65),
),
output_field=models.DecimalField(decimal_places=2, max_digits=65),
),
),
# Re-declare the `rule_type` choices; the list ends with
# (42, "custom fields query").
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
(23, "ASN greater than"),
(24, "ASN less than"),
(25, "storage path is"),
(26, "has correspondent in"),
(27, "does not have correspondent in"),
(28, "has document type in"),
(29, "does not have document type in"),
(30, "has storage path in"),
(31, "does not have storage path in"),
(32, "owner is"),
(33, "has owner in"),
(34, "does not have owner"),
(35, "does not have owner in"),
(36, "has custom field value"),
(37, "is shared by me"),
(38, "has custom fields"),
(39, "has custom field in"),
(40, "does not have custom field in"),
(41, "does not have custom field"),
(42, "custom fields query"),
],
verbose_name="rule type",
),
),
]

View File

@@ -22,6 +22,9 @@ from multiselectfield import MultiSelectField
if settings.AUDIT_LOG_ENABLED:
from auditlog.registry import auditlog
from django.db.models import Case
from django.db.models.functions import Cast
from django.db.models.functions import Substr
from django_softdelete.models import SoftDeleteModel
from documents.data_models import DocumentSource
@@ -519,6 +522,7 @@ class SavedViewFilterRule(models.Model):
(39, _("has custom field in")),
(40, _("does not have custom field in")),
(41, _("does not have custom field")),
(42, _("custom fields query")),
]
saved_view = models.ForeignKey(
@@ -921,6 +925,27 @@ class CustomFieldInstance(models.Model):
value_monetary = models.CharField(null=True, max_length=128)
value_monetary_amount = models.GeneratedField(
expression=Case(
# If the value starts with a number and no currency symbol, use the whole string
models.When(
value_monetary__regex=r"^\d+",
then=Cast(
Substr("value_monetary", 1),
output_field=models.DecimalField(decimal_places=2, max_digits=65),
),
),
# If the value starts with a 3-char currency symbol, use the rest of the string
default=Cast(
Substr("value_monetary", 4),
output_field=models.DecimalField(decimal_places=2, max_digits=65),
),
output_field=models.DecimalField(decimal_places=2, max_digits=65),
),
output_field=models.DecimalField(decimal_places=2, max_digits=65),
db_persist=True,
)
value_document_ids = models.JSONField(null=True)
value_select = models.PositiveSmallIntegerField(null=True)

View File

@@ -1,11 +1,9 @@
import json
import re
from collections.abc import Callable
from datetime import date
from unittest.mock import Mock
from urllib.parse import quote
import pytest
from django.contrib.auth.models import User
from rest_framework.test import APITestCase
@@ -13,7 +11,6 @@ from documents.models import CustomField
from documents.models import Document
from documents.serialisers import DocumentSerializer
from documents.tests.utils import DirectoriesMixin
from paperless import settings
class DocumentWrapper:
@@ -31,11 +28,7 @@ class DocumentWrapper:
return self._document.custom_fields.get(field__name=custom_field).value
def string_expr_opted_in(op):
return op in settings.CUSTOM_FIELD_LOOKUP_OPT_IN
class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
class TestCustomFieldsSearch(DirectoriesMixin, APITestCase):
def setUp(self):
super().setUp()
@@ -111,6 +104,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self._create_document(monetary_field="USD100.00")
self._create_document(monetary_field="USD1.00")
self._create_document(monetary_field="EUR50.00")
self._create_document(monetary_field="101.00")
# CustomField.FieldDataType.DOCUMENTLINK
self._create_document(documentlink_field=None)
@@ -188,7 +182,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
"/api/documents/?"
+ "&".join(
(
f"custom_field_lookup={query_string}",
f"custom_field_query={query_string}",
"ordering=archive_serial_number",
"page=1",
f"page_size={len(self.documents)}",
@@ -212,7 +206,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
"/api/documents/?"
+ "&".join(
(
f"custom_field_lookup={query_string}",
f"custom_field_query={query_string}",
"ordering=archive_serial_number",
"page=1",
f"page_size={len(self.documents)}",
@@ -313,32 +307,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
# ==========================================================#
# Expressions for string, URL, and monetary fields #
# ==========================================================#
@pytest.mark.skipif(
not string_expr_opted_in("iexact"),
reason="iexact expr is disabled.",
)
def test_iexact(self):
self._assert_query_match_predicate(
["string_field", "iexact", "paperless"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].lower() == "paperless",
)
@pytest.mark.skipif(
not string_expr_opted_in("contains"),
reason="contains expr is disabled.",
)
def test_contains(self):
# WARNING: SQLite treats "contains" as "icontains"!
# You should avoid "contains" unless you know what you are doing!
self._assert_query_match_predicate(
["string_field", "contains", "aper"],
lambda document: "string_field" in document
and document["string_field"] is not None
and "aper" in document["string_field"],
)
def test_icontains(self):
self._assert_query_match_predicate(
["string_field", "icontains", "aper"],
@@ -347,20 +315,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
and "aper" in document["string_field"].lower(),
)
@pytest.mark.skipif(
not string_expr_opted_in("startswith"),
reason="startswith expr is disabled.",
)
def test_startswith(self):
# WARNING: SQLite treats "startswith" as "istartswith"!
# You should avoid "startswith" unless you know what you are doing!
self._assert_query_match_predicate(
["string_field", "startswith", "paper"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].startswith("paper"),
)
def test_istartswith(self):
self._assert_query_match_predicate(
["string_field", "istartswith", "paper"],
@@ -369,20 +323,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
and document["string_field"].lower().startswith("paper"),
)
@pytest.mark.skipif(
not string_expr_opted_in("endswith"),
reason="endswith expr is disabled.",
)
def test_endswith(self):
# WARNING: SQLite treats "endswith" as "iendswith"!
# You should avoid "endswith" unless you know what you are doing!
self._assert_query_match_predicate(
["string_field", "iendswith", "less"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].lower().endswith("less"),
)
def test_iendswith(self):
self._assert_query_match_predicate(
["string_field", "iendswith", "less"],
@@ -391,32 +331,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
and document["string_field"].lower().endswith("less"),
)
@pytest.mark.skipif(
not string_expr_opted_in("regex"),
reason="regex expr is disabled.",
)
def test_regex(self):
# WARNING: the regex syntax is database dependent!
self._assert_query_match_predicate(
["string_field", "regex", r"^p.+s$"],
lambda document: "string_field" in document
and document["string_field"] is not None
and re.match(r"^p.+s$", document["string_field"]),
)
@pytest.mark.skipif(
not string_expr_opted_in("iregex"),
reason="iregex expr is disabled.",
)
def test_iregex(self):
# WARNING: the regex syntax is database dependent!
self._assert_query_match_predicate(
["string_field", "iregex", r"^p.+s$"],
lambda document: "string_field" in document
and document["string_field"] is not None
and re.match(r"^p.+s$", document["string_field"], re.IGNORECASE),
)
def test_url_field_istartswith(self):
# URL fields support all of the expressions above.
# Just showing one of them here.
@@ -427,28 +341,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
and document["url_field"].startswith("http://"),
)
@pytest.mark.skipif(
not string_expr_opted_in("iregex"),
reason="regex expr is disabled.",
)
def test_monetary_field_iregex(self):
# Monetary fields supports all of the expressions above.
# Just showing one of them here.
#
# Unfortunately we can't do arithmetic comparisons on monetary field,
# but you are welcome to use regex to do some of that.
# E.g., USD between 100.00 and 999.99:
self._assert_query_match_predicate(
["monetary_field", "regex", r"USD[1-9][0-9]{2}\.[0-9]{2}"],
lambda document: "monetary_field" in document
and document["monetary_field"] is not None
and re.match(
r"USD[1-9][0-9]{2}\.[0-9]{2}",
document["monetary_field"],
re.IGNORECASE,
),
)
# ==========================================================#
# Arithmetic comparisons #
# ==========================================================#
@@ -502,6 +394,17 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
and document["date_field"].year >= 2024,
)
def test_gt_monetary(self):
    """Check that ``gt`` on a monetary field compares the numeric amount.

    The query asks for amounts greater than 99 and must match stored
    values both with and without a currency prefix.
    """
    # Stored values the query is expected to match.
    expected_values = (
        "USD100.00",  # With currency symbol
        "101.00",  # No currency symbol
    )

    def matches(document):
        # Documents without the field, or with a null value, never match.
        if "monetary_field" not in document:
            return False
        return document["monetary_field"] in expected_values

    self._assert_query_match_predicate(
        ["monetary_field", "gt", "99"],
        matches,
    )
# ==========================================================#
# Subset check (document link field only) #
# ==========================================================#
@@ -586,68 +489,57 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
def test_invalid_json(self):
self._assert_validation_error(
"not valid json",
["custom_field_lookup"],
["custom_field_query"],
"must be valid JSON",
)
def test_invalid_expression(self):
self._assert_validation_error(
json.dumps("valid json but not valid expr"),
["custom_field_lookup"],
"Invalid custom field lookup expression",
["custom_field_query"],
"Invalid custom field query expression",
)
def test_invalid_custom_field_name(self):
self._assert_validation_error(
json.dumps(["invalid name", "iexact", "foo"]),
["custom_field_lookup", "0"],
["custom_field_query", "0"],
"is not a valid custom field",
)
def test_invalid_operator(self):
self._assert_validation_error(
json.dumps(["integer_field", "iexact", "foo"]),
["custom_field_lookup", "1"],
"does not support lookup expr",
["custom_field_query", "1"],
"does not support query expr",
)
def test_invalid_value(self):
self._assert_validation_error(
json.dumps(["select_field", "exact", "not an option"]),
["custom_field_lookup", "2"],
["custom_field_query", "2"],
"integer",
)
def test_invalid_logical_operator(self):
self._assert_validation_error(
json.dumps(["invalid op", ["integer_field", "gt", 0]]),
["custom_field_lookup", "0"],
["custom_field_query", "0"],
"Invalid logical operator",
)
def test_invalid_expr_list(self):
self._assert_validation_error(
json.dumps(["AND", "not a list"]),
["custom_field_lookup", "1"],
["custom_field_query", "1"],
"Invalid expression list",
)
def test_invalid_operator_prefix(self):
self._assert_validation_error(
json.dumps(["integer_field", "foo__gt", 0]),
["custom_field_lookup", "1"],
"does not support lookup expr",
)
@pytest.mark.skipif(
string_expr_opted_in("regex"),
reason="user opted into allowing regex expr",
)
def test_disabled_operator(self):
self._assert_validation_error(
json.dumps(["string_field", "regex", r"^p.+s$"]),
["custom_field_lookup", "1"],
"disabled by default",
["custom_field_query", "1"],
"does not support query expr",
)
def test_query_too_deep(self):
@@ -656,7 +548,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
query = ["NOT", query]
self._assert_validation_error(
json.dumps(query),
["custom_field_lookup", *(["1"] * 10)],
["custom_field_query", *(["1"] * 10)],
"Maximum nesting depth exceeded",
)
@@ -665,6 +557,6 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
query = ["AND", [atom for _ in range(21)]]
self._assert_validation_error(
json.dumps(query),
["custom_field_lookup", "1", "20"],
["custom_field_query", "1", "20"],
"Maximum number of query conditions exceeded",
)

View File

@@ -1195,20 +1195,3 @@ EMAIL_ENABLE_GPG_DECRYPTOR: Final[bool] = __get_boolean(
# Soft Delete #
###############################################################################
EMPTY_TRASH_DELAY = max(__get_int("PAPERLESS_EMPTY_TRASH_DELAY", 30), 1)
###############################################################################
# custom_field_lookup Filter Settings #
###############################################################################
CUSTOM_FIELD_LOOKUP_OPT_IN = __get_list(
"PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN",
default=[],
)
CUSTOM_FIELD_LOOKUP_MAX_DEPTH = __get_int(
"PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_DEPTH",
default=10,
)
CUSTOM_FIELD_LOOKUP_MAX_ATOMS = __get_int(
"PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_ATOMS",
default=20,
)