Feature: Enhanced backend custom field search API (#7589)

commit 910dae8413028f647e6295f30207cb5d4fc6605d
Author: Yichi Yang <yiy067@ucsd.edu>
Date:   Wed Sep 4 12:47:19 2024 -0700

    Fix: correctly handle the case where custom_field_lookup refers to multiple fields

commit e43f70d708b7d6b445f3ca8c8bf9dbdf5ee26085
Author: Yichi Yang <yiy067@ucsd.edu>
Date:   Sat Aug 31 14:06:45 2024 -0700

Co-Authored-By: Yichi Yang <yichiyan@usc.edu>
Committed by shamoon on 2024-09-23 11:28:31 -07:00
commit d7ba6d98d3 (parent f06ff85b7d)
7 changed files with 1270 additions and 38 deletions


@@ -235,12 +235,6 @@ results:
Pagination works exactly the same as it does for normal requests on this
endpoint.
Certain limitations apply to full text queries:
- Results are always sorted by search score. The results matching the
  query best will show up first.
- Only a small subset of filtering parameters are supported.
Furthermore, each returned document has an additional `__search_hit__`
attribute with various information about the search results:
@@ -280,6 +274,67 @@ attribute with various information about the search results:
- `rank` is the index of the search results. The first result will
have rank 0.
### Filtering by custom fields
You can filter documents by their custom field values by specifying the
`custom_field_lookup` query parameter. Here are some recipes for common
use cases:
1. Documents with a custom field "due" (date) between Aug 1, 2024 and
Sept 1, 2024 (inclusive):
`?custom_field_lookup=["due", "range", ["2024-08-01", "2024-09-01"]]`
2. Documents with a custom field "customer" (text) that equals "bob"
(case sensitive):
`?custom_field_lookup=["customer", "exact", "bob"]`
3. Documents with a custom field "answered" (boolean) set to `true`:
`?custom_field_lookup=["answered", "exact", true]`
4. Documents with a custom field "favorite animal" (select) set to either
"cat" or "dog":
`?custom_field_lookup=["favorite animal", "in", ["cat", "dog"]]`
5. Documents with a custom field "address" (text) that is empty:
`?custom_field_lookup=["OR", ["address", "isnull", true], ["address", "exact", ""]]`
6. Documents that don't have a field called "foo":
`?custom_field_lookup=["foo", "exists", false]`
7. Documents with a custom field "references" (document link) that links to both documents 3 and 7:
`?custom_field_lookup=["references", "contains", [3, 7]]`
All field types support basic operations including `exact`, `in`, `isnull`,
and `exists`. String, URL, and monetary fields support case-insensitive
substring matching operations including `icontains`, `istartswith`, and
`iendswith`. Integer, float, and date fields support arithmetic comparisons
including `gt` (>), `gte` (>=), `lt` (<), `lte` (<=), and `range`.
Lastly, document link fields support a `contains` operator that behaves
like an "is superset of" check.
!!! warning
It is possible to do case-insensitive exact match (i.e., `iexact`) and
case-sensitive substring match (i.e., `contains`, `startswith`,
`endswith`) for string, URL, and monetary fields, but
[they may not work as expected on some database backends](https://docs.djangoproject.com/en/5.1/ref/databases/#substring-matching-and-case-sensitivity).
It is also possible to use regular expressions to match string, URL, and
monetary fields, but the syntax is database-dependent, and accepting
regular expressions from untrusted sources could make your instance
vulnerable to regular expression denial of service attacks.
For these reasons the above expressions are disabled by default.
If you understand the implications, you may enable them by adding them to
`PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN` in your configuration file.
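For example, to opt into case-insensitive exact matching only, the
corresponding configuration entry would be (illustrative):
`PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN=iexact`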
### `/api/search/autocomplete/`
Get auto completions for a partial search term.


@@ -81,6 +81,7 @@
#PAPERLESS_THUMBNAIL_FONT_NAME=
#PAPERLESS_IGNORE_DATES=
#PAPERLESS_ENABLE_UPDATE_CHECK=
#PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN=iexact,contains,startswith,endswith,regex,iregex
# Tika settings


@@ -1,24 +1,36 @@
import functools
import inspect
import json
import operator
from contextlib import contextmanager
from typing import Callable
from typing import Union
from django.contrib.contenttypes.models import ContentType
from django.db.models import CharField
from django.db.models import Count
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models.functions import Cast
from django.utils.translation import gettext_lazy as _
from django_filters.rest_framework import BooleanFilter
from django_filters.rest_framework import Filter
from django_filters.rest_framework import FilterSet
from guardian.utils import get_group_obj_perms_model
from guardian.utils import get_user_obj_perms_model
from rest_framework import serializers
from rest_framework_guardian.filters import ObjectPermissionsFilter
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Log
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
from paperless import settings
CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"]
ID_KWARGS = ["in", "exact"]
@@ -182,6 +194,488 @@ class CustomFieldsFilter(Filter):
return qs
class SelectField(serializers.IntegerField):
def __init__(self, custom_field: CustomField):
self._options = custom_field.extra_data["select_options"]
super().__init__(min_value=0, max_value=len(self._options) - 1)
def to_internal_value(self, data):
if not isinstance(data, int):
# If the supplied value is not an integer,
# we will try to map it to an option index.
try:
data = self._options.index(data)
except ValueError:
pass
return super().to_internal_value(data)
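# Example (illustrative): with select_options ["A", "B", "C"], both the
# integer index 1 and the option name "B" validate to the internal value 1,
# since option names are mapped to their index before integer validation.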
def handle_validation_prefix(func: Callable):
"""
Catch ValidationErrors raised by the wrapped function
and add a prefix to the exception detail to track what causes the exception,
similar to nested serializers.
"""
def wrapper(*args, validation_prefix=None, **kwargs):
try:
return func(*args, **kwargs)
except serializers.ValidationError as e:
raise serializers.ValidationError({validation_prefix: e.detail})
# Update the signature to include the validation_prefix argument
old_sig = inspect.signature(func)
new_param = inspect.Parameter("validation_prefix", inspect.Parameter.KEYWORD_ONLY)
new_sig = old_sig.replace(parameters=[*old_sig.parameters.values(), new_param])
# Apply functools.wraps and manually set the new signature
functools.update_wrapper(wrapper, func)
wrapper.__signature__ = new_sig
return wrapper
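# Example (illustrative): with this decorator applied, an error raised while
# resolving the field of the atom ["foo", "exists", true] surfaces with its
# position encoded in the detail, e.g.
# {"custom_field_lookup": {"0": ["'foo' is not a valid custom field."]}},
# mirroring how nested serializers report errors.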
class CustomFieldLookupParser:
EXPR_BY_CATEGORY = {
"basic": ["exact", "in", "isnull", "exists"],
"string": [
"iexact",
"contains",
"icontains",
"startswith",
"istartswith",
"endswith",
"iendswith",
"regex",
"iregex",
],
"arithmetic": [
"gt",
"gte",
"lt",
"lte",
"range",
],
"containment": ["contains"],
}
# These string lookup expressions are problematic. We shall disable
# them by default unless the user explicitly opts in.
STR_EXPR_DISABLED_BY_DEFAULT = [
# SQLite: is case-sensitive outside the ASCII range
"iexact",
# SQLite: behaves the same as icontains
"contains",
# SQLite: behaves the same as istartswith
"startswith",
# SQLite: behaves the same as iendswith
"endswith",
# Syntax depends on database backends, can be exploited for ReDoS
"regex",
# Syntax depends on database backends, can be exploited for ReDoS
"iregex",
]
SUPPORTED_EXPR_CATEGORIES = {
CustomField.FieldDataType.STRING: ("basic", "string"),
CustomField.FieldDataType.URL: ("basic", "string"),
CustomField.FieldDataType.DATE: ("basic", "arithmetic"),
CustomField.FieldDataType.BOOL: ("basic",),
CustomField.FieldDataType.INT: ("basic", "arithmetic"),
CustomField.FieldDataType.FLOAT: ("basic", "arithmetic"),
CustomField.FieldDataType.MONETARY: ("basic", "string"),
CustomField.FieldDataType.DOCUMENTLINK: ("basic", "containment"),
CustomField.FieldDataType.SELECT: ("basic",),
}
DATE_COMPONENTS = [
"year",
"iso_year",
"month",
"day",
"week",
"week_day",
"iso_week_day",
"quarter",
]
def __init__(
self,
validation_prefix,
max_query_depth=10,
max_atom_count=20,
) -> None:
"""
A helper class that parses the query string into a `django.db.models.Q` for filtering
documents based on custom field values.
The syntax of the query expression is illustrated with the below pseudo code rules:
1. parse([`custom_field`, "exists", true]):
matches documents with Q(custom_fields__field=`custom_field`)
2. parse([`custom_field`, "exists", false]):
matches documents with ~Q(custom_fields__field=`custom_field`)
3. parse([`custom_field`, `op`, `value`]):
matches documents with
Q(custom_fields__field=`custom_field`, custom_fields__value_`type`__`op`= `value`)
4. parse(["AND", [`q0`, `q1`, ..., `qn`]])
-> parse(`q0`) & parse(`q1`) & ... & parse(`qn`)
5. parse(["OR", [`q0`, `q1`, ..., `qn`]])
-> parse(`q0`) | parse(`q1`) | ... | parse(`qn`)
6. parse(["NOT", `q`])
-> ~parse(`q`)
Args:
validation_prefix: Used to generate the ValidationError message.
max_query_depth: Limits the maximum nesting depth of queries.
max_atom_count: Limits the maximum number of atoms (i.e., rule 1, 2, 3) in the query.
`max_query_depth` and `max_atom_count` can be set to guard against generating arbitrarily
complex SQL queries.
"""
self._custom_fields: dict[Union[int, str], CustomField] = {}
self._validation_prefix = validation_prefix
# Dummy ModelSerializer used to convert a Django models.Field to serializers.Field.
self._model_serializer = serializers.ModelSerializer()
# Used for sanity check
self._max_query_depth = max_query_depth
self._max_atom_count = max_atom_count
self._current_depth = 0
self._atom_count = 0
# The set of annotations that we need to apply to the queryset
self._annotations = {}
def parse(self, query: str) -> tuple[Q, dict[str, Count]]:
"""
Parses the query string into a `django.db.models.Q`
and a set of annotations to be applied to the queryset.
"""
try:
expr = json.loads(query)
except json.JSONDecodeError:
raise serializers.ValidationError(
{self._validation_prefix: [_("Value must be valid JSON.")]},
)
return (
self._parse_expr(expr, validation_prefix=self._validation_prefix),
self._annotations,
)
@handle_validation_prefix
def _parse_expr(self, expr) -> Q:
"""
Applies rule (1, 2, 3) or (4, 5, 6) based on the length of the expr.
"""
with self._track_query_depth():
if isinstance(expr, (list, tuple)):
if len(expr) == 2:
return self._parse_logical_expr(*expr)
elif len(expr) == 3:
return self._parse_atom(*expr)
raise serializers.ValidationError(
[_("Invalid custom field lookup expression")],
)
@handle_validation_prefix
def _parse_expr_list(self, exprs) -> list[Q]:
"""
Handles [`q0`, `q1`, ..., `qn`] in rule 4 & 5.
"""
if not isinstance(exprs, (list, tuple)) or not exprs:
raise serializers.ValidationError(
[_("Invalid expression list. Must be nonempty.")],
)
return [
self._parse_expr(expr, validation_prefix=i) for i, expr in enumerate(exprs)
]
def _parse_logical_expr(self, op, args) -> Q:
"""
Handles rule 4, 5, 6.
"""
op_lower = op.lower()
if op_lower == "not":
return ~self._parse_expr(args, validation_prefix=1)
if op_lower == "and":
op_func = operator.and_
elif op_lower == "or":
op_func = operator.or_
else:
raise serializers.ValidationError(
{"0": [_("Invalid logical operator {op!r}").format(op=op)]},
)
qs = self._parse_expr_list(args, validation_prefix="1")
return functools.reduce(op_func, qs)
def _parse_atom(self, id_or_name, op, value) -> Q:
"""
Handles rule 1, 2, 3.
"""
# Guard against queries with too many conditions.
self._atom_count += 1
if self._atom_count > self._max_atom_count:
raise serializers.ValidationError(
[
_(
"Maximum number of query conditions exceeded. You can raise "
"the limit by setting PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_ATOMS "
"in your configuration file.",
),
],
)
custom_field = self._get_custom_field(id_or_name, validation_prefix="0")
op = self._validate_atom_op(custom_field, op, validation_prefix="1")
value = self._validate_atom_value(
custom_field,
op,
value,
validation_prefix="2",
)
# Needed because not all DB backends support Array __contains
if (
custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK
and op == "contains"
):
return self._parse_atom_doc_link_contains(custom_field, value)
value_field_name = CustomFieldInstance.get_value_field_name(
custom_field.data_type,
)
has_field = Q(custom_fields__field=custom_field)
# Our special exists operator.
if op == "exists":
field_filter = has_field if value else ~has_field
else:
field_filter = has_field & Q(
**{f"custom_fields__{value_field_name}__{op}": value},
)
# We need to use an annotation here because different atoms
# might be referring to different instances of custom fields.
annotation_name = f"_custom_field_filter_{len(self._annotations)}"
self._annotations[annotation_name] = Count("custom_fields", filter=field_filter)
return Q(**{f"{annotation_name}__gt": 0})
@handle_validation_prefix
def _get_custom_field(self, id_or_name):
"""Get the CustomField instance by id or name."""
if id_or_name in self._custom_fields:
return self._custom_fields[id_or_name]
kwargs = (
{"id": id_or_name} if isinstance(id_or_name, int) else {"name": id_or_name}
)
try:
custom_field = CustomField.objects.get(**kwargs)
except CustomField.DoesNotExist:
raise serializers.ValidationError(
[_("{name!r} is not a valid custom field.").format(name=id_or_name)],
)
self._custom_fields[custom_field.id] = custom_field
self._custom_fields[custom_field.name] = custom_field
return custom_field
@staticmethod
def _split_op(full_op):
*prefix, op = str(full_op).rsplit("__", maxsplit=1)
prefix = prefix[0] if prefix else None
return prefix, op
@handle_validation_prefix
def _validate_atom_op(self, custom_field, raw_op):
"""Check if the `op` is compatible with the type of the custom field."""
prefix, op = self._split_op(raw_op)
# Check if the operator is supported for the current data_type.
supported = False
for category in self.SUPPORTED_EXPR_CATEGORIES[custom_field.data_type]:
if (
category == "string"
and op in self.STR_EXPR_DISABLED_BY_DEFAULT
and op not in settings.CUSTOM_FIELD_LOOKUP_OPT_IN
):
raise serializers.ValidationError(
[
_(
"{expr!r} is disabled by default because it does not "
"behave consistently across database backends, or can "
"cause security risks. If you understand the implications "
"you may enabled it by adding it to "
"`PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN`.",
).format(expr=op),
],
)
if op in self.EXPR_BY_CATEGORY[category]:
supported = True
break
# Check prefix
if prefix is not None:
if (
prefix in self.DATE_COMPONENTS
and custom_field.data_type == CustomField.FieldDataType.DATE
):
pass # ok - e.g., "year__exact" for date field
else:
supported = False # anything else is invalid
if not supported:
raise serializers.ValidationError(
[
_("{data_type} does not support lookup expr {expr!r}.").format(
data_type=custom_field.data_type,
expr=raw_op,
),
],
)
return raw_op
def _get_serializer_field(self, custom_field, full_op):
"""Return a serializers.Field for value validation."""
prefix, op = self._split_op(full_op)
field = None
if op in ("isnull", "exists"):
# `isnull` and `exists` take either True or False regardless of the data_type.
field = serializers.BooleanField()
elif (
custom_field.data_type == CustomField.FieldDataType.DATE
and prefix in self.DATE_COMPONENTS
):
# DateField admits lookups in the form of `year__exact`, etc. These take integers.
field = serializers.IntegerField()
elif custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
# We can be more specific here and make sure the value is a list.
field = serializers.ListField(child=serializers.IntegerField())
elif custom_field.data_type == CustomField.FieldDataType.SELECT:
# We use this custom serializer field so SELECT options can be given by name as well as by index.
field = SelectField(custom_field)
elif custom_field.data_type == CustomField.FieldDataType.URL:
# For URL fields we don't need to be strict about validation (e.g., for istartswith).
field = serializers.CharField()
else:
# The general case: inferred from the corresponding field in CustomFieldInstance.
value_field_name = CustomFieldInstance.get_value_field_name(
custom_field.data_type,
)
model_field = CustomFieldInstance._meta.get_field(value_field_name)
field_name = model_field.deconstruct()[0]
field_class, field_kwargs = self._model_serializer.build_standard_field(
field_name,
model_field,
)
field = field_class(**field_kwargs)
field.allow_null = False
# Need to set allow_blank manually because of the inconsistency in CustomFieldInstance validation.
# See https://github.com/paperless-ngx/paperless-ngx/issues/7361.
if isinstance(field, serializers.CharField):
field.allow_blank = True
if op == "in":
# `in` takes a list of values.
field = serializers.ListField(child=field, allow_empty=False)
elif op == "range":
# `range` takes a list of values, i.e., [start, end].
field = serializers.ListField(
child=field,
min_length=2,
max_length=2,
)
return field
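# Example (illustrative): for a date field with op "range" this returns
# roughly ListField(child=DateField(), min_length=2, max_length=2), so the
# value must be a two-element list such as ["2024-08-01", "2024-09-01"].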
@handle_validation_prefix
def _validate_atom_value(self, custom_field, op, value):
"""Check if `value` is valid for the custom field and `op`. Returns the validated value."""
serializer_field = self._get_serializer_field(custom_field, op)
return serializer_field.run_validation(value)
def _parse_atom_doc_link_contains(self, custom_field, value) -> Q:
"""
Handles document link `contains` in a way that is supported by all DB backends.
"""
# If the value is an empty set,
# this is trivially true for any document with a non-null document link field.
if not value:
return Q(
custom_fields__field=custom_field,
custom_fields__value_document_ids__isnull=False,
)
# First we lookup reverse links from the requested documents.
links = CustomFieldInstance.objects.filter(
document_id__in=value,
field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
)
# Check if any of the requested IDs are missing.
missing_ids = set(value) - set(link.document_id for link in links)
if missing_ids:
# The result should be an empty set in this case.
return Q(id__in=[])
# Take the intersection of the reverse links - this should be what we are looking for.
document_ids_we_want = functools.reduce(
operator.and_,
(set(link.value_document_ids) for link in links),
)
return Q(id__in=document_ids_we_want)
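# Example (illustrative): document links are kept symmetric, so if document
# 3's link field holds {10, 11} and document 7's holds {11, 12}, then
# ["references", "contains", [3, 7]] reduces to Q(id__in={11}), the
# intersection of the two reverse-link sets.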
@contextmanager
def _track_query_depth(self):
# guard against queries that are too deeply nested
self._current_depth += 1
if self._current_depth > self._max_query_depth:
raise serializers.ValidationError(
[
_(
"Maximum nesting depth exceeded. You can raise the limit "
"by setting PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_DEPTH in "
"your configuration file.",
),
],
)
try:
yield
finally:
self._current_depth -= 1
class CustomFieldLookupFilter(Filter):
def __init__(self, validation_prefix):
"""
A filter that filters documents based on custom field name and value.
Args:
validation_prefix: Used to generate the ValidationError message.
"""
super().__init__()
self._validation_prefix = validation_prefix
def filter(self, qs, value):
if not value:
return qs
parser = CustomFieldLookupParser(
self._validation_prefix,
max_query_depth=settings.CUSTOM_FIELD_LOOKUP_MAX_DEPTH,
max_atom_count=settings.CUSTOM_FIELD_LOOKUP_MAX_ATOMS,
)
q, annotations = parser.parse(value)
return qs.annotate(**annotations).filter(q)
class DocumentFilterSet(FilterSet):
is_tagged = BooleanFilter(
label="Is tagged",
@@ -229,6 +723,8 @@ class DocumentFilterSet(FilterSet):
exclude=True,
)
custom_field_lookup = CustomFieldLookupFilter("custom_field_lookup")
shared_by__id = SharedByUser()
class Meta:

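For reference, each atom compiles to one `Count` annotation plus a filter on
it. A sketch of what the parser emits for `["due", "year__exact", 2024]`,
assuming `due_field` is the matching `CustomField` instance:

```python
from django.db.models import Count, Q

from documents.models import Document

# One annotation per atom counts the CustomFieldInstance rows that match
# both the field and the value lookup...
field_filter = Q(custom_fields__field=due_field) & Q(
    custom_fields__value_date__year__exact=2024,
)
qs = Document.objects.annotate(
    _custom_field_filter_0=Count("custom_fields", filter=field_filter),
    # ...and a document matches if at least one such row exists.
).filter(_custom_field_filter_0__gt=0)
```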

@@ -857,6 +857,18 @@ class CustomFieldInstance(models.Model):
and attached to a single Document to be metadata for it
"""
TYPE_TO_DATA_STORE_NAME_MAP = {
CustomField.FieldDataType.STRING: "value_text",
CustomField.FieldDataType.URL: "value_url",
CustomField.FieldDataType.DATE: "value_date",
CustomField.FieldDataType.BOOL: "value_bool",
CustomField.FieldDataType.INT: "value_int",
CustomField.FieldDataType.FLOAT: "value_float",
CustomField.FieldDataType.MONETARY: "value_monetary",
CustomField.FieldDataType.DOCUMENTLINK: "value_document_ids",
CustomField.FieldDataType.SELECT: "value_select",
}
created = models.DateTimeField(
_("created"),
default=timezone.now,
@@ -923,31 +935,21 @@
)
return str(self.field.name) + f" : {value}"
@classmethod
def get_value_field_name(cls, data_type: CustomField.FieldDataType):
try:
return cls.TYPE_TO_DATA_STORE_NAME_MAP[data_type]
except KeyError: # pragma: no cover
raise NotImplementedError(data_type)
@property
def value(self):
"""
Based on the data type, access the actual value the instance stores
A little shorthand/quick way to get what is actually here
"""
if self.field.data_type == CustomField.FieldDataType.STRING:
return self.value_text
elif self.field.data_type == CustomField.FieldDataType.URL:
return self.value_url
elif self.field.data_type == CustomField.FieldDataType.DATE:
return self.value_date
elif self.field.data_type == CustomField.FieldDataType.BOOL:
return self.value_bool
elif self.field.data_type == CustomField.FieldDataType.INT:
return self.value_int
elif self.field.data_type == CustomField.FieldDataType.FLOAT:
return self.value_float
elif self.field.data_type == CustomField.FieldDataType.MONETARY:
return self.value_monetary
elif self.field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
return self.value_document_ids
elif self.field.data_type == CustomField.FieldDataType.SELECT:
return self.value_select
raise NotImplementedError(self.field.data_type)
value_field_name = self.get_value_field_name(self.field.data_type)
return getattr(self, value_field_name)
if settings.AUDIT_LOG_ENABLED:

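The new `get_value_field_name` classmethod centralizes the type-to-column
mapping that was previously duplicated in the `value` property and the
serializer. A quick illustration:

```python
from documents.models import CustomField, CustomFieldInstance

# Maps each data type to the column that stores its value.
assert (
    CustomFieldInstance.get_value_field_name(CustomField.FieldDataType.DATE)
    == "value_date"
)
```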

@@ -578,23 +578,14 @@ class CustomFieldInstanceSerializer(serializers.ModelSerializer):
value = ReadWriteSerializerMethodField(allow_null=True)
def create(self, validated_data):
type_to_data_store_name_map = {
CustomField.FieldDataType.STRING: "value_text",
CustomField.FieldDataType.URL: "value_url",
CustomField.FieldDataType.DATE: "value_date",
CustomField.FieldDataType.BOOL: "value_bool",
CustomField.FieldDataType.INT: "value_int",
CustomField.FieldDataType.FLOAT: "value_float",
CustomField.FieldDataType.MONETARY: "value_monetary",
CustomField.FieldDataType.DOCUMENTLINK: "value_document_ids",
CustomField.FieldDataType.SELECT: "value_select",
}
# An instance is attached to a document
document: Document = validated_data["document"]
# And to a CustomField
custom_field: CustomField = validated_data["field"]
# This key must exist, as it is validated
data_store_name = type_to_data_store_name_map[custom_field.data_type]
data_store_name = CustomFieldInstance.get_value_field_name(
custom_field.data_type,
)
if custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
# prior to update so we can look for any docs that are going to be removed


@@ -0,0 +1,670 @@
import json
import re
from datetime import date
from typing import Callable
from unittest.mock import Mock
from urllib.parse import quote
import pytest
from django.contrib.auth.models import User
from rest_framework.test import APITestCase
from documents.models import CustomField
from documents.models import Document
from documents.serialisers import DocumentSerializer
from documents.tests.utils import DirectoriesMixin
from paperless import settings
class DocumentWrapper:
"""
Allows Pythonic access to the custom fields associated with the wrapped document.
"""
def __init__(self, document: Document) -> None:
self._document = document
def __contains__(self, custom_field: str) -> bool:
return self._document.custom_fields.filter(field__name=custom_field).exists()
def __getitem__(self, custom_field: str):
return self._document.custom_fields.get(field__name=custom_field).value
def string_expr_opted_in(op):
return op in settings.CUSTOM_FIELD_LOOKUP_OPT_IN
class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
def setUp(self):
super().setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=self.user)
# Create one custom field per type. The fields are called f"{type}_field".
self.custom_fields = {}
for data_type in CustomField.FieldDataType.values:
name = data_type + "_field"
self.custom_fields[name] = CustomField.objects.create(
name=name,
data_type=data_type,
)
# Add some options to the select_field
select = self.custom_fields["select_field"]
select.extra_data = {"select_options": ["A", "B", "C"]}
select.save()
# Now we will create some test documents
self.documents = []
# CustomField.FieldDataType.STRING
self._create_document(string_field=None)
self._create_document(string_field="")
self._create_document(string_field="paperless")
self._create_document(string_field="Paperless")
self._create_document(string_field="PAPERLESS")
self._create_document(string_field="pointless")
self._create_document(string_field="pointy")
# CustomField.FieldDataType.URL
self._create_document(url_field=None)
self._create_document(url_field="")
self._create_document(url_field="https://docs.paperless-ngx.com/")
self._create_document(url_field="https://www.django-rest-framework.org/")
self._create_document(url_field="http://example.com/")
# A document to check if the filter correctly associates field names with values.
# E.g., ["url_field", "exact", "https://docs.paperless-ngx.com/"] should not
# yield this document.
self._create_document(
string_field="https://docs.paperless-ngx.com/",
url_field="http://example.com/",
)
# CustomField.FieldDataType.DATE
self._create_document(date_field=None)
self._create_document(date_field=date(2023, 8, 22))
self._create_document(date_field=date(2024, 8, 22))
self._create_document(date_field=date(2024, 11, 15))
# CustomField.FieldDataType.BOOL
self._create_document(boolean_field=None)
self._create_document(boolean_field=True)
self._create_document(boolean_field=False)
# CustomField.FieldDataType.INT
self._create_document(integer_field=None)
self._create_document(integer_field=-1)
self._create_document(integer_field=0)
self._create_document(integer_field=1)
# CustomField.FieldDataType.FLOAT
self._create_document(float_field=None)
self._create_document(float_field=-1e9)
self._create_document(float_field=0.05)
self._create_document(float_field=270.0)
# CustomField.FieldDataType.MONETARY
self._create_document(monetary_field=None)
self._create_document(monetary_field="USD100.00")
self._create_document(monetary_field="USD1.00")
self._create_document(monetary_field="EUR50.00")
# CustomField.FieldDataType.DOCUMENTLINK
self._create_document(documentlink_field=None)
self._create_document(documentlink_field=[])
self._create_document(
documentlink_field=[
self.documents[0].id,
self.documents[1].id,
self.documents[2].id,
],
)
self._create_document(
documentlink_field=[self.documents[4].id, self.documents[5].id],
)
# CustomField.FieldDataType.SELECT
self._create_document(select_field=None)
self._create_document(select_field=0)
self._create_document(select_field=1)
self._create_document(select_field=2)
def _create_document(self, **kwargs):
title = str(kwargs)
document = Document.objects.create(
title=title,
checksum=title,
archive_serial_number=len(self.documents) + 1,
)
data = {
"custom_fields": [
{"field": self.custom_fields[name].id, "value": value}
for name, value in kwargs.items()
],
}
serializer = DocumentSerializer(
document,
data=data,
partial=True,
context={"request": Mock()},
)
serializer.is_valid(raise_exception=True)
serializer.save()
self.documents.append(document)
return document
def _assert_query_match_predicate(
self,
query: list,
reference_predicate: Callable[[DocumentWrapper], bool],
match_nothing_ok=False,
):
"""
Checks the results of the query against a callable reference predicate.
"""
reference_document_ids = [
document.id
for document in self.documents
if reference_predicate(DocumentWrapper(document))
]
# First sanity check our test cases
if not match_nothing_ok:
self.assertTrue(
reference_document_ids,
msg="Bad test case - should match at least one document.",
)
self.assertNotEqual(
len(reference_document_ids),
len(self.documents),
msg="Bad test case - should not match all documents.",
)
# Now make the API call.
query_string = quote(json.dumps(query), safe="")
response = self.client.get(
"/api/documents/?"
+ "&".join(
(
f"custom_field_lookup={query_string}",
"ordering=archive_serial_number",
"page=1",
f"page_size={len(self.documents)}",
"truncate_content=true",
),
),
)
self.assertEqual(response.status_code, 200, msg=str(response.json()))
response_document_ids = [
document["id"] for document in response.json()["results"]
]
self.assertEqual(reference_document_ids, response_document_ids)
def _assert_validation_error(self, query: str, path: list, keyword: str):
"""
Asserts that the query raises a validation error.
Checks the message to make sure it points to the right place.
"""
query_string = quote(query, safe="")
response = self.client.get(
"/api/documents/?"
+ "&".join(
(
f"custom_field_lookup={query_string}",
"ordering=archive_serial_number",
"page=1",
f"page_size={len(self.documents)}",
"truncate_content=true",
),
),
)
self.assertEqual(response.status_code, 400)
exception_path = []
detail = response.json()
while not isinstance(detail, list):
path_item, detail = next(iter(detail.items()))
exception_path.append(path_item)
self.assertEqual(path, exception_path)
self.assertIn(keyword, " ".join(detail))
# ==========================================================#
# Sanity checks #
# ==========================================================#
def test_name_value_association(self):
"""
GIVEN:
- A document with `{"string_field": "https://docs.paperless-ngx.com/",
"url_field": "http://example.com/"}`
WHEN:
- Filtering by `["url_field", "exact", "https://docs.paperless-ngx.com/"]`
THEN:
- That document should not get matched.
"""
self._assert_query_match_predicate(
["url_field", "exact", "https://docs.paperless-ngx.com/"],
lambda document: "url_field" in document
and document["url_field"] == "https://docs.paperless-ngx.com/",
)
def test_filter_by_multiple_fields(self):
"""
GIVEN:
- A document with `{"string_field": "https://docs.paperless-ngx.com/",
"url_field": "http://example.com/"}`
WHEN:
- Filtering by `['AND', [["string_field", "exists", True], ["url_field", "exists", True]]]`
THEN:
- That document should get matched.
"""
self._assert_query_match_predicate(
["AND", [["string_field", "exists", True], ["url_field", "exists", True]]],
lambda document: "url_field" in document and "string_field" in document,
)
# ==========================================================#
# Basic expressions supported by all custom field types #
# ==========================================================#
def test_exact(self):
self._assert_query_match_predicate(
["string_field", "exact", "paperless"],
lambda document: "string_field" in document
and document["string_field"] == "paperless",
)
def test_in(self):
self._assert_query_match_predicate(
["string_field", "in", ["paperless", "Paperless"]],
lambda document: "string_field" in document
and document["string_field"] in ("paperless", "Paperless"),
)
def test_isnull(self):
self._assert_query_match_predicate(
["string_field", "isnull", True],
lambda document: "string_field" in document
and document["string_field"] is None,
)
def test_exists(self):
self._assert_query_match_predicate(
["string_field", "exists", True],
lambda document: "string_field" in document,
)
def test_select(self):
# For select fields, you can either specify the index
# or the name of the option. They function exactly the same.
self._assert_query_match_predicate(
["select_field", "exact", 1],
lambda document: "select_field" in document
and document["select_field"] == 1,
)
# This is the same as:
self._assert_query_match_predicate(
["select_field", "exact", "B"],
lambda document: "select_field" in document
and document["select_field"] == 1,
)
# ==========================================================#
# Expressions for string, URL, and monetary fields #
# ==========================================================#
@pytest.mark.skipif(
not string_expr_opted_in("iexact"),
reason="iexact expr is disabled.",
)
def test_iexact(self):
self._assert_query_match_predicate(
["string_field", "iexact", "paperless"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].lower() == "paperless",
)
@pytest.mark.skipif(
not string_expr_opted_in("contains"),
reason="contains expr is disabled.",
)
def test_contains(self):
# WARNING: SQLite treats "contains" as "icontains"!
# You should avoid "contains" unless you know what you are doing!
self._assert_query_match_predicate(
["string_field", "contains", "aper"],
lambda document: "string_field" in document
and document["string_field"] is not None
and "aper" in document["string_field"],
)
def test_icontains(self):
self._assert_query_match_predicate(
["string_field", "icontains", "aper"],
lambda document: "string_field" in document
and document["string_field"] is not None
and "aper" in document["string_field"].lower(),
)
@pytest.mark.skipif(
not string_expr_opted_in("startswith"),
reason="startswith expr is disabled.",
)
def test_startswith(self):
# WARNING: SQLite treats "startswith" as "istartswith"!
# You should avoid "startswith" unless you know what you are doing!
self._assert_query_match_predicate(
["string_field", "startswith", "paper"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].startswith("paper"),
)
def test_istartswith(self):
self._assert_query_match_predicate(
["string_field", "istartswith", "paper"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].lower().startswith("paper"),
)
@pytest.mark.skipif(
not string_expr_opted_in("endswith"),
reason="endswith expr is disabled.",
)
def test_endswith(self):
# WARNING: SQLite treats "endswith" as "iendswith"!
# You should avoid "endswith" unless you know what you are doing!
self._assert_query_match_predicate(
["string_field", "iendswith", "less"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].lower().endswith("less"),
)
def test_iendswith(self):
self._assert_query_match_predicate(
["string_field", "iendswith", "less"],
lambda document: "string_field" in document
and document["string_field"] is not None
and document["string_field"].lower().endswith("less"),
)
@pytest.mark.skipif(
not string_expr_opted_in("regex"),
reason="regex expr is disabled.",
)
def test_regex(self):
# WARNING: the regex syntax is database dependent!
self._assert_query_match_predicate(
["string_field", "regex", r"^p.+s$"],
lambda document: "string_field" in document
and document["string_field"] is not None
and re.match(r"^p.+s$", document["string_field"]),
)
@pytest.mark.skipif(
not string_expr_opted_in("iregex"),
reason="iregex expr is disabled.",
)
def test_iregex(self):
# WARNING: the regex syntax is database dependent!
self._assert_query_match_predicate(
["string_field", "iregex", r"^p.+s$"],
lambda document: "string_field" in document
and document["string_field"] is not None
and re.match(r"^p.+s$", document["string_field"], re.IGNORECASE),
)
def test_url_field_istartswith(self):
# URL fields support all of the expressions above.
# Just showing one of them here.
self._assert_query_match_predicate(
["url_field", "istartswith", "http://"],
lambda document: "url_field" in document
and document["url_field"] is not None
and document["url_field"].startswith("http://"),
)
@pytest.mark.skipif(
not string_expr_opted_in("iregex"),
reason="regex expr is disabled.",
)
def test_monetary_field_iregex(self):
# Monetary fields support all of the expressions above.
# Just showing one of them here.
#
# Unfortunately we can't do arithmetic comparisons on monetary fields,
# but you are welcome to use regex to do some of that.
# E.g., USD between 100.00 and 999.99:
self._assert_query_match_predicate(
["monetary_field", "regex", r"USD[1-9][0-9]{2}\.[0-9]{2}"],
lambda document: "monetary_field" in document
and document["monetary_field"] is not None
and re.match(
r"USD[1-9][0-9]{2}\.[0-9]{2}",
document["monetary_field"],
re.IGNORECASE,
),
)
# ==========================================================#
# Arithmetic comparisons #
# ==========================================================#
def test_gt(self):
self._assert_query_match_predicate(
["date_field", "gt", date(2024, 8, 22).isoformat()],
lambda document: "date_field" in document
and document["date_field"] is not None
and document["date_field"] > date(2024, 8, 22),
)
def test_gte(self):
self._assert_query_match_predicate(
["date_field", "gte", date(2024, 8, 22).isoformat()],
lambda document: "date_field" in document
and document["date_field"] is not None
and document["date_field"] >= date(2024, 8, 22),
)
def test_lt(self):
self._assert_query_match_predicate(
["integer_field", "lt", 0],
lambda document: "integer_field" in document
and document["integer_field"] is not None
and document["integer_field"] < 0,
)
def test_lte(self):
self._assert_query_match_predicate(
["integer_field", "lte", 0],
lambda document: "integer_field" in document
and document["integer_field"] is not None
and document["integer_field"] <= 0,
)
def test_range(self):
self._assert_query_match_predicate(
["float_field", "range", [-0.05, 0.05]],
lambda document: "float_field" in document
and document["float_field"] is not None
and -0.05 <= document["float_field"] <= 0.05,
)
def test_date_modifier(self):
# For date fields you can optionally prefix the operator
# with the part of the date you are comparing with.
self._assert_query_match_predicate(
["date_field", "year__gte", 2024],
lambda document: "date_field" in document
and document["date_field"] is not None
and document["date_field"].year >= 2024,
)
# ==========================================================#
# Subset check (document link field only) #
# ==========================================================#
def test_document_link_contains(self):
# Document link field "contains" performs a subset check.
self._assert_query_match_predicate(
["documentlink_field", "contains", [1, 2]],
lambda document: "documentlink_field" in document
and document["documentlink_field"] is not None
and set(document["documentlink_field"]) >= {1, 2},
)
# The order of IDs doesn't matter - this is the same as above.
self._assert_query_match_predicate(
["documentlink_field", "contains", [2, 1]],
lambda document: "documentlink_field" in document
and document["documentlink_field"] is not None
and set(document["documentlink_field"]) >= {1, 2},
)
def test_document_link_contains_empty_set(self):
# An empty set is a subset of any set.
self._assert_query_match_predicate(
["documentlink_field", "contains", []],
lambda document: "documentlink_field" in document
and document["documentlink_field"] is not None,
)
def test_document_link_contains_no_reverse_link(self):
# An edge case is that the document in the value list
# doesn't have a document link field and thus has no reverse link.
self._assert_query_match_predicate(
["documentlink_field", "contains", [self.documents[6].id]],
lambda document: "documentlink_field" in document
and document["documentlink_field"] is not None
and set(document["documentlink_field"]) >= {self.documents[6].id},
match_nothing_ok=True,
)
# ==========================================================#
# Logical expressions #
# ==========================================================#
def test_logical_and(self):
self._assert_query_match_predicate(
[
"AND",
[["date_field", "year__exact", 2024], ["date_field", "month__lt", 9]],
],
lambda document: "date_field" in document
and document["date_field"] is not None
and document["date_field"].year == 2024
and document["date_field"].month < 9,
)
def test_logical_or(self):
# This is also the recommended way to check for "empty" text, URL, and monetary fields.
self._assert_query_match_predicate(
[
"OR",
[["string_field", "exact", ""], ["string_field", "isnull", True]],
],
lambda document: "string_field" in document
and not bool(document["string_field"]),
)
def test_logical_not(self):
# This means `NOT ((document has string_field) AND (string_field exact "paperless"))`,
# not `(document has string_field) AND (NOT (string_field exact "paperless"))`!
self._assert_query_match_predicate(
[
"NOT",
["string_field", "exact", "paperless"],
],
lambda document: not (
"string_field" in document and document["string_field"] == "paperless"
),
)
# ==========================================================#
# Tests for invalid queries #
# ==========================================================#
def test_invalid_json(self):
self._assert_validation_error(
"not valid json",
["custom_field_lookup"],
"must be valid JSON",
)
def test_invalid_expression(self):
self._assert_validation_error(
json.dumps("valid json but not valid expr"),
["custom_field_lookup"],
"Invalid custom field lookup expression",
)
def test_invalid_custom_field_name(self):
self._assert_validation_error(
json.dumps(["invalid name", "iexact", "foo"]),
["custom_field_lookup", "0"],
"is not a valid custom field",
)
def test_invalid_operator(self):
self._assert_validation_error(
json.dumps(["integer_field", "iexact", "foo"]),
["custom_field_lookup", "1"],
"does not support lookup expr",
)
def test_invalid_value(self):
self._assert_validation_error(
json.dumps(["select_field", "exact", "not an option"]),
["custom_field_lookup", "2"],
"integer",
)
def test_invalid_logical_operator(self):
self._assert_validation_error(
json.dumps(["invalid op", ["integer_field", "gt", 0]]),
["custom_field_lookup", "0"],
"Invalid logical operator",
)
def test_invalid_expr_list(self):
self._assert_validation_error(
json.dumps(["AND", "not a list"]),
["custom_field_lookup", "1"],
"Invalid expression list",
)
def test_invalid_operator_prefix(self):
self._assert_validation_error(
json.dumps(["integer_field", "foo__gt", 0]),
["custom_field_lookup", "1"],
"does not support lookup expr",
)
@pytest.mark.skipif(
string_expr_opted_in("regex"),
reason="user opted into allowing regex expr",
)
def test_disabled_operator(self):
self._assert_validation_error(
json.dumps(["string_field", "regex", r"^p.+s$"]),
["custom_field_lookup", "1"],
"disabled by default",
)
def test_query_too_deep(self):
query = ["string_field", "exact", "paperless"]
for _ in range(10):
query = ["NOT", query]
self._assert_validation_error(
json.dumps(query),
["custom_field_lookup", *(["1"] * 10)],
"Maximum nesting depth exceeded",
)
def test_query_too_many_atoms(self):
atom = ["string_field", "exact", "paperless"]
query = ["AND", [atom for _ in range(21)]]
self._assert_validation_error(
json.dumps(query),
["custom_field_lookup", "1", "20"],
"Maximum number of query conditions exceeded",
)


@@ -1192,6 +1192,23 @@ EMAIL_ENABLE_GPG_DECRYPTOR: Final[bool] = __get_boolean(
###############################################################################
# Soft Delete
# Soft Delete #
###############################################################################
EMPTY_TRASH_DELAY = max(__get_int("PAPERLESS_EMPTY_TRASH_DELAY", 30), 1)
###############################################################################
# custom_field_lookup Filter Settings #
###############################################################################
CUSTOM_FIELD_LOOKUP_OPT_IN = __get_list(
"PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN",
default=[],
)
CUSTOM_FIELD_LOOKUP_MAX_DEPTH = __get_int(
"PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_DEPTH",
default=10,
)
CUSTOM_FIELD_LOOKUP_MAX_ATOMS = __get_int(
"PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_ATOMS",
default=20,
)
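Putting the three settings together, a deployment that opts into `iexact` and
raises both limits might configure (values illustrative):
PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN=iexact
PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_DEPTH=15
PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_ATOMS=40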