Compare commits

..

12 Commits

Author SHA1 Message Date
shamoon
e6da2a94d1 coverage, unify error tests 2025-09-15 01:23:04 -07:00
shamoon
8d0581177e Update serialisers.py 2025-09-15 00:19:48 -07:00
shamoon
3ef0b89e6c typo 2025-09-15 00:01:19 -07:00
shamoon
42463d68a0 simplify 2025-09-14 23:56:36 -07:00
shamoon
4c2e361762 Update views.py 2025-09-14 23:54:35 -07:00
shamoon
10c254e96d Maybe handle this backwards compat 2025-09-14 22:57:04 -07:00
shamoon
90b2f694c0 Update api.md 2025-09-14 22:22:53 -07:00
shamoon
c02907ff37 Update serialisers.py 2025-09-14 22:21:54 -07:00
shamoon
a2d89e7633 Actually lets unify it 2025-09-14 22:19:36 -07:00
shamoon
1d6cdf7b1d Use json str, normalize keys 2025-09-14 21:37:37 -07:00
shamoon
5a8b470673 Add negative test 2025-09-14 19:47:57 -07:00
shamoon
b2f1c5a6af Enhancement: support custom field values with post document endpoint 2025-09-14 19:38:52 -07:00
6 changed files with 149 additions and 18 deletions

View File

@@ -192,8 +192,8 @@ The endpoint supports the following optional form fields:
- `tags`: Similar to correspondent. Specify this multiple times to
have multiple tags added to the document.
- `archive_serial_number`: An optional archive serial number to set.
- `custom_fields`: An array of custom field ids to assign (with an empty
value) to the document.
- `custom_fields`: Either an array of custom field ids to assign (with an empty
value) to the document, or a JSON object mapping custom field id to value.
The endpoint will immediately return HTTP 200 if the document consumption
process was started successfully, with the UUID of the consumption task

View File

@@ -50,7 +50,7 @@ dependencies = [
"jinja2~=3.1.5",
"langdetect~=1.0.9",
"nltk~=3.9.1",
"ocrmypdf~=16.11.0",
"ocrmypdf~=16.10.0",
"pathvalidate~=3.3.1",
"pdf2image~=1.17.0",
"psycopg-pool",

View File

@@ -1668,9 +1668,8 @@ class PostDocumentSerializer(serializers.Serializer):
max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX,
)
custom_fields = serializers.PrimaryKeyRelatedField(
many=True,
queryset=CustomField.objects.all(),
# Accept either a list of custom field ids or a dict mapping id -> value
custom_fields = serializers.JSONField(
label="Custom fields",
write_only=True,
required=False,
@@ -1727,11 +1726,60 @@ class PostDocumentSerializer(serializers.Serializer):
return None
def validate_custom_fields(self, custom_fields):
    """
    Validate and normalize the ``custom_fields`` upload parameter.

    Accepted (already JSON-decoded) payloads:
      * a single custom field id,
      * a list of custom field ids (fields are assigned without a value),
      * an object mapping custom field id -> value.

    Returns ``None`` for an empty payload, a list of integer ids for the
    id / list forms, or a ``{int id: value}`` dict for the mapping form.

    Raises ``serializers.ValidationError`` when the structure is not one of
    the above, an id is not an integer, a referenced custom field does not
    exist, or ``CustomFieldInstanceSerializer.validate`` rejects a value.
    """
    if not custom_fields:
        return None

    def invalid_structure():
        # Single place for the shared "wrong shape" error.
        return serializers.ValidationError(
            _(
                "Custom fields must be a list of integers or an object mapping ids to values.",
            ),
        )

    # Normalize a single id to a list. ``bool`` is a subclass of ``int``, so
    # it is excluded explicitly: a JSON ``true`` must not become field id 1.
    if isinstance(custom_fields, int) and not isinstance(custom_fields, bool):
        custom_fields = [custom_fields]

    if isinstance(custom_fields, dict):
        custom_field_serializer = CustomFieldInstanceSerializer()
        normalized = {}
        for field_id, value in custom_fields.items():
            try:
                if isinstance(field_id, bool):
                    raise TypeError
                field_id_int = int(field_id)
            except (TypeError, ValueError):
                raise serializers.ValidationError(
                    _("Custom field id must be an integer: %(id)s")
                    % {"id": field_id},
                ) from None
            try:
                field = CustomField.objects.get(id=field_id_int)
            except CustomField.DoesNotExist:
                raise serializers.ValidationError(
                    _("Custom field with id %(id)s does not exist")
                    % {"id": field_id_int},
                ) from None
            # Reuse the per-instance validation so the value is checked
            # against the field; it raises ValidationError on failure.
            custom_field_serializer.validate(
                {
                    "field": field,
                    "value": value,
                },
            )
            normalized[field_id_int] = value
        return normalized

    if isinstance(custom_fields, list):
        try:
            if any(isinstance(i, bool) for i in custom_fields):
                raise TypeError
            ids = [int(i) for i in custom_fields]
        except (TypeError, ValueError):
            raise invalid_structure() from None
        # NOTE(review): comparing against len(set(ids)) only detects missing
        # fields; repeated ids are tolerated even though the message mentions
        # them. Confirm whether duplicates should be rejected.
        if CustomField.objects.filter(id__in=ids).count() != len(set(ids)):
            raise serializers.ValidationError(
                _("Some custom fields don't exist or were specified twice."),
            )
        return ids

    raise invalid_structure()
# Both custom_fields forms (id list and id -> value mapping) are handled by validate_custom_fields
def validate_created(self, created):
# support datetime format for created for backwards compatibility
if isinstance(created, datetime):

View File

@@ -1,4 +1,5 @@
import datetime
import json
import shutil
import tempfile
import uuid
@@ -1537,6 +1538,86 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
overrides.update(new_overrides)
self.assertEqual(overrides.custom_fields, {cf.id: None, cf2.id: 123})
def test_upload_with_custom_field_values(self):
    """
    GIVEN: A sample PDF and two custom fields (string and integer)
    WHEN: The PDF is posted with a JSON object mapping field ids to values
    THEN: The consume task receives those values as custom field overrides
    """
    self.consume_file_mock.return_value = celery.result.AsyncResult(
        id=str(uuid.uuid4()),
    )

    string_field = CustomField.objects.create(
        name="stringfield",
        data_type=CustomField.FieldDataType.STRING,
    )
    int_field = CustomField.objects.create(
        name="intfield",
        data_type=CustomField.FieldDataType.INT,
    )
    expected_values = {string_field.id: "a string", int_field.id: 123}
    # JSON object keys are strings, so the ids are stringified for the payload.
    encoded_fields = json.dumps(
        {str(field_id): value for field_id, value in expected_values.items()},
    )

    sample_path = Path(__file__).parent / "samples" / "simple.pdf"
    with sample_path.open("rb") as document:
        response = self.client.post(
            "/api/documents/post_document/",
            {"document": document, "custom_fields": encoded_fields},
        )

    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.consume_file_mock.assert_called_once()

    input_doc, overrides = self.get_last_consume_delay_call_args()
    self.assertEqual(input_doc.original_file.name, "simple.pdf")
    self.assertEqual(overrides.filename, "simple.pdf")
    self.assertEqual(overrides.custom_fields, expected_values)
def test_upload_with_custom_fields_errors(self):
    """
    GIVEN: A document with a source file
    WHEN: Upload the document with invalid custom fields payloads
    THEN: The upload is rejected
    """
    self.consume_file_mock.return_value = celery.result.AsyncResult(
        id=str(uuid.uuid4()),
    )

    error_payloads = [
        # Non-integer key in mapping
        {"custom_fields": json.dumps({"abc": "a string"})},
        # List with non-integer entry
        {"custom_fields": json.dumps(["abc"])},
        # Nonexistent id in mapping
        {"custom_fields": json.dumps({99999999: "a string"})},
        # Nonexistent id in list
        {"custom_fields": json.dumps([99999999])},
        # Invalid type (JSON string, not list/dict/int)
        {"custom_fields": json.dumps("not-a-supported-structure")},
    ]

    sample_path = Path(__file__).parent / "samples" / "simple.pdf"
    for payload in error_payloads:
        # subTest names the offending payload in the failure report and lets
        # the remaining payloads still run after one of them fails.
        with self.subTest(payload=payload), sample_path.open("rb") as f:
            response = self.client.post(
                "/api/documents/post_document/",
                {"document": f, **payload},
            )
            self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
            self.consume_file_mock.assert_not_called()
def test_upload_with_webui_source(self):
"""
GIVEN: A document with a source file

View File

@@ -1497,7 +1497,7 @@ class PostDocumentView(GenericAPIView):
title = serializer.validated_data.get("title")
created = serializer.validated_data.get("created")
archive_serial_number = serializer.validated_data.get("archive_serial_number")
custom_field_ids = serializer.validated_data.get("custom_fields")
cf = serializer.validated_data.get("custom_fields")
from_webui = serializer.validated_data.get("from_webui")
t = int(mktime(datetime.now().timetuple()))
@@ -1516,6 +1516,11 @@ class PostDocumentView(GenericAPIView):
source=DocumentSource.WebUI if from_webui else DocumentSource.ApiUpload,
original_file=temp_file_path,
)
custom_fields = None
if isinstance(cf, dict) and cf:
custom_fields = cf
elif isinstance(cf, list) and cf:
custom_fields = dict.fromkeys(cf, None)
input_doc_overrides = DocumentMetadataOverrides(
filename=doc_name,
title=title,
@@ -1526,10 +1531,7 @@ class PostDocumentView(GenericAPIView):
created=created,
asn=archive_serial_number,
owner_id=request.user.id,
# TODO: set values
custom_fields={cf_id: None for cf_id in custom_field_ids}
if custom_field_ids
else None,
custom_fields=custom_fields,
)
async_task = consume_file.delay(

8
uv.lock generated
View File

@@ -1982,7 +1982,7 @@ wheels = [
[[package]]
name = "ocrmypdf"
version = "16.11.0"
version = "16.10.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "deprecation", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1995,9 +1995,9 @@ dependencies = [
{ name = "pluggy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rich", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/af/947d6abb0cb41f99971a7a4bd33684d3cee20c9e32c8f9dc90e8c5dcf21c/ocrmypdf-16.11.0.tar.gz", hash = "sha256:d89077e503238dac35c6e565925edc8d98b71e5289853c02cacbc1d0901f1be7", size = 7015068, upload-time = "2025-09-12T08:36:53.507Z" }
sdist = { url = "https://files.pythonhosted.org/packages/cd/40/cb85e6260e5a20d08195d03541b31db4296f8f4d3442ee595686f47a75b0/ocrmypdf-16.10.4.tar.gz", hash = "sha256:de749ef5f554b63d57e68d032e7cba5500cbd5030835bf24f658f7b7a04f3dc1", size = 7003649, upload-time = "2025-07-07T20:55:01.735Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/b2/eda3bb0939bf81d889812dd82cf37fa6f8769af8e31008bd586ba12fae09/ocrmypdf-16.11.0-py3-none-any.whl", hash = "sha256:13628294a309c85b21947b5c7bc7fcd202464517c14b71a050adc9dde85c48f7", size = 162883, upload-time = "2025-09-12T08:36:51.611Z" },
{ url = "https://files.pythonhosted.org/packages/8e/6a/53bb2b0e57f8ca8d4a021194202cc772d1ce049269e9b0cb88d1fa87a0ef/ocrmypdf-16.10.4-py3-none-any.whl", hash = "sha256:061f3165d09ffafac975cea00803802b8a75551ada9965292ea86ea382673688", size = 162559, upload-time = "2025-07-07T20:55:00.061Z" },
]
[[package]]
@@ -2194,7 +2194,7 @@ requires-dist = [
{ name = "langdetect", specifier = "~=1.0.9" },
{ name = "mysqlclient", marker = "extra == 'mariadb'", specifier = "~=2.2.7" },
{ name = "nltk", specifier = "~=3.9.1" },
{ name = "ocrmypdf", specifier = "~=16.11.0" },
{ name = "ocrmypdf", specifier = "~=16.10.0" },
{ name = "pathvalidate", specifier = "~=3.3.1" },
{ name = "pdf2image", specifier = "~=1.17.0" },
{ name = "psycopg", extras = ["c", "pool"], marker = "extra == 'postgres'", specifier = "==3.2.9" },