Fix: utc-normalize natural dates for whoosh

This commit is contained in:
shamoon 2025-07-19 21:00:07 -07:00
parent 4b8f6ed643
commit 9f55626ba6
2 changed files with 86 additions and 0 deletions

View File

@ -2,10 +2,12 @@ from __future__ import annotations
import logging import logging
import math import math
import re
from collections import Counter from collections import Counter
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime from datetime import datetime
from datetime import time from datetime import time
from datetime import timedelta
from datetime import timezone from datetime import timezone
from shutil import rmtree from shutil import rmtree
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@ -13,6 +15,8 @@ from typing import Literal
from django.conf import settings from django.conf import settings
from django.utils import timezone as django_timezone from django.utils import timezone as django_timezone
from django.utils.timezone import get_current_timezone
from django.utils.timezone import now
from guardian.shortcuts import get_users_with_perms from guardian.shortcuts import get_users_with_perms
from whoosh import classify from whoosh import classify
from whoosh import highlight from whoosh import highlight
@ -344,6 +348,7 @@ class LocalDateParser(English):
class DelayedFullTextQuery(DelayedQuery): class DelayedFullTextQuery(DelayedQuery):
def _get_query(self) -> tuple: def _get_query(self) -> tuple:
q_str = self.query_params["query"] q_str = self.query_params["query"]
q_str = rewrite_natural_date_keywords(q_str)
qp = MultifieldParser( qp = MultifieldParser(
[ [
"content", "content",
@ -450,3 +455,47 @@ def get_permissions_criterias(user: User | None = None) -> list:
query.Term("viewer_id", str(user.id)), query.Term("viewer_id", str(user.id)),
) )
return user_criterias return user_criterias
def rewrite_natural_date_keywords(query_string: str) -> str:
    """
    Rewrite natural date keywords into explicit whoosh datetime ranges.

    Every occurrence of `added:today`, `added:yesterday`, `created:today` or
    `created:yesterday` is replaced by `<field>:[START TO END]`, where START
    and END are the first and last second of that day in the current Django
    time zone, converted to UTC and formatted as `YYYYmmddHHMMSS`.
    This prevents UTC confusion when searching with natural language date
    keywords.

    Args:
        query_string: The raw query string of the search request.

    Returns:
        The query string with all supported keywords replaced. A query that
        contains no supported keyword is returned unchanged.
    """
    # One pattern is used for both detection and replacement so the two can
    # never disagree (e.g. on `dateadded:today`, which must stay untouched).
    keyword_re = re.compile(r"\b(added|created):(today|yesterday)\b")

    # Most queries carry no date keyword; skip all time zone work for them.
    if keyword_re.search(query_string) is None:
        return query_string

    tz = get_current_timezone()
    today = now().astimezone(tz).date()
    yesterday = today - timedelta(days=1)

    def _utc_stamp(day, clock) -> str:
        # Local wall-clock time on `day`, expressed in UTC for the query.
        local_dt = datetime.combine(day, clock, tzinfo=tz)
        return local_dt.astimezone(timezone.utc).strftime("%Y%m%d%H%M%S")

    # strftime drops the microseconds of time.max, so each range ends at
    # 23:59:59 local time.
    ranges = {
        "today": (_utc_stamp(today, time.min), _utc_stamp(today, time.max)),
        "yesterday": (
            _utc_stamp(yesterday, time.min),
            _utc_stamp(yesterday, time.max),
        ),
    }

    def _to_range(match: re.Match) -> str:
        field, keyword = match.groups()
        start_str, end_str = ranges[keyword]
        range_expr = f"{field}:[{start_str} TO {end_str}]"
        # Diagnostic only: keep it out of the default (warning) log output.
        logger.debug("Rewrote %s to %s", match.group(0), range_expr)
        return range_expr

    return keyword_re.sub(_to_range, query_string)

View File

@ -1,6 +1,11 @@
from datetime import datetime
from unittest import mock from unittest import mock
from django.contrib.auth.models import User
from django.test import TestCase from django.test import TestCase
from django.test import override_settings
from django.utils.timezone import get_current_timezone
from django.utils.timezone import timezone
from documents import index from documents import index
from documents.models import Document from documents.models import Document
@ -90,3 +95,35 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
_, kwargs = mocked_update_doc.call_args _, kwargs = mocked_update_doc.call_args
self.assertIsNone(kwargs["asn"]) self.assertIsNone(kwargs["asn"])
# Regression test: with TIME_ZONE overridden to Pacific/Auckland, a document
# whose `added` timestamp is 01:00 local time on 2025-07-20 must be returned
# for `added:today` and not for `added:yesterday` while "now" is patched to
# 15:00 local time on that same day.
@override_settings(TIME_ZONE="Pacific/Auckland")
def test_added_today_respects_local_timezone_boundary(self):
tz = get_current_timezone()
# The instant that replaces documents.index.now further down.
fixed_now = datetime(2025, 7, 20, 15, 0, 0, tzinfo=tz)
# Fake a time near the local boundary (1 AM NZT = 13:00 UTC on previous UTC day)
local_dt = datetime(2025, 7, 20, 1, 0, 0).replace(tzinfo=tz)
utc_dt = local_dt.astimezone(timezone.utc)
doc = Document.objects.create(
title="Time zone",
content="Testing added:today",
checksum="edgecase123",
added=utc_dt,
)
# Write the document to the search index so the query below can find it.
with index.open_index_writer() as writer:
index.update_document(writer, doc)
superuser = User.objects.create_superuser(username="testuser")
self.client.force_login(superuser)
# Freeze "now" as seen by documents.index so the today/yesterday ranges
# are computed relative to fixed_now.
with mock.patch("documents.index.now", return_value=fixed_now):
response = self.client.get("/api/documents/?query=added:today")
results = response.json()["results"]
self.assertEqual(len(results), 1)
self.assertEqual(results[0]["id"], doc.id)
# NOTE(review): indentation is not visible in this view - confirm that this
# second request also runs inside the mock.patch block; otherwise it is
# evaluated against the real current time.
response = self.client.get("/api/documents/?query=added:yesterday")
results = response.json()["results"]
self.assertEqual(len(results), 0)