Merge branch 'dev' into feature-ai

shamoon committed 2025-11-17 18:49:57 -08:00
21 changed files with 276 additions and 156 deletions

View File

@@ -287,15 +287,75 @@ class DelayedQuery:
self.first_score = None
self.filter_queryset = filter_queryset
self.suggested_correction = None
self._manual_hits_cache: list | None = None
def __len__(self) -> int:
if self._manual_sort_requested():
manual_hits = self._manual_hits()
return len(manual_hits)
page = self[0:1]
return len(page)
def _manual_sort_requested(self):
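# Matches both "custom_field_<id>" and "-custom_field_<id>": lstrip("-") drops the descending prefix before the check.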
ordering = self.query_params.get("ordering", "")
return ordering.lstrip("-").startswith("custom_field_")
def _manual_hits(self):
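# Run the search once with limit=None so every hit is available for database-side ordering, and cache the ordered list because both __len__ and __getitem__ need it.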
if self._manual_hits_cache is None:
q, mask, suggested_correction = self._get_query()
self.suggested_correction = suggested_correction
results = self.searcher.search(
q,
mask=mask,
filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
limit=None,
)
results.fragmenter = highlight.ContextFragmenter(surround=50)
results.formatter = HtmlFormatter(tagname="span", between=" ... ")
if not self.first_score and len(results) > 0:
self.first_score = results[0].score
if self.first_score:
results.top_n = [
(
(hit[0] / self.first_score) if self.first_score else None,
hit[1],
)
for hit in results.top_n
]
hits_by_id = {hit["id"]: hit for hit in results}
matching_ids = list(hits_by_id.keys())
ordered_ids = list(
self.filter_queryset.filter(id__in=matching_ids).values_list(
"id",
flat=True,
),
)
ordered_ids = list(dict.fromkeys(ordered_ids))
self._manual_hits_cache = [
hits_by_id[_id] for _id in ordered_ids if _id in hits_by_id
]
return self._manual_hits_cache
def __getitem__(self, item):
if item.start in self.saved_results:
return self.saved_results[item.start]
if self._manual_sort_requested():
manual_hits = self._manual_hits()
start = 0 if item.start is None else item.start
stop = item.stop
hits = manual_hits[start:stop] if stop is not None else manual_hits[start:]
page = ManualResultsPage(hits)
self.saved_results[start] = page
return page
q, mask, suggested_correction = self._get_query()
self.suggested_correction = suggested_correction
sortedby, reverse = self._get_query_sortedby()
@@ -315,21 +375,33 @@ class DelayedQuery:
if not self.first_score and len(page.results) > 0 and sortedby is None:
self.first_score = page.results[0].score
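# Normalize scores against the best hit so consumers see relevance relative to the top match (at most 1.0).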
page.results.top_n = list(
map(
lambda hit: (
(hit[0] / self.first_score) if self.first_score else None,
hit[1],
),
page.results.top_n,
),
)
page.results.top_n = [
(
(hit[0] / self.first_score) if self.first_score else None,
hit[1],
)
for hit in page.results.top_n
]
self.saved_results[item.start] = page
return page
class ManualResultsPage(list):
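# Minimal stand-ins for whoosh's ResultsPage/Results, exposing just a list of hits plus results.docs() for the code that consumes search pages.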
def __init__(self, hits):
super().__init__(hits)
self.results = ManualResults(hits)
class ManualResults:
def __init__(self, hits):
self._docnums = [hit.docnum for hit in hits]
def docs(self):
return self._docnums
class LocalDateParser(English):
def reverse_timezone_offset(self, d):
return (d.replace(tzinfo=django_timezone.get_current_timezone())).astimezone(
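A minimal sketch of the reorder step in _manual_hits above, with toy values standing in for whoosh hits: the filter queryset supplies the custom-field order, dict.fromkeys dedupes while preserving it, and ids missing from the index are skipped.

hits_by_id = {1: "hit-1", 2: "hit-2", 3: "hit-3"}
# values_list("id", flat=True) can repeat ids when the ordering joins another table
ordered_ids = [2, 3, 3, 1]
ordered_ids = list(dict.fromkeys(ordered_ids))  # [2, 3, 1]: duplicates dropped, order kept
manual_hits = [hits_by_id[i] for i in ordered_ids if i in hits_by_id]  # ["hit-2", "hit-3", "hit-1"]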

View File

@@ -48,12 +48,13 @@ if settings.AUDIT_LOG_ENABLED:
@contextmanager
def disable_signal(sig, receiver, sender) -> Generator:
def disable_signal(sig, receiver, sender, *, weak: bool | None = None) -> Generator:
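# weak is keyword-only and optional, so existing call sites are unchanged; when supplied it is forwarded on reconnect below, so a receiver registered with weak=False is not silently re-registered weakly.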
try:
sig.disconnect(receiver=receiver, sender=sender)
yield
finally:
sig.connect(receiver=receiver, sender=sender)
kwargs = {"weak": weak} if weak is not None else {}
sig.connect(receiver=receiver, sender=sender, **kwargs)
class Command(CryptMixin, BaseCommand):
@@ -258,16 +259,19 @@ class Command(CryptMixin, BaseCommand):
post_save,
receiver=update_filename_and_move_files,
sender=Document,
weak=False,
),
disable_signal(
m2m_changed,
receiver=update_filename_and_move_files,
sender=Document.tags.through,
weak=False,
),
disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=CustomFieldInstance,
weak=False,
),
disable_signal(
post_save,

View File

@@ -396,9 +396,9 @@ class CannotMoveFilesException(Exception):
# should be disabled in /src/documents/management/commands/document_importer.py handle
@receiver(models.signals.post_save, sender=CustomFieldInstance)
@receiver(models.signals.m2m_changed, sender=Document.tags.through)
@receiver(models.signals.post_save, sender=Document)
@receiver(models.signals.post_save, sender=CustomFieldInstance, weak=False)
@receiver(models.signals.m2m_changed, sender=Document.tags.through, weak=False)
@receiver(models.signals.post_save, sender=Document, weak=False)
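# weak=False makes the dispatcher hold strong references, so these handlers cannot be garbage-collected away, and it lets document_importer's disable_signal(..., weak=False) reconnect them exactly as registered.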
def update_filename_and_move_files(
sender,
instance: Document | CustomFieldInstance,

View File

@@ -89,6 +89,65 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
self.assertEqual(len(results), 0)
self.assertCountEqual(response.data["all"], [])
def test_search_custom_field_ordering(self):
custom_field = CustomField.objects.create(
name="Sortable field",
data_type=CustomField.FieldDataType.INT,
)
d1 = Document.objects.create(
title="first",
content="match",
checksum="A1",
)
d2 = Document.objects.create(
title="second",
content="match",
checksum="B2",
)
d3 = Document.objects.create(
title="third",
content="match",
checksum="C3",
)
CustomFieldInstance.objects.create(
document=d1,
field=custom_field,
value_int=30,
)
CustomFieldInstance.objects.create(
document=d2,
field=custom_field,
value_int=10,
)
CustomFieldInstance.objects.create(
document=d3,
field=custom_field,
value_int=20,
)
with AsyncWriter(index.open_index()) as writer:
index.update_document(writer, d1)
index.update_document(writer, d2)
index.update_document(writer, d3)
response = self.client.get(
f"/api/documents/?query=match&ordering=custom_field_{custom_field.pk}",
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
[doc["id"] for doc in response.data["results"]],
[d2.id, d3.id, d1.id],
)
response = self.client.get(
f"/api/documents/?query=match&ordering=-custom_field_{custom_field.pk}",
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(
[doc["id"] for doc in response.data["results"]],
[d1.id, d3.id, d2.id],
)
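The same ordering parameter outside the test client, as a hedged httpx sketch (the base URL and token are illustrative assumptions, not part of this change):

import httpx

resp = httpx.get(
    "http://localhost:8000/api/documents/",  # assumed local paperless-ngx instance
    params={"query": "match", "ordering": f"custom_field_{custom_field.pk}"},
    headers={"Authorization": "Token <api-token>"},  # hypothetical token
)
ids = [doc["id"] for doc in resp.json()["results"]]  # ascending by custom field value; prefix "-" for descending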
def test_search_multi_page(self):
with AsyncWriter(index.open_index()) as writer:
for i in range(55):

View File

@@ -54,8 +54,8 @@ class TestCustomAccountAdapter(TestCase):
# False because request host is not in allowed hosts
self.assertFalse(adapter.is_safe_url(url))
@mock.patch("allauth.core.ratelimit._consume_rate", return_value=True)
def test_pre_authenticate(self, mock_consume_rate):
@mock.patch("allauth.core.internal.ratelimit.consume", return_value=True)
def test_pre_authenticate(self, mock_consume):
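# allauth moved its rate limiting to allauth.core.internal.ratelimit in newer releases, so the patch target follows the new module path.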
adapter = get_adapter()
request = HttpRequest()
request.get_host = mock.Mock(return_value="example.com")

View File

@@ -1,6 +1,6 @@
from typing import Final
__version__: Final[tuple[int, int, int]] = (2, 19, 5)
__version__: Final[tuple[int, int, int]] = (2, 19, 6)
# Version string like X.Y.Z
__full_version_str__: Final[str] = ".".join(map(str, __version__))
# Version string like X.Y

View File

@@ -55,7 +55,7 @@ Content-Transfer-Encoding: 7bit
<p>Some Text</p>
<p>
<img src="cid:part1.pNdUSz0s.D3NqVtPg@example.de" alt="Has to be rewritten to work..">
<img src="https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png" alt="This image should not be shown.">
<img src="https://docs.paperless-ngx.com/assets/logo_full_white.svg" alt="This image should not be shown.">
</p>
<p>and an embedded image.<br>

View File

@@ -6,7 +6,7 @@
<p>Some Text</p>
<p>
<img src="cid:part1.pNdUSz0s.D3NqVtPg@example.de" alt="Has to be rewritten to work..">
<img src="https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png" alt="This image should not be shown.">
<img src="https://docs.paperless-ngx.com/assets/logo_full_white.svg" alt="This image should not be shown.">
</p>
<p>and an embedded image.<br>

View File

@@ -2,7 +2,6 @@ import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path
import httpx
@@ -54,34 +53,6 @@ class TestUrlCanary:
Verify certain URLs are still available so the tests that depend on them remain valid
"""
@classmethod
def _fetch_wikimedia(cls, url: str) -> httpx.Response:
"""
Wikimedia occasionally throttles automated requests (HTTP 429). Retry a few
times with a short backoff so the tests stay stable, and skip if throttling
persists.
"""
last_resp: httpx.Response | None = None
# Wikimedia rejects requests without a browser-like User-Agent header and returns 403.
headers = {
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36"
),
}
for delay in (0, 1, 2):
resp = httpx.get(url, headers=headers, timeout=30.0)
if resp.status_code != httpx.codes.TOO_MANY_REQUESTS:
return resp
last_resp = resp
time.sleep(delay)
pytest.skip(
"Wikimedia throttled the canary request with HTTP 429; try rerunning later.",
)
return last_resp # pragma: no cover
def test_online_image_exception_on_not_available(self):
"""
GIVEN:
@@ -96,8 +67,8 @@ class TestUrlCanary:
whether this image stays online forever, so here we check that we can detect when it
is no longer available.
"""
resp = self._fetch_wikimedia(
"https://upload.wikimedia.org/wikipedia/en/f/f7/nonexistent.png",
resp = httpx.get(
"https://docs.paperless-ngx.com/assets/non-existent.png",
)
with pytest.raises(httpx.HTTPStatusError) as exec_info:
resp.raise_for_status()
@@ -119,8 +90,8 @@ class TestUrlCanary:
"""
# Now check the URL used in samples/sample.html
resp = self._fetch_wikimedia(
"https://upload.wikimedia.org/wikipedia/en/f/f7/RickRoll.png",
resp = httpx.get(
"https://docs.paperless-ngx.com/assets/logo_full_white.svg",
)
resp.raise_for_status()