Merge branch 'dev' into dependabot/uv/dev/django-a060cb4303

2025-08-10 00:18:57 +00:00 · 2025-08-06 16:13:57 -04:00
parent ad6eb2c98c 1bee1495cf
commit 31457c35d0
9 changed files with 395 additions and 70 deletions
--- a/src/documents/tests/samples/content.txt
+++ b/src/documents/tests/samples/content.txt
@@ -0,0 +1,34 @@
+Sample textual document content.
+Include as many characters as possible, to check the classifier's vectorization.
+
+Hey 00, this is "a" test0707 content.
+This is an example document — created on 2025-06-25.
+
+Digits: 0123456789
+Punctuation: . , ; : ! ? ' " ( ) [ ] { } — – …
+English text: The quick brown fox jumps over the lazy dog.
+English stop words: We’ve been doing it before.
+Accented Latin (diacritics): àâäæçéèêëîïôœùûüÿñ
+Arabic: لقد قام المترجم بعمل جيد
+Greek: Αλφα, Βήτα, Γάμμα, Δέλτα, Ωμέγα
+Cyrillic: Привет, как дела? Добро пожаловать!
+Chinese (Simplified): 你好，世界！今天的天气很好。
+Chinese (Traditional): 歡迎來到世界，今天天氣很好。
+Japanese (Kanji, Hiragana, Katakana): 東京へ行きます。カタカナ、ひらがな、漢字。
+Korean (Hangul): 안녕하세요. 오늘 날씨 어때요?
+Arabic: مرحبًا، كيف حالك؟
+Hebrew: שלום, מה שלומך?
+Emoji: 😀 🐍 📘 ✅ ©️ 🇺🇳
+Symbols: © ® ™ § ¶ † ‡ ∞ µ ∑ ∆ √
+Math: ∫₀^∞ x² dx = ∞, π ≈ 3.14159, ∇·E = ρ/ε₀
+Currency: 1$ € ¥ £ ₹
+Date formats: 25/06/2025, June 25, 2025, 2025年6月25日
+Quote in French: « Bonjour, ça va ? »
+Quote in German: „Guten Tag! Wie geht's?“
+Newline test:
+\r\n
+\r
+
+Tab\ttest\tspacing
+/ = +) ( []) ~ * #192 +33601010101 § ¤
+End of document.
--- a/src/documents/tests/samples/preprocessed_content.txt
+++ b/src/documents/tests/samples/preprocessed_content.txt
@@ -0,0 +1 @@
+sample textual document content include as many characters as possible to check the classifier s vectorization hey 00 this is a test0707 content this is an example document created on 2025 06 25 digits 0123456789 punctuation english text the quick brown fox jumps over the lazy dog english stop words we ve been doing it before accented latin diacritics àâäæçéèêëîïôœùûüÿñ arabic لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyrillic привет как дела добро пожаловать chinese simplified 你好 世界 今天的天气很好 chinese traditional 歡迎來到世界 今天天氣很好 japanese kanji hiragana katakana 東京へ行きます カタカナ ひらがな 漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arabic مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbols µ math ₀ x² dx π 3 14159 e ρ ε₀ currency 1 date formats 25 06 2025 june 25 2025 2025年6月25日 quote in french bonjour ça va quote in german guten tag wie geht s newline test r n r tab ttest tspacing 192 33601010101 end of document
--- a/src/documents/tests/samples/preprocessed_content_advanced.txt
+++ b/src/documents/tests/samples/preprocessed_content_advanced.txt
@@ -0,0 +1 @@
+sampl textual document content includ mani charact possibl check classifi vector hey 00 test0707 content exampl document creat 2025 06 25 digit 0123456789 punctuat english text quick brown fox jump lazi dog english stop word accent latin diacrit àâäæçéèêëîïôœùûüÿñ arab لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyril привет как дела добро пожаловать chines simplifi 你好 世界 今天的天气很好 chines tradit 歡迎來到世界 今天天氣很好 japanes kanji hiragana katakana 東京へ行きます カタカナ ひらがな 漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arab مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbol µ math ₀ x² dx π 3 14159 e ρ ε₀ currenc 1 date format 25 06 2025 june 25 2025 2025年6月25日 quot french bonjour ça va quot german guten tag wie geht newlin test r n r tab ttest tspace 192 33601010101 end document
--- a/src/documents/tests/test_caching.py
+++ b/src/documents/tests/test_caching.py
@@ -0,0 +1,45 @@
+import pickle
+
+from documents.caching import StoredLRUCache
+
+
+def test_lru_cache_entries():
+    CACHE_TTL = 1
+    # LRU cache with a capacity of 2 elements
+    cache = StoredLRUCache("test_lru_cache_key", 2, backend_ttl=CACHE_TTL)
+    cache.set(1, 1)
+    cache.set(2, 2)
+    assert cache.get(2) == 2
+    assert cache.get(1) == 1
+
+    # The least recently used entry (2) should be removed
+    cache.set(3, 3)
+    assert cache.get(3) == 3
+    assert not cache.get(2)
+    assert cache.get(1) == 1
+
+    # Save the cache, restore it and check it overwrites the current cache in memory
+    cache.save()
+    cache.set(4, 4)
+    assert not cache.get(3)
+    cache.load()
+    assert not cache.get(4)
+    assert cache.get(3) == 3
+    assert cache.get(1) == 1
+
+
+def test_stored_lru_cache_key_ttl(mocker):
+    mock_backend = mocker.Mock()
+    cache = StoredLRUCache("test_key", backend=mock_backend, backend_ttl=321)
+
+    # Simulate storing values
+    cache.set("x", "X")
+    cache.set("y", "Y")
+    cache.save()
+
+    # Assert backend.set was called once with the key, the pickled data and the TTL
+    mock_backend.set.assert_called_once()
+    key, data, timeout = mock_backend.set.call_args[0]
+    assert key == "test_key"
+    assert timeout == 321
+    assert pickle.loads(data) == {"x": "X", "y": "Y"}
--- a/src/documents/tests/test_classifier.py
+++ b/src/documents/tests/test_classifier.py
@@ -21,7 +21,7 @@ from documents.models import Tag
 from documents.tests.utils import DirectoriesMixin


-def dummy_preprocess(content: str):
+def dummy_preprocess(content: str, **kwargs):
    """
    Simpler, faster pre-processing for testing purposes
    """
@@ -223,24 +223,47 @@ class TestClassifier(DirectoriesMixin, TestCase):
        self.generate_test_data()
        self.classifier.train()

-        self.assertEqual(
-            self.classifier.predict_correspondent(self.doc1.content),
-            self.c1.pk,
-        )
-        self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
-        self.assertListEqual(
-            self.classifier.predict_tags(self.doc1.content),
-            [self.t1.pk],
-        )
-        self.assertListEqual(
-            self.classifier.predict_tags(self.doc2.content),
-            [self.t1.pk, self.t3.pk],
-        )
-        self.assertEqual(
-            self.classifier.predict_document_type(self.doc1.content),
-            self.dt.pk,
-        )
-        self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
+        with (
+            mock.patch.object(
+                self.classifier.data_vectorizer,
+                "transform",
+                wraps=self.classifier.data_vectorizer.transform,
+            ) as mock_transform,
+            mock.patch.object(
+                self.classifier,
+                "preprocess_content",
+                wraps=self.classifier.preprocess_content,
+            ) as mock_preprocess_content,
+        ):
+            self.assertEqual(
+                self.classifier.predict_correspondent(self.doc1.content),
+                self.c1.pk,
+            )
+            self.assertEqual(
+                self.classifier.predict_correspondent(self.doc2.content),
+                None,
+            )
+            self.assertListEqual(
+                self.classifier.predict_tags(self.doc1.content),
+                [self.t1.pk],
+            )
+            self.assertListEqual(
+                self.classifier.predict_tags(self.doc2.content),
+                [self.t1.pk, self.t3.pk],
+            )
+            self.assertEqual(
+                self.classifier.predict_document_type(self.doc1.content),
+                self.dt.pk,
+            )
+            self.assertEqual(
+                self.classifier.predict_document_type(self.doc2.content),
+                None,
+            )
+
+            # Check that the vectorized content and the preprocessed text have been cached:
+            # each should be computed only once per document (doc1 and doc2)
+            self.assertEqual(mock_preprocess_content.call_count, 2)
+            self.assertEqual(mock_transform.call_count, 2)

    def test_no_retrain_if_no_change(self):
        """
@@ -694,3 +717,67 @@ class TestClassifier(DirectoriesMixin, TestCase):
        mock_load.side_effect = Exception()
        with self.assertRaises(Exception):
            load_classifier(raise_exception=True)
+
+
+def test_preprocess_content():
+    """
+    GIVEN:
+        - Advanced text processing is enabled (default)
+    WHEN:
+        - Classifier preprocesses a document's content
+    THEN:
+        - Processed content matches the expected output (stemmed words)
+    """
+    with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
+        content = f.read()
+    with (Path(__file__).parent / "samples" / "preprocessed_content_advanced.txt").open(
+        "r",
+    ) as f:
+        expected_preprocess_content = f.read().rstrip()
+    classifier = DocumentClassifier()
+    result = classifier.preprocess_content(content)
+    assert result == expected_preprocess_content
+
+
+def test_preprocess_content_nltk_disabled():
+    """
+    GIVEN:
+        - Advanced text processing is disabled
+    WHEN:
+        - Classifier preprocesses a document's content
+    THEN:
+        - Processed content matches the expected output (unstemmed words)
+    """
+    with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
+        content = f.read()
+    with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
+        "r",
+    ) as f:
+        expected_preprocess_content = f.read().rstrip()
+    classifier = DocumentClassifier()
+    with mock.patch("documents.classifier.ADVANCED_TEXT_PROCESSING_ENABLED", new=False):
+        result = classifier.preprocess_content(content)
+    assert result == expected_preprocess_content
+
+
+def test_preprocess_content_nltk_load_fail(mocker):
+    """
+    GIVEN:
+        - NLTK stop words fail to load
+    WHEN:
+        - Classifier preprocesses a document's content
+    THEN:
+        - Processed content matches the expected output (unstemmed words)
+    """
+    _module = mocker.MagicMock(name="nltk_corpus_mock")
+    _module.stopwords.words.side_effect = AttributeError()
+    mocker.patch.dict("sys.modules", {"nltk.corpus": _module})
+    classifier = DocumentClassifier()
+    with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
+        content = f.read()
+    with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
+        "r",
+    ) as f:
+        expected_preprocess_content = f.read().rstrip()
+    result = classifier.preprocess_content(content)
+    assert result == expected_preprocess_content
				`@@ -0,0 +1 @@`
				sample textual document content include as many characters as possible to check the classifier s vectorization hey 00 this is a test0707 content this is an example document created on 2025 06 25 digits 0123456789 punctuation english text the quick brown fox jumps over the lazy dog english stop words we ve been doing it before accented latin diacritics àâäæçéèêëîïôœùûüÿñ arabic لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyrillic привет как дела добро пожаловать chinese simplified 你好世界今天的天气很好 chinese traditional 歡迎來到世界今天天氣很好 japanese kanji hiragana katakana 東京へ行きますカタカナひらがな漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arabic مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbols µ math ₀ x² dx π 3 14159 e ρ ε₀ currency 1 date formats 25 06 2025 june 25 2025 2025年6月25日 quote in french bonjour ça va quote in german guten tag wie geht s newline test r n r tab ttest tspacing 192 33601010101 end of document
				`@@ -0,0 +1 @@`
				sampl textual document content includ mani charact possibl check classifi vector hey 00 test0707 content exampl document creat 2025 06 25 digit 0123456789 punctuat english text quick brown fox jump lazi dog english stop word accent latin diacrit àâäæçéèêëîïôœùûüÿñ arab لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyril привет как дела добро пожаловать chines simplifi 你好世界今天的天气很好 chines tradit 歡迎來到世界今天天氣很好 japanes kanji hiragana katakana 東京へ行きますカタカナひらがな漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arab مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbol µ math ₀ x² dx π 3 14159 e ρ ε₀ currenc 1 date format 25 06 2025 june 25 2025 2025年6月25日 quot french bonjour ça va quot german guten tag wie geht newlin test r n r tab ttest tspace 192 33601010101 end document