# Patch commentary (not part of either diff; the patch proper starts at the
# first "diff --git" line below).
#
# src/documents/index.py
#   * Adds two imports: `sleep` from `time` and `LockError` from `whoosh.index`.
#   * open_index(): the single try/except around exists_in()/open_dir() becomes
#     a loop of max_retries + 1 = 4 attempts.
#       - If the index does not exist, or `recreate` is true, the loop exits
#         immediately without opening anything.
#       - FileNotFoundError and LockError are treated as transient: each one is
#         logged as a warning and followed by sleep(retry_delay), i.e. 0.1 s,
#         before the next attempt.
#       - On the last attempt, or when `recreate` is true, a transient error is
#         logged via logger.exception ("... after retries, recreating.") and
#         the loop exits.
#       - Any other exception is logged via logger.exception and exits the loop
#         at once, with the same message the removed code used.
#     After the loop, control falls through to the existing code that follows
#     ("create_in doesn't handle corrupted indexes very well ..."); only its
#     first two lines are visible in this hunk.
#
# src/documents/tests/test_index.py
#   * Adds `from django.conf import settings` and a TestIndexResilience class.
#     Its tests patch documents.index.exists_in / open_dir / create_in / rmtree:
#       - one FileNotFoundError followed by success: the retried result is
#         returned, rmtree and create_in are not called, and a marker file in
#         INDEX_DIR is left untouched;
#       - FileNotFoundError on every call: open_dir is called 4 times, then
#         rmtree(INDEX_DIR) and create_in are each called once and the
#         "after retries" message is the first ERROR log record;
#       - RuntimeError: rmtree(INDEX_DIR) and create_in are each called once
#         and the generic "recreating" message is the first ERROR log record.
#   * NOTE(review): none of the tests patch documents.index.sleep, so the
#     retry paths presumably really sleep between attempts - confirm this is
#     acceptable for test run time.
diff --git a/src/documents/index.py b/src/documents/index.py
index 6b994ac8c..ea26ea926 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -10,6 +10,7 @@ from datetime import time
 from datetime import timedelta
 from datetime import timezone
 from shutil import rmtree
+from time import sleep
 from typing import TYPE_CHECKING
 from typing import Literal
 
@@ -32,6 +33,7 @@ from whoosh.highlight import HtmlFormatter
 from whoosh.idsets import BitSet
 from whoosh.idsets import DocIdSet
 from whoosh.index import FileIndex
+from whoosh.index import LockError
 from whoosh.index import create_in
 from whoosh.index import exists_in
 from whoosh.index import open_dir
@@ -97,11 +99,33 @@ def get_schema() -> Schema:
 
 
 def open_index(*, recreate=False) -> FileIndex:
-    try:
-        if exists_in(settings.INDEX_DIR) and not recreate:
-            return open_dir(settings.INDEX_DIR, schema=get_schema())
-    except Exception:
-        logger.exception("Error while opening the index, recreating.")
+    transient_exceptions = (FileNotFoundError, LockError)
+    max_retries = 3
+    retry_delay = 0.1
+
+    for attempt in range(max_retries + 1):
+        try:
+            if exists_in(settings.INDEX_DIR) and not recreate:
+                return open_dir(settings.INDEX_DIR, schema=get_schema())
+            break
+        except transient_exceptions as exc:
+            is_last_attempt = attempt == max_retries or recreate
+            if is_last_attempt:
+                logger.exception(
+                    "Error while opening the index after retries, recreating.",
+                )
+                break
+
+            logger.warning(
+                "Transient error while opening the index (attempt %s/%s): %s. Retrying.",
+                attempt + 1,
+                max_retries + 1,
+                exc,
+            )
+            sleep(retry_delay)
+        except Exception:
+            logger.exception("Error while opening the index, recreating.")
+            break
 
     # create_in doesn't handle corrupted indexes very well, remove the directory entirely first
     if settings.INDEX_DIR.is_dir():
diff --git a/src/documents/tests/test_index.py b/src/documents/tests/test_index.py
index f216feedb..3167bb762 100644
--- a/src/documents/tests/test_index.py
+++ b/src/documents/tests/test_index.py
@@ -1,6 +1,7 @@
 from datetime import datetime
 from unittest import mock
 
+from django.conf import settings
 from django.contrib.auth.models import User
 from django.test import SimpleTestCase
 from django.test import TestCase
@@ -251,3 +252,120 @@ class TestRewriteNaturalDateKeywords(SimpleTestCase):
         result = self._rewrite_with_now("added:today", fixed_now)
         # Should convert to UTC properly
         self.assertIn("added:[20250719", result)
+
+
+class TestIndexResilience(DirectoriesMixin, SimpleTestCase):
+    def _assert_recreate_called(self, mock_create_in):
+        mock_create_in.assert_called_once()
+        path_arg, schema_arg = mock_create_in.call_args.args
+        self.assertEqual(path_arg, settings.INDEX_DIR)
+        self.assertEqual(schema_arg.__class__.__name__, "Schema")
+
+    def test_transient_missing_segment_does_not_force_recreate(self):
+        """
+        GIVEN:
+            - Index directory exists
+        WHEN:
+            - open_index is called
+            - Opening the index raises FileNotFoundError once due to a
+              transient missing segment
+        THEN:
+            - Index is opened successfully on retry
+            - Index is not recreated
+        """
+        file_marker = settings.INDEX_DIR / "file_marker.txt"
+        file_marker.write_text("keep")
+        expected_index = object()
+
+        with (
+            mock.patch("documents.index.exists_in", return_value=True),
+            mock.patch(
+                "documents.index.open_dir",
+                side_effect=[FileNotFoundError("missing"), expected_index],
+            ) as mock_open_dir,
+            mock.patch(
+                "documents.index.create_in",
+            ) as mock_create_in,
+            mock.patch(
+                "documents.index.rmtree",
+            ) as mock_rmtree,
+        ):
+            ix = index.open_index()
+
+        self.assertIs(ix, expected_index)
+        self.assertGreaterEqual(mock_open_dir.call_count, 2)
+        mock_rmtree.assert_not_called()
+        mock_create_in.assert_not_called()
+        self.assertEqual(file_marker.read_text(), "keep")
+
+    def test_transient_errors_exhaust_retries_and_recreate(self):
+        """
+        GIVEN:
+            - Index directory exists
+        WHEN:
+            - open_index is called
+            - Opening the index raises FileNotFoundError multiple times due to
+              transient missing segments
+        THEN:
+            - Index is recreated after retries are exhausted
+        """
+        recreated_index = object()
+
+        with (
+            self.assertLogs("paperless.index", level="ERROR") as cm,
+            mock.patch("documents.index.exists_in", return_value=True),
+            mock.patch(
+                "documents.index.open_dir",
+                side_effect=FileNotFoundError("missing"),
+            ) as mock_open_dir,
+            mock.patch("documents.index.rmtree") as mock_rmtree,
+            mock.patch(
+                "documents.index.create_in",
+                return_value=recreated_index,
+            ) as mock_create_in,
+        ):
+            ix = index.open_index()
+
+        self.assertIs(ix, recreated_index)
+        self.assertEqual(mock_open_dir.call_count, 4)
+        mock_rmtree.assert_called_once_with(settings.INDEX_DIR)
+        self._assert_recreate_called(mock_create_in)
+        self.assertIn(
+            "Error while opening the index after retries, recreating.",
+            cm.output[0],
+        )
+
+    def test_non_transient_error_recreates_index(self):
+        """
+        GIVEN:
+            - Index directory exists
+        WHEN:
+            - open_index is called
+            - Opening the index raises a "non-transient" error
+        THEN:
+            - Index is recreated
+        """
+        recreated_index = object()
+
+        with (
+            self.assertLogs("paperless.index", level="ERROR") as cm,
+            mock.patch("documents.index.exists_in", return_value=True),
+            mock.patch(
+                "documents.index.open_dir",
+                side_effect=RuntimeError("boom"),
+            ),
+            mock.patch("documents.index.rmtree") as mock_rmtree,
+            mock.patch(
+                "documents.index.create_in",
+                return_value=recreated_index,
+            ) as mock_create_in,
+        ):
+            ix = index.open_index()
+
+        self.assertIs(ix, recreated_index)
+        mock_rmtree.assert_called_once_with(settings.INDEX_DIR)
+        self._assert_recreate_called(mock_create_in)
+        self.assertIn(
+            "Error while opening the index, recreating.",
+            cm.output[0],
+        )