Share the archive serial number (ASN) bounds and keep out-of-range ASNs out of the index.

* models.py: add Document.ARCHIVE_SERIAL_NUMBER_MIN / ARCHIVE_SERIAL_NUMBER_MAX and use
  them in the field validators instead of the literals 0 and 0xFF_FF_FF_FF.
* consumer.py: check override_asn against the same constants and build the range text
  in the failure message from them.
* index.py: update_document() logs an error and passes asn=0 to the writer when the
  document's ASN is outside [MIN, MAX]; an ASN of None is passed through unchanged.
* tests/test_index.py: cover the out-of-range case and the None case.

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 8c80304d3..1896415b1 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -146,11 +146,16 @@ class Consumer(LoggingMixin):
             return
         # Validate the range is above zero and less than uint32_t max
         # otherwise, Whoosh can't handle it in the index
-        if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
+        if (
+            self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
+            or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
+        ):
             self._fail(
                 MESSAGE_ASN_RANGE,
                 f"Not consuming {self.filename}: "
-                f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
+                f"Given ASN {self.override_asn} is out of range "
+                f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
+                f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
             )
         if Document.objects.filter(archive_serial_number=self.override_asn).exists():
             self._fail(
diff --git a/src/documents/index.py b/src/documents/index.py
index 575e57e8b..e11708f45 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -90,10 +90,22 @@ def open_index_searcher():
         searcher.close()
 
 
-def update_document(writer, doc):
+def update_document(writer: AsyncWriter, doc: Document):
     tags = ",".join([t.name for t in doc.tags.all()])
     tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
     comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
+    asn = doc.archive_serial_number
+    if asn is not None and (
+        asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
+        or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
+    ):
+        logger.error(
+            f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
+            f"ASN is out of range "
+            f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
+            f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}].",
+        )
+        asn = 0
     writer.update_document(
         id=doc.pk,
         title=doc.title,
@@ -109,7 +121,7 @@ def update_document(writer, doc):
         has_type=doc.document_type is not None,
         created=doc.created,
         added=doc.added,
-        asn=doc.archive_serial_number,
+        asn=asn,
         modified=doc.modified,
         path=doc.storage_path.name if doc.storage_path else None,
         path_id=doc.storage_path.id if doc.storage_path else None,
diff --git a/src/documents/models.py b/src/documents/models.py
index 84e96a79a..a3c7cc4e6 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -3,6 +3,7 @@ import logging
 import os
 import re
 from collections import OrderedDict
+from typing import Final
 from typing import Optional
 
 import dateutil.parser
@@ -229,6 +230,9 @@ class Document(models.Model):
         help_text=_("The original name of the file when it was uploaded"),
     )
 
+    ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
+    ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
+
     archive_serial_number = models.PositiveIntegerField(
         _("archive serial number"),
         blank=True,
@@ -236,8 +240,8 @@ class Document(models.Model):
         unique=True,
         db_index=True,
         validators=[
-            MaxValueValidator(0xFF_FF_FF_FF),
-            MinValueValidator(0),
+            MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
+            MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
         ],
         help_text=_(
             "The position of this document in your physical document " "archive.",
diff --git a/src/documents/tests/test_index.py b/src/documents/tests/test_index.py
index 696648427..bf1865a43 100644
--- a/src/documents/tests/test_index.py
+++ b/src/documents/tests/test_index.py
@@ -1,3 +1,5 @@
+from unittest import mock
+
 from django.test import TestCase
 from documents import index
 from documents.models import Document
@@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
         )
         self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
         self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
+
+    def test_archive_serial_number_ranging(self):
+        """
+        GIVEN:
+            - Document with an archive serial number above schema allowed size
+        WHEN:
+            - Document is provided to the index
+        THEN:
+            - Error is logged
+            - Document ASN is reset to 0 for the index
+        """
+        doc1 = Document.objects.create(
+            title="doc1",
+            checksum="A",
+            content="test test2 test3",
+            # yes, this is allowed, unless full_clean is run
+            # DRF does call the validators, this test won't
+            archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
+        )
+        with self.assertLogs("paperless.index", level="ERROR") as cm:
+            with mock.patch(
+                "documents.index.AsyncWriter.update_document",
+            ) as mocked_update_doc:
+                index.add_or_update_document(doc1)
+
+                mocked_update_doc.assert_called_once()
+                _, kwargs = mocked_update_doc.call_args
+
+                self.assertEqual(kwargs["asn"], 0)
+
+                error_str = cm.output[0]
+                expected_str = f"ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document {doc1.pk}"
+                self.assertIn(expected_str, error_str)
+
+    def test_archive_serial_number_is_none(self):
+        """
+        GIVEN:
+            - Document with no archive serial number
+        WHEN:
+            - Document is provided to the index
+        THEN:
+            - ASN isn't touched
+        """
+        doc1 = Document.objects.create(
+            title="doc1",
+            checksum="A",
+            content="test test2 test3",
+        )
+        with mock.patch(
+            "documents.index.AsyncWriter.update_document",
+        ) as mocked_update_doc:
+            index.add_or_update_document(doc1)
+
+            mocked_update_doc.assert_called_once()
+            _, kwargs = mocked_update_doc.call_args
+
+            self.assertIsNone(kwargs["asn"])