Detect and reset invalid ASNs to 0 during indexing with a loud error to the user

This commit is contained in:
Trenton H 2023-02-02 08:19:59 -08:00
parent a203b006e7
commit 0f536a9b9a
4 changed files with 86 additions and 6 deletions

View File

@ -146,11 +146,16 @@ class Consumer(LoggingMixin):
return return
# Validate the range is above zero and less than uint32_t max # Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index # otherwise, Whoosh can't handle it in the index
if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF: if (
self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail( self._fail(
MESSAGE_ASN_RANGE, MESSAGE_ASN_RANGE,
f"Not consuming {self.filename}: " f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]", f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
) )
if Document.objects.filter(archive_serial_number=self.override_asn).exists(): if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail( self._fail(

View File

@ -90,10 +90,22 @@ def open_index_searcher():
searcher.close() searcher.close()
def update_document(writer, doc): def update_document(writer: AsyncWriter, doc: Document):
tags = ",".join([t.name for t in doc.tags.all()]) tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()]) tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)]) comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
asn = doc.archive_serial_number
if asn is not None and (
asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
logger.error(
f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
f"ASN is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
)
asn = 0
writer.update_document( writer.update_document(
id=doc.pk, id=doc.pk,
title=doc.title, title=doc.title,
@ -109,7 +121,7 @@ def update_document(writer, doc):
has_type=doc.document_type is not None, has_type=doc.document_type is not None,
created=doc.created, created=doc.created,
added=doc.added, added=doc.added,
asn=doc.archive_serial_number, asn=asn,
modified=doc.modified, modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None, path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None, path_id=doc.storage_path.id if doc.storage_path else None,

View File

@ -3,6 +3,7 @@ import logging
import os import os
import re import re
from collections import OrderedDict from collections import OrderedDict
from typing import Final
from typing import Optional from typing import Optional
import dateutil.parser import dateutil.parser
@ -229,6 +230,9 @@ class Document(models.Model):
help_text=_("The original name of the file when it was uploaded"), help_text=_("The original name of the file when it was uploaded"),
) )
ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
archive_serial_number = models.PositiveIntegerField( archive_serial_number = models.PositiveIntegerField(
_("archive serial number"), _("archive serial number"),
blank=True, blank=True,
@ -236,8 +240,8 @@ class Document(models.Model):
unique=True, unique=True,
db_index=True, db_index=True,
validators=[ validators=[
MaxValueValidator(0xFF_FF_FF_FF), MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
MinValueValidator(0), MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
], ],
help_text=_( help_text=_(
"The position of this document in your physical document " "archive.", "The position of this document in your physical document " "archive.",

View File

@ -1,3 +1,5 @@
from unittest import mock
from django.test import TestCase from django.test import TestCase
from documents import index from documents import index
from documents.models import Document from documents.models import Document
@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
) )
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"]) self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), []) self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
def test_archive_serial_number_ranging(self):
    """
    GIVEN:
        - Document with an archive serial number above schema allowed size
    WHEN:
        - Document is provided to the index
    THEN:
        - Error is logged
        - Document ASN is reset to 0 for the index
    """
    out_of_range_asn = Document.ARCHIVE_SERIAL_NUMBER_MAX + 1
    doc1 = Document.objects.create(
        title="doc1",
        checksum="A",
        content="test test2 test3",
        # Storing an out-of-range value is allowed here: the field
        # validators only run via full_clean (DRF calls them, this test
        # deliberately does not)
        archive_serial_number=out_of_range_asn,
    )

    with self.assertLogs("paperless.index", level="ERROR") as cm, mock.patch(
        "documents.index.AsyncWriter.update_document",
    ) as mocked_update_doc:
        index.add_or_update_document(doc1)

    # The mock keeps its call record after the patch has been undone
    mocked_update_doc.assert_called_once()
    _, kwargs = mocked_update_doc.call_args
    self.assertEqual(kwargs["asn"], 0)

    # Derive the expectation from the created row instead of hard-coding
    # "document 1": primary keys are not guaranteed to restart at 1 for
    # every test, so a literal pk makes the test order/backend dependent
    expected_str = (
        "ERROR:paperless.index:Not indexing Archive Serial Number "
        f"{out_of_range_asn} of document {doc1.pk}"
    )
    self.assertIn(expected_str, cm.output[0])
def test_archive_serial_number_is_none(self):
    """
    GIVEN:
        - A document created without an archive serial number
    WHEN:
        - The document is handed to the index
    THEN:
        - The index writer is called with asn left as None
    """
    document = Document.objects.create(
        title="doc1",
        checksum="A",
        content="test test2 test3",
    )

    patch_target = "documents.index.AsyncWriter.update_document"
    with mock.patch(patch_target) as writer_update:
        index.add_or_update_document(document)

    # The mock retains its recorded call once the patch context has exited
    writer_update.assert_called_once()
    call_kwargs = writer_update.call_args[1]
    self.assertIsNone(call_kwargs["asn"])