Detect and reset invalid ASNs to 0 during indexing with a loud error to the user

This commit is contained in:
Trenton H 2023-02-02 08:19:59 -08:00
parent a203b006e7
commit 0f536a9b9a
4 changed files with 86 additions and 6 deletions

View File

@ -146,11 +146,16 @@ class Consumer(LoggingMixin):
return
# Validate the range is above zero and less than uint32_t max
# otherwise, Whoosh can't handle it in the index
if self.override_asn < 0 or self.override_asn > 0xFF_FF_FF_FF:
if (
self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
MESSAGE_ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range [0, 4,294,967,295]",
f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]",
)
if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail(

View File

@ -90,10 +90,22 @@ def open_index_searcher():
searcher.close()
def update_document(writer, doc):
def update_document(writer: AsyncWriter, doc: Document):
tags = ",".join([t.name for t in doc.tags.all()])
tags_ids = ",".join([str(t.id) for t in doc.tags.all()])
comments = ",".join([str(c.comment) for c in Comment.objects.filter(document=doc)])
asn = doc.archive_serial_number
if asn is not None and (
asn < Document.ARCHIVE_SERIAL_NUMBER_MIN
or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
logger.error(
f"Not indexing Archive Serial Number {asn} of document {doc.pk}. "
f"ASN is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.",
)
asn = 0
writer.update_document(
id=doc.pk,
title=doc.title,
@ -109,7 +121,7 @@ def update_document(writer, doc):
has_type=doc.document_type is not None,
created=doc.created,
added=doc.added,
asn=doc.archive_serial_number,
asn=asn,
modified=doc.modified,
path=doc.storage_path.name if doc.storage_path else None,
path_id=doc.storage_path.id if doc.storage_path else None,

View File

@ -3,6 +3,7 @@ import logging
import os
import re
from collections import OrderedDict
from typing import Final
from typing import Optional
import dateutil.parser
@ -229,6 +230,9 @@ class Document(models.Model):
help_text=_("The original name of the file when it was uploaded"),
)
ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0
ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF
archive_serial_number = models.PositiveIntegerField(
_("archive serial number"),
blank=True,
@ -236,8 +240,8 @@ class Document(models.Model):
unique=True,
db_index=True,
validators=[
MaxValueValidator(0xFF_FF_FF_FF),
MinValueValidator(0),
MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX),
MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN),
],
help_text=_(
"The position of this document in your physical document " "archive.",

View File

@ -1,3 +1,5 @@
from unittest import mock
from django.test import TestCase
from documents import index
from documents.models import Document
@ -31,3 +33,60 @@ class TestAutoComplete(DirectoriesMixin, TestCase):
)
self.assertListEqual(index.autocomplete(ix, "tes", limit=1), [b"test3"])
self.assertListEqual(index.autocomplete(ix, "tes", limit=0), [])
def test_archive_serial_number_ranging(self):
"""
GIVEN:
- Document with an archive serial number above schema allowed size
WHEN:
- Document is provided to the index
THEN:
- Error is logged
- Document ASN is reset to 0 for the index
"""
doc1 = Document.objects.create(
title="doc1",
checksum="A",
content="test test2 test3",
# yes, this is allowed, unless full_clean is run
# DRF does call the validators, this test won't
archive_serial_number=Document.ARCHIVE_SERIAL_NUMBER_MAX + 1,
)
with self.assertLogs("paperless.index", level="ERROR") as cm:
with mock.patch(
"documents.index.AsyncWriter.update_document",
) as mocked_update_doc:
index.add_or_update_document(doc1)
mocked_update_doc.assert_called_once()
_, kwargs = mocked_update_doc.call_args
self.assertEqual(kwargs["asn"], 0)
error_str = cm.output[0]
expected_str = "ERROR:paperless.index:Not indexing Archive Serial Number 4294967296 of document 1"
self.assertIn(expected_str, error_str)
def test_archive_serial_number_is_none(self):
"""
GIVEN:
- Document with no archive serial number
WHEN:
- Document is provided to the index
THEN:
- ASN isn't touched
"""
doc1 = Document.objects.create(
title="doc1",
checksum="A",
content="test test2 test3",
)
with mock.patch(
"documents.index.AsyncWriter.update_document",
) as mocked_update_doc:
index.add_or_update_document(doc1)
mocked_update_doc.assert_called_once()
_, kwargs = mocked_update_doc.call_args
self.assertIsNone(kwargs["asn"])