better sanity checker that logs messages in the log files and does not fail on warnings.

This commit is contained in:
jonaswinkler 2021-02-14 17:08:29 +01:00
parent df6c59bc4f
commit 98b147b622
7 changed files with 212 additions and 122 deletions

View File

@ -10,8 +10,10 @@ paperless-ng 1.1.2
* Always show top left corner of thumbnails, even for extra wide documents. * Always show top left corner of thumbnails, even for extra wide documents.
* Added a management command for executing the sanity checker directly. See :ref:`utilities-sanity-checker`. * Added a management command for executing the sanity checker directly.
See :ref:`utilities-sanity-checker`.
The sanity checker will also report errors in the log files.
* Fixed an issue with the metadata tab not reporting anything in case of missing files. * Fixed an issue with the metadata tab not reporting anything in case of missing files.
* Reverted a change from 1.1.0 that caused huge memory usage due to redis caching. * Reverted a change from 1.1.0 that caused huge memory usage due to redis caching.

View File

@ -1,8 +1,5 @@
import logging
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from documents.sanity_checker import check_sanity, SanityError, SanityWarning from documents.sanity_checker import check_sanity
logger = logging.getLogger("paperless.management.sanity_checker")
class Command(BaseCommand): class Command(BaseCommand):
@ -15,13 +12,4 @@ class Command(BaseCommand):
messages = check_sanity(progress=True) messages = check_sanity(progress=True)
if len(messages) == 0: messages.log_messages()
logger.info("No issues found.")
else:
for msg in messages:
if type(msg) == SanityError:
logger.error(str(msg))
elif type(msg) == SanityWarning:
logger.warning(str(msg))
else:
logger.info((str(msg)))

View File

@ -1,4 +1,5 @@
import hashlib import hashlib
import logging
import os import os
from django.conf import settings from django.conf import settings
@ -7,40 +8,48 @@ from tqdm import tqdm
from documents.models import Document from documents.models import Document
class SanityMessage: class SanityCheckMessages:
message = None
def __init__(self):
self._messages = []
def error(self, message):
self._messages.append({"level": logging.ERROR, "message": message})
def warning(self, message):
self._messages.append({"level": logging.WARNING, "message": message})
def info(self, message):
self._messages.append({"level": logging.INFO, "message": message})
def log_messages(self):
logger = logging.getLogger("paperless.sanity_checker")
if len(self._messages) == 0:
logger.info("Sanity checker detected no issues.")
else:
for msg in self._messages:
logger.log(msg['level'], msg['message'])
def __len__(self):
return len(self._messages)
def __getitem__(self, item):
return self._messages[item]
def has_error(self):
return any([msg['level'] == logging.ERROR for msg in self._messages])
def has_warning(self):
return any([msg['level'] == logging.WARNING for msg in self._messages])
class SanityWarning(SanityMessage): class SanityCheckFailedException(Exception):
def __init__(self, message): pass
self.message = message
def __str__(self):
return f"Warning: {self.message}"
class SanityError(SanityMessage):
def __init__(self, message):
self.message = message
def __str__(self):
return f"ERROR: {self.message}"
class SanityFailedError(Exception):
def __init__(self, messages):
self.messages = messages
def __str__(self):
message_string = "\n".join([str(m) for m in self.messages])
return (
f"The following issuse were found by the sanity checker:\n"
f"{message_string}\n\n===============\n\n")
def check_sanity(progress=False): def check_sanity(progress=False):
messages = [] messages = SanityCheckMessages()
present_files = [] present_files = []
for root, subdirs, files in os.walk(settings.MEDIA_ROOT): for root, subdirs, files in os.walk(settings.MEDIA_ROOT):
@ -59,8 +68,7 @@ def check_sanity(progress=False):
for doc in docs: for doc in docs:
# Check sanity of the thumbnail # Check sanity of the thumbnail
if not os.path.isfile(doc.thumbnail_path): if not os.path.isfile(doc.thumbnail_path):
messages.append(SanityError( messages.error(f"Thumbnail of document {doc.pk} does not exist.")
f"Thumbnail of document {doc.pk} does not exist."))
else: else:
if os.path.normpath(doc.thumbnail_path) in present_files: if os.path.normpath(doc.thumbnail_path) in present_files:
present_files.remove(os.path.normpath(doc.thumbnail_path)) present_files.remove(os.path.normpath(doc.thumbnail_path))
@ -68,15 +76,14 @@ def check_sanity(progress=False):
with doc.thumbnail_file as f: with doc.thumbnail_file as f:
f.read() f.read()
except OSError as e: except OSError as e:
messages.append(SanityError( messages.error(
f"Cannot read thumbnail file of document {doc.pk}: {e}" f"Cannot read thumbnail file of document {doc.pk}: {e}"
)) )
# Check sanity of the original file # Check sanity of the original file
# TODO: extract method # TODO: extract method
if not os.path.isfile(doc.source_path): if not os.path.isfile(doc.source_path):
messages.append(SanityError( messages.error(f"Original of document {doc.pk} does not exist.")
f"Original of document {doc.pk} does not exist."))
else: else:
if os.path.normpath(doc.source_path) in present_files: if os.path.normpath(doc.source_path) in present_files:
present_files.remove(os.path.normpath(doc.source_path)) present_files.remove(os.path.normpath(doc.source_path))
@ -84,31 +91,31 @@ def check_sanity(progress=False):
with doc.source_file as f: with doc.source_file as f:
checksum = hashlib.md5(f.read()).hexdigest() checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e: except OSError as e:
messages.append(SanityError( messages.error(
f"Cannot read original file of document {doc.pk}: {e}")) f"Cannot read original file of document {doc.pk}: {e}")
else: else:
if not checksum == doc.checksum: if not checksum == doc.checksum:
messages.append(SanityError( messages.error(
f"Checksum mismatch of document {doc.pk}. " f"Checksum mismatch of document {doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}." f"Stored: {doc.checksum}, actual: {checksum}."
)) )
# Check sanity of the archive file. # Check sanity of the archive file.
if doc.archive_checksum and not doc.archive_filename: if doc.archive_checksum and not doc.archive_filename:
messages.append(SanityError( messages.error(
f"Document {doc.pk} has an archive file checksum, but no " f"Document {doc.pk} has an archive file checksum, but no "
f"archive filename." f"archive filename."
)) )
elif not doc.archive_checksum and doc.archive_filename: elif not doc.archive_checksum and doc.archive_filename:
messages.append(SanityError( messages.error(
f"Document {doc.pk} has an archive file, but its checksum is " f"Document {doc.pk} has an archive file, but its checksum is "
f"missing." f"missing."
)) )
elif doc.has_archive_version: elif doc.has_archive_version:
if not os.path.isfile(doc.archive_path): if not os.path.isfile(doc.archive_path):
messages.append(SanityError( messages.error(
f"Archived version of document {doc.pk} does not exist." f"Archived version of document {doc.pk} does not exist."
)) )
else: else:
if os.path.normpath(doc.archive_path) in present_files: if os.path.normpath(doc.archive_path) in present_files:
present_files.remove(os.path.normpath(doc.archive_path)) present_files.remove(os.path.normpath(doc.archive_path))
@ -116,26 +123,23 @@ def check_sanity(progress=False):
with doc.archive_file as f: with doc.archive_file as f:
checksum = hashlib.md5(f.read()).hexdigest() checksum = hashlib.md5(f.read()).hexdigest()
except OSError as e: except OSError as e:
messages.append(SanityError( messages.error(
f"Cannot read archive file of document {doc.pk}: {e}" f"Cannot read archive file of document {doc.pk}: {e}"
)) )
else: else:
if not checksum == doc.archive_checksum: if not checksum == doc.archive_checksum:
messages.append(SanityError( messages.error(
f"Checksum mismatch of archived document " f"Checksum mismatch of archived document "
f"{doc.pk}. " f"{doc.pk}. "
f"Stored: {doc.checksum}, actual: {checksum}." f"Stored: {doc.archive_checksum}, "
)) f"actual: {checksum}."
)
# other document checks # other document checks
if not doc.content: if not doc.content:
messages.append(SanityWarning( messages.info(f"Document {doc.pk} has no content.")
f"Document {doc.pk} has no content."
))
for extra_file in present_files: for extra_file in present_files:
messages.append(SanityWarning( messages.warning(f"Orphaned file in media dir: {extra_file}")
f"Orphaned file in media dir: {extra_file}"
))
return messages return messages

View File

@ -9,8 +9,7 @@ from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, load_classifier from documents.classifier import DocumentClassifier, load_classifier
from documents.consumer import Consumer, ConsumerError from documents.consumer import Consumer, ConsumerError
from documents.models import Document, Tag, DocumentType, Correspondent from documents.models import Document, Tag, DocumentType, Correspondent
from documents.sanity_checker import SanityFailedError from documents.sanity_checker import SanityCheckFailedException
logger = logging.getLogger("paperless.tasks") logger = logging.getLogger("paperless.tasks")
@ -94,8 +93,15 @@ def consume_file(path,
def sanity_check(): def sanity_check():
messages = sanity_checker.check_sanity() messages = sanity_checker.check_sanity()
if len(messages) > 0: messages.log_messages()
raise SanityFailedError(messages)
if messages.has_error():
raise SanityCheckFailedException(
"Sanity check failed with errors. See log.")
elif messages.has_warning():
return "Sanity check exited with warnings. See log."
elif len(messages) > 0:
return "Sanity check exited with infos. See log."
else: else:
return "No issues detected." return "No issues detected."

View File

@ -159,33 +159,20 @@ class TestCreateClassifier(TestCase):
class TestSanityChecker(DirectoriesMixin, TestCase): class TestSanityChecker(DirectoriesMixin, TestCase):
def test_no_errors(self): def test_no_issues(self):
with self.assertLogs() as capture: with self.assertLogs() as capture:
call_command("document_sanity_checker") call_command("document_sanity_checker")
self.assertEqual(len(capture.output), 1) self.assertEqual(len(capture.output), 1)
self.assertIn("No issues found.", capture.output[0]) self.assertIn("Sanity checker detected no issues.", capture.output[0])
@mock.patch("documents.management.commands.document_sanity_checker.logger.warning") def test_errors(self):
@mock.patch("documents.management.commands.document_sanity_checker.logger.error")
def test_warnings(self, error, warning):
doc = Document.objects.create(title="test", filename="test.pdf", checksum="d41d8cd98f00b204e9800998ecf8427e")
Path(doc.source_path).touch()
Path(doc.thumbnail_path).touch()
call_command("document_sanity_checker")
error.assert_not_called()
warning.assert_called()
@mock.patch("documents.management.commands.document_sanity_checker.logger.warning")
@mock.patch("documents.management.commands.document_sanity_checker.logger.error")
def test_errors(self, error, warning):
doc = Document.objects.create(title="test", content="test", filename="test.pdf", checksum="abc") doc = Document.objects.create(title="test", content="test", filename="test.pdf", checksum="abc")
Path(doc.source_path).touch() Path(doc.source_path).touch()
Path(doc.thumbnail_path).touch() Path(doc.thumbnail_path).touch()
call_command("document_sanity_checker") with self.assertLogs() as capture:
call_command("document_sanity_checker")
warning.assert_not_called() self.assertEqual(len(capture.output), 1)
error.assert_called() self.assertIn("Checksum mismatch of document", capture.output[0])

View File

@ -1,3 +1,4 @@
import logging
import os import os
import shutil import shutil
from pathlib import Path from pathlib import Path
@ -7,10 +8,59 @@ from django.conf import settings
from django.test import TestCase from django.test import TestCase
from documents.models import Document from documents.models import Document
from documents.sanity_checker import check_sanity, SanityFailedError from documents.sanity_checker import check_sanity, SanityCheckMessages
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
class TestSanityCheckMessages(TestCase):
def test_no_messages(self):
messages = SanityCheckMessages()
self.assertEqual(len(messages), 0)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Sanity checker detected no issues.")
def test_info(self):
messages = SanityCheckMessages()
messages.info("Something might be wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.INFO)
self.assertEqual(capture.records[0].message, "Something might be wrong")
def test_warning(self):
messages = SanityCheckMessages()
messages.warning("Something is wrong")
self.assertEqual(len(messages), 1)
self.assertFalse(messages.has_error())
self.assertTrue(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.WARNING)
self.assertEqual(capture.records[0].message, "Something is wrong")
def test_error(self):
messages = SanityCheckMessages()
messages.error("Something is seriously wrong")
self.assertEqual(len(messages), 1)
self.assertTrue(messages.has_error())
self.assertFalse(messages.has_warning())
with self.assertLogs() as capture:
messages.log_messages()
self.assertEqual(len(capture.output), 1)
self.assertEqual(capture.records[0].levelno, logging.ERROR)
self.assertEqual(capture.records[0].message, "Something is seriously wrong")
class TestSanityCheck(DirectoriesMixin, TestCase): class TestSanityCheck(DirectoriesMixin, TestCase):
def make_test_data(self): def make_test_data(self):
@ -33,72 +83,105 @@ class TestSanityCheck(DirectoriesMixin, TestCase):
def test_no_thumbnail(self): def test_no_thumbnail(self):
doc = self.make_test_data() doc = self.make_test_data()
os.remove(doc.thumbnail_path) os.remove(doc.thumbnail_path)
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Thumbnail of document .* does not exist")
def test_thumbnail_no_access(self): def test_thumbnail_no_access(self):
doc = self.make_test_data() doc = self.make_test_data()
os.chmod(doc.thumbnail_path, 0o000) os.chmod(doc.thumbnail_path, 0o000)
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Cannot read thumbnail file of document")
os.chmod(doc.thumbnail_path, 0o777) os.chmod(doc.thumbnail_path, 0o777)
def test_no_original(self): def test_no_original(self):
doc = self.make_test_data() doc = self.make_test_data()
os.remove(doc.source_path) os.remove(doc.source_path)
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Original of document .* does not exist.")
def test_original_no_access(self): def test_original_no_access(self):
doc = self.make_test_data() doc = self.make_test_data()
os.chmod(doc.source_path, 0o000) os.chmod(doc.source_path, 0o000)
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Cannot read original file of document")
os.chmod(doc.source_path, 0o777) os.chmod(doc.source_path, 0o777)
def test_original_checksum_mismatch(self): def test_original_checksum_mismatch(self):
doc = self.make_test_data() doc = self.make_test_data()
doc.checksum = "WOW" doc.checksum = "WOW"
doc.save() doc.save()
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Checksum mismatch of document")
def test_no_archive(self): def test_no_archive(self):
doc = self.make_test_data() doc = self.make_test_data()
os.remove(doc.archive_path) os.remove(doc.archive_path)
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Archived version of document .* does not exist.")
def test_archive_no_access(self): def test_archive_no_access(self):
doc = self.make_test_data() doc = self.make_test_data()
os.chmod(doc.archive_path, 0o000) os.chmod(doc.archive_path, 0o000)
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Cannot read archive file of document")
os.chmod(doc.archive_path, 0o777) os.chmod(doc.archive_path, 0o777)
def test_archive_checksum_mismatch(self): def test_archive_checksum_mismatch(self):
doc = self.make_test_data() doc = self.make_test_data()
doc.archive_checksum = "WOW" doc.archive_checksum = "WOW"
doc.save() doc.save()
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Checksum mismatch of archived document")
def test_empty_content(self): def test_empty_content(self):
doc = self.make_test_data() doc = self.make_test_data()
doc.content = "" doc.content = ""
doc.save() doc.save()
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertFalse(messages.has_error())
self.assertFalse(messages.has_warning())
self.assertEqual(len(messages), 1)
self.assertRegex(messages[0]['message'], "Document .* has no content.")
def test_orphaned_file(self): def test_orphaned_file(self):
doc = self.make_test_data() doc = self.make_test_data()
Path(self.dirs.originals_dir, "orphaned").touch() Path(self.dirs.originals_dir, "orphaned").touch()
self.assertEqual(len(check_sanity()), 1) messages = check_sanity()
self.assertFalse(messages.has_error())
def test_error_tostring(self): self.assertTrue(messages.has_warning())
Document.objects.create(title="test", checksum="dgfhj", archive_checksum="dfhg", content="", pk=1, filename="0000001.pdf", archive_filename="0000001.pdf") self.assertEqual(len(messages), 1)
string = str(SanityFailedError(check_sanity())) self.assertRegex(messages[0]['message'], "Orphaned file in media dir")
self.assertIsNotNone(string)
def test_archive_filename_no_checksum(self): def test_archive_filename_no_checksum(self):
doc = self.make_test_data() doc = self.make_test_data()
doc.archive_checksum = None doc.archive_checksum = None
doc.save() doc.save()
self.assertEqual(len(check_sanity()), 2) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 2)
self.assertRegex(messages[0]['message'], "has an archive file, but its checksum is missing.")
def test_archive_checksum_no_filename(self): def test_archive_checksum_no_filename(self):
doc = self.make_test_data() doc = self.make_test_data()
doc.archive_filename = None doc.archive_filename = None
doc.save() doc.save()
self.assertEqual(len(check_sanity()), 2) messages = check_sanity()
self.assertTrue(messages.has_error())
self.assertEqual(len(messages), 2)
self.assertRegex(messages[0]['message'], "has an archive file checksum, but no archive filename.")

View File

@ -2,12 +2,12 @@ import os
from unittest import mock from unittest import mock
from django.conf import settings from django.conf import settings
from django.test import TestCase, override_settings from django.test import TestCase
from django.utils import timezone from django.utils import timezone
from documents import tasks from documents import tasks
from documents.models import Document, Tag, Correspondent, DocumentType from documents.models import Document, Tag, Correspondent, DocumentType
from documents.sanity_checker import SanityError, SanityFailedError from documents.sanity_checker import SanityCheckMessages, SanityCheckFailedException
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
@ -74,13 +74,33 @@ class TestTasks(DirectoriesMixin, TestCase):
self.assertNotEqual(mtime2, mtime3) self.assertNotEqual(mtime2, mtime3)
@mock.patch("documents.tasks.sanity_checker.check_sanity") @mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check(self, m): def test_sanity_check_success(self, m):
m.return_value = [] m.return_value = SanityCheckMessages()
tasks.sanity_check() self.assertEqual(tasks.sanity_check(), "No issues detected.")
m.assert_called_once() m.assert_called_once()
m.reset_mock()
m.return_value = [SanityError("")] @mock.patch("documents.tasks.sanity_checker.check_sanity")
self.assertRaises(SanityFailedError, tasks.sanity_check) def test_sanity_check_error(self, m):
messages = SanityCheckMessages()
messages.error("Some error")
m.return_value = messages
self.assertRaises(SanityCheckFailedException, tasks.sanity_check)
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_warning(self, m):
messages = SanityCheckMessages()
messages.warning("Some warning")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with warnings. See log.")
m.assert_called_once()
@mock.patch("documents.tasks.sanity_checker.check_sanity")
def test_sanity_check_info(self, m):
messages = SanityCheckMessages()
messages.info("Some info")
m.return_value = messages
self.assertEqual(tasks.sanity_check(), "Sanity check exited with infos. See log.")
m.assert_called_once() m.assert_called_once()
def test_bulk_update_documents(self): def test_bulk_update_documents(self):