added a simple sanity checker.

This commit is contained in:
Jonas Winkler 2020-11-25 16:04:58 +01:00
parent 6aca09d485
commit 751c2ac54b
3 changed files with 131 additions and 1 deletions

View File

@ -0,0 +1,26 @@
# Generated by Django 3.1.3 on 2020-11-25 14:53
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
    """Forward step: create the weekly schedule for the sanity check task.

    Both parameters are part of the ``RunPython`` callback signature and are
    not used here.
    """
    task_path = 'documents.tasks.sanity_check'
    schedule(
        task_path,
        name="Perform sanity check",
        schedule_type=Schedule.WEEKLY,
    )
def remove_schedules(apps, schema_editor):
    """Reverse step: delete all schedules that run the sanity check task.

    Both parameters are part of the ``RunPython`` callback signature and are
    not used here.
    """
    sanity_schedules = Schedule.objects.filter(
        func='documents.tasks.sanity_check',
    )
    sanity_schedules.delete()
class Migration(migrations.Migration):
    """Data migration that adds (and on reversal removes) the sanity check
    schedule."""

    # Must run after these migrations of the two apps involved.
    dependencies = [
        ('documents', '1003_mime_types'),
        ('django_q', '0013_task_attempt_count'),
    ]

    operations = [
        RunPython(code=add_schedules, reverse_code=remove_schedules),
    ]

View File

@ -0,0 +1,94 @@
import hashlib
import os
from django.conf import settings
from documents.models import Document
class SanityMessage:
    """Common base type for the findings produced by the sanity checker."""

    # Class-level default; stays None unless an instance assigns its own text.
    message = None
class SanityWarning(SanityMessage):
    """A sanity checker finding that is rendered with a ``Warning:`` prefix."""

    def __init__(self, message):
        """Store the text describing the finding."""
        self.message = message

    def __str__(self):
        """Return the message text prefixed with ``Warning: ``."""
        text = self.message
        return f"Warning: {text}"
class SanityError(SanityMessage):
    """A sanity checker finding that is rendered with an ``ERROR:`` prefix."""

    def __init__(self, message):
        """Store the text describing the finding."""
        self.message = message

    def __str__(self):
        """Return the message text prefixed with ``ERROR: ``."""
        text = self.message
        return f"ERROR: {text}"
class SanityFailedError(Exception):
    """Raised when the sanity checker reported at least one message.

    Attributes:
        messages: the findings that caused the failure; each one is rendered
            with ``str()`` when the exception itself is converted to text.
    """

    def __init__(self, messages):
        """Keep the list of findings for later rendering by ``__str__``."""
        super().__init__(messages)
        self.messages = messages

    def __str__(self):
        """Return a report with one finding per line, followed by a separator."""
        message_string = "\n".join(str(m) for m in self.messages)
        return (
            "The following issues were found by the sanity checker:\n"
            f"{message_string}\n\n===============\n\n")
def check_sanity():
    """Compare the stored documents with the files in the media directory.

    All files below ``settings.MEDIA_ROOT`` are collected first. Then, for
    every ``Document``:

    * the thumbnail must exist and be readable,
    * the original must exist and be readable, and its MD5 checksum must
      equal ``doc.checksum``,
    * empty ``doc.content`` is reported as a warning.

    Files in the media directory that no document accounted for are reported
    as orphaned, in sorted path order.

    Returns:
        A list of ``SanityError`` and ``SanityWarning`` objects; the list is
        empty when nothing was found.
    """
    messages = []

    # A set is used instead of a list: ``discard`` is O(1) and, unlike
    # ``list.remove``, does not raise when a document's file is not part of
    # the walked tree or has already been accounted for by another document.
    present_files = set()
    for root, _subdirs, files in os.walk(settings.MEDIA_ROOT):
        for name in files:
            present_files.add(os.path.normpath(os.path.join(root, name)))

    for doc in Document.objects.all():
        # Check thumbnail
        if not os.path.isfile(doc.thumbnail_path):
            messages.append(SanityError(
                f"Thumbnail of document {doc.pk} does not exist."))
        else:
            present_files.discard(os.path.normpath(doc.thumbnail_path))
            try:
                # Reading the whole file verifies that it is accessible.
                with doc.thumbnail_file as f:
                    f.read()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
                ))

        # Check document
        if not os.path.isfile(doc.source_path):
            messages.append(SanityError(
                f"Original of document {doc.pk} does not exist."))
        else:
            present_files.discard(os.path.normpath(doc.source_path))
            try:
                with doc.source_file as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
            except OSError as e:
                messages.append(SanityError(
                    f"Cannot read original file of document {doc.pk}: {e}"))
            else:
                # Only compare when the file could actually be read.
                if checksum != doc.checksum:
                    messages.append(SanityError(
                        f"Checksum mismatch of document {doc.pk}. "
                        f"Stored: {doc.checksum}, actual: {checksum}."
                    ))

        if not doc.content:
            messages.append(SanityWarning(
                f"Document {doc.pk} has no content."
            ))

    # Whatever is left was not referenced by any document.
    messages.extend(
        SanityWarning(f"Orphaned file in media dir: {extra_file}")
        for extra_file in sorted(present_files)
    )

    return messages

View File

@ -3,11 +3,12 @@ import logging
from django.conf import settings
from whoosh.writing import AsyncWriter
from documents import index
from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, \
IncompatibleClassifierVersionError
from documents.consumer import Consumer, ConsumerError
from documents.models import Document
from documents.sanity_checker import SanityFailedError
def index_optimize():
@ -74,3 +75,12 @@ def consume_file(path,
else:
raise ConsumerError("Unknown error: Returned document was null, but "
"no error message was given.")
def sanity_check():
    """Run the sanity checker and turn its findings into a task outcome.

    Returns:
        The string ``"No issues detected."`` when the checker returned no
        messages.

    Raises:
        SanityFailedError: when the checker returned one or more messages;
            the messages are attached to the exception.
    """
    findings = sanity_checker.check_sanity()
    if findings:
        raise SanityFailedError(findings)
    return "No issues detected."