diff --git a/src/documents/migrations/1004_sanity_check_schedule.py b/src/documents/migrations/1004_sanity_check_schedule.py
new file mode 100644
index 000000000..b6346d479
--- /dev/null
+++ b/src/documents/migrations/1004_sanity_check_schedule.py
@@ -0,0 +1,28 @@
+# Generated by Django 3.1.3 on 2020-11-25 14:53
+
+from django.db import migrations
+from django.db.migrations import RunPython
+from django_q.models import Schedule
+from django_q.tasks import schedule
+
+
+def add_schedules(apps, schema_editor):
+    """Register the weekly sanity check task with django-q."""
+    schedule('documents.tasks.sanity_check', name="Perform sanity check", schedule_type=Schedule.WEEKLY)
+
+
+def remove_schedules(apps, schema_editor):
+    """Reverse of add_schedules: delete the sanity check schedule(s)."""
+    Schedule.objects.filter(func='documents.tasks.sanity_check').delete()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '1003_mime_types'),
+        ('django_q', '0013_task_attempt_count'),
+    ]
+
+    operations = [
+        RunPython(add_schedules, remove_schedules)
+    ]
diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py
new file mode 100644
index 000000000..18bb3781c
--- /dev/null
+++ b/src/documents/sanity_checker.py
@@ -0,0 +1,114 @@
+import hashlib
+import os
+
+from django.conf import settings
+
+from documents.models import Document
+
+
+class SanityMessage:
+    """Base class for a single finding of the sanity checker."""
+
+    message = None
+
+
+class SanityWarning(SanityMessage):
+    """Finding used for issues such as empty content or orphaned files."""
+
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        return f"Warning: {self.message}"
+
+
+class SanityError(SanityMessage):
+    """Finding used for missing, unreadable or altered document files."""
+
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        return f"ERROR: {self.message}"
+
+
+class SanityFailedError(Exception):
+    """Raised to report the messages collected by the sanity checker."""
+
+    def __init__(self, messages):
+        self.messages = messages
+
+    def __str__(self):
+        message_string = "\n".join(str(m) for m in self.messages)
+        return (
+            f"The following issues were found by the sanity checker:\n"
+            f"{message_string}\n\n===============\n\n")
+
+
+def check_sanity():
+    """Check the consistency of all documents and the media directory.
+
+    For every document, verify that the thumbnail and the original exist
+    and can be read, that the original matches the stored MD5 checksum,
+    and that the document has content. Files below MEDIA_ROOT that are
+    not claimed by any document are reported as orphaned.
+
+    Returns a list of SanityError and SanityWarning objects, which is
+    empty if no issues were found.
+    """
+    messages = []
+
+    # Use a set: removing claimed files is cheap, and discard() does not
+    # raise for files outside of MEDIA_ROOT or files claimed twice.
+    present_files = set()
+    for root, _subdirs, files in os.walk(settings.MEDIA_ROOT):
+        for f in files:
+            present_files.add(os.path.normpath(os.path.join(root, f)))
+
+    for doc in Document.objects.all():
+        # Check thumbnail
+        if not os.path.isfile(doc.thumbnail_path):
+            messages.append(SanityError(
+                f"Thumbnail of document {doc.pk} does not exist."))
+        else:
+            present_files.discard(os.path.normpath(doc.thumbnail_path))
+            try:
+                with doc.thumbnail_file as f:
+                    f.read()
+            except OSError as e:
+                messages.append(SanityError(
+                    f"Cannot read thumbnail file of document {doc.pk}: {e}"
+                ))
+
+        # Check document
+        if not os.path.isfile(doc.source_path):
+            messages.append(SanityError(
+                f"Original of document {doc.pk} does not exist."))
+        else:
+            present_files.discard(os.path.normpath(doc.source_path))
+            checksum = None
+            try:
+                with doc.source_file as f:
+                    checksum = hashlib.md5(f.read()).hexdigest()
+            except OSError as e:
+                messages.append(SanityError(
+                    f"Cannot read original file of document {doc.pk}: {e}"))
+
+            if checksum and checksum != doc.checksum:
+                messages.append(SanityError(
+                    f"Checksum mismatch of document {doc.pk}. "
+                    f"Stored: {doc.checksum}, actual: {checksum}."
+                ))
+
+        if not doc.content:
+            messages.append(SanityWarning(
+                f"Document {doc.pk} has no content."
+            ))
+
+    # Sort the remaining files so the report order is deterministic.
+    for extra_file in sorted(present_files):
+        messages.append(SanityWarning(
+            f"Orphaned file in media dir: {extra_file}"
+        ))
+
+    return messages
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 40ed8f25e..3c9baad08 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -3,11 +3,12 @@ import logging
 from django.conf import settings
 from whoosh.writing import AsyncWriter
 
-from documents import index
+from documents import index, sanity_checker
 from documents.classifier import DocumentClassifier, \
     IncompatibleClassifierVersionError
 from documents.consumer import Consumer, ConsumerError
 from documents.models import Document
+from documents.sanity_checker import SanityFailedError
 
 
 def index_optimize():
@@ -74,3 +75,17 @@ def consume_file(path,
     else:
         raise ConsumerError("Unknown error: Returned document was null, but "
                             "no error message was given.")
+
+
+def sanity_check():
+    """Run the sanity checker and raise if it reports anything.
+
+    Returns a confirmation string if no issues were found and raises
+    SanityFailedError with all reported messages otherwise.
+    """
+    messages = sanity_checker.check_sanity()
+
+    if messages:
+        raise SanityFailedError(messages)
+
+    return "No issues detected."