From 2853545b9d132af97cea6dfae8ec195a5bc10c0c Mon Sep 17 00:00:00 2001
From: Daniel Quinn
Date: Sun, 3 Apr 2016 16:34:09 +0100
Subject: [PATCH] Added a checksum property and a couple indexes

---
Review note: GnuPG.decrypted() now raises when decryption fails instead of
returning empty data.  Hunk sizes and the diffstat below were updated to
match; the blob id on the new file's index line was not recomputed.

 .../migrations/0014_document_checksum.py      | 189 ++++++++++++++++++
 src/documents/models.py                       |  16 +-
 2 files changed, 203 insertions(+), 2 deletions(-)
 create mode 100644 src/documents/migrations/0014_document_checksum.py

diff --git a/src/documents/migrations/0014_document_checksum.py b/src/documents/migrations/0014_document_checksum.py
new file mode 100644
index 000000000..167245dea
--- /dev/null
+++ b/src/documents/migrations/0014_document_checksum.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.4 on 2016-03-28 19:09
+from __future__ import unicode_literals
+
+import gnupg
+import hashlib
+import os
+
+import django.utils.timezone
+from django.conf import settings
+from django.db import migrations, models
+from django.template.defaultfilters import slugify
+from django.utils.termcolors import colorize as colourise  # Spelling hurts me
+
+
+class GnuPG(object):
+    """
+    A handy singleton to use when handling encrypted files.
+    """
+
+    gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
+
+    @classmethod
+    def decrypted(cls, file_handle):
+        """
+        Return the decrypted contents of ``file_handle``.
+
+        Raises RuntimeError if GnuPG reports a failure.  Otherwise a wrong
+        passphrase would hand back empty data for every document, and they
+        would all hash to the same checksum and be reported as duplicates.
+        """
+        result = cls.gpg.decrypt_file(
+            file_handle, passphrase=settings.PASSPHRASE)
+        if not result.ok:
+            raise RuntimeError("Unable to decrypt {}: {}".format(
+                getattr(file_handle, "name", "document"), result.status))
+        return result.data
+
+    @classmethod
+    def encrypted(cls, file_handle):
+        """
+        Return the contents of ``file_handle``, symmetrically encrypted.
+        """
+        return cls.gpg.encrypt_file(
+            file_handle,
+            recipients=None,
+            passphrase=settings.PASSPHRASE,
+            symmetric=True
+        ).data
+
+
+class Document(object):
+    """
+    Django's migrations restrict access to model methods, so this is a snapshot
+    of the methods that existed at the time this migration was written, since
+    we need to make use of a lot of these shortcuts here.
+    """
+
+    def __init__(self, doc):
+        self.pk = doc.pk
+        self.correspondent = doc.correspondent
+        self.title = doc.title
+        self.file_type = doc.file_type
+        self.tags = doc.tags
+        self.created = doc.created
+
+    def __str__(self):
+        created = self.created.strftime("%Y%m%d%H%M%S")
+        if self.correspondent and self.title:
+            return "{}: {} - {}".format(
+                created, self.correspondent, self.title)
+        if self.correspondent or self.title:
+            return "{}: {}".format(created, self.correspondent or self.title)
+        return str(created)
+
+    @property
+    def source_path(self):
+        return os.path.join(
+            settings.MEDIA_ROOT,
+            "documents",
+            "originals",
+            "{:07}.{}.gpg".format(self.pk, self.file_type)
+        )
+
+    @property
+    def source_file(self):
+        return open(self.source_path, "rb")
+
+    @property
+    def file_name(self):
+        return slugify(str(self)) + "." + self.file_type
+
+
+def set_checksums(apps, schema_editor):
+    """
+    Store the MD5 checksum of every existing document's decrypted original.
+
+    Raises RuntimeError when two documents produce the same checksum, since
+    the column is made unique by a later operation in this migration.
+    """
+
+    document_model = apps.get_model("documents", "Document")
+
+    if not document_model.objects.all().exists():
+        return
+
+    print(colourise(
+        "\n\n"
+        " This is a one-time only migration to generate checksums for all\n"
+        " of your existing documents. If you have a lot of documents\n"
+        " though, this may take a while, so a coffee break may be in\n"
+        " order."
+        "\n", opts=("bold",)
+    ))
+
+    # checksum -> (pk, file_name) of the first document seen with it
+    sums = {}
+    for d in document_model.objects.all():
+
+        document = Document(d)
+
+        print(" {} {} {}".format(
+            colourise("*", fg="green"),
+            colourise("Generating a checksum for", fg="white"),
+            colourise(document.file_name, fg="cyan")
+        ))
+
+        with document.source_file as encrypted:
+            checksum = hashlib.md5(GnuPG.decrypted(encrypted)).hexdigest()
+
+        if checksum in sums:
+            error = "\n{line}{p1}\n\n{doc1}\n{doc2}\n\n{p2}\n\n{code}\n\n{p3}{line}".format(
+                p1=colourise("It appears that you have two identical documents in your collection and \nPaperless no longer supports this (see issue #97). The documents in question\nare:", fg="yellow"),
+                p2=colourise("To fix this problem, you'll have to remove one of them from the database, a task\nmost easily done by running the following command in the same\ndirectory as manage.py:", fg="yellow"),
+                p3=colourise("When that's finished, re-run the migrate, and provided that there aren't any\nother duplicates, you should be good to go.", fg="yellow"),
+                doc1=colourise(" * {} (id: {})".format(sums[checksum][1], sums[checksum][0]), fg="red"),
+                doc2=colourise(" * {} (id: {})".format(document.file_name, document.pk), fg="red"),
+                code=colourise(" $ echo 'DELETE FROM documents_document WHERE id = {pk};' | ./manage.py dbshell".format(pk=document.pk), fg="green"),
+                line=colourise("\n{}\n".format("=" * 80), fg="white", opts=("bold",))
+            )
+            raise RuntimeError(error)
+        sums[checksum] = (document.pk, document.file_name)
+
+        document_model.objects.filter(pk=document.pk).update(checksum=checksum)
+
+
+def do_nothing(apps, schema_editor):
+    """No-op used as the reverse of set_checksums."""
+    pass
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0013_auto_20160325_2111'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='document',
+            name='checksum',
+            field=models.CharField(
+                default='-',
+                db_index=True,
+                editable=False,
+                max_length=32,
+                help_text='The checksum of the original document (before it '
+                          'was encrypted). We use this to prevent duplicate '
+                          'document imports.',
+            ),
+            preserve_default=False,
+        ),
+        migrations.RunPython(set_checksums, do_nothing),
+        migrations.AlterField(
+            model_name='document',
+            name='created',
+            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='modified',
+            field=models.DateTimeField(auto_now=True, db_index=True),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='checksum',
+            field=models.CharField(editable=False, help_text='The checksum of the original document (before it was encrypted). We use this to prevent duplicate document imports.', max_length=32, unique=True),
+        ),
+    ]
diff --git a/src/documents/models.py b/src/documents/models.py
index 425ca4c77..6dd2cbb63 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -156,8 +156,20 @@ class Document(models.Model):
     )
     tags = models.ManyToManyField(
         Tag, related_name="documents", blank=True)
-    created = models.DateTimeField(default=timezone.now)
-    modified = models.DateTimeField(auto_now=True, editable=False)
+
+    checksum = models.CharField(
+        max_length=32,
+        editable=False,
+        unique=True,
+        help_text="The checksum of the original document (before it was "
+                  "encrypted). We use this to prevent duplicate document "
+                  "imports."
+    )
+
+    created = models.DateTimeField(
+        default=timezone.now, db_index=True)
+    modified = models.DateTimeField(
+        auto_now=True, editable=False, db_index=True)
 
     class Meta(object):
         ordering = ("correspondent", "title")