diff --git a/src/documents/consumer.py b/src/documents/consumer.py index b2e8e1614..e8f527974 100755 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -193,6 +193,8 @@ class Consumer(LoggingMixin): # After everything is in the database, copy the files into # place. If this fails, we'll also rollback the transaction. + # TODO: not required, since this is done by the file handling + # logic create_source_path_directory(document.source_path) self._write(document.storage_type, diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index 179c97492..1f6d59c4e 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -11,10 +11,13 @@ def create_source_path_directory(source_path): # TODO: also make this work for archive dir -def delete_empty_directories(directory): +def delete_empty_directories(directory, root): + if not os.path.isdir(directory): + return + # Go up in the directory hierarchy and try to delete all directories directory = os.path.normpath(directory) - root = os.path.normpath(settings.ORIGINALS_DIR) + root = os.path.normpath(root) if not directory.startswith(root + os.path.sep): # don't do anything outside our originals folder. @@ -102,3 +105,8 @@ def generate_filename(doc): filename += ".gpg" return filename + + +def archive_name_from_filename(filename): + + return os.path.splitext(filename)[0] + ".pdf" diff --git a/src/documents/models.py b/src/documents/models.py index d1a88043d..a4f887d77 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -11,6 +11,7 @@ from django.db import models from django.utils import timezone from django.utils.text import slugify +from documents.file_handling import archive_name_from_filename from documents.parsers import get_default_file_extension @@ -233,9 +234,10 @@ class Document(models.Model): @property def archive_path(self): - fname = "{:07}{}".format(self.pk, ".pdf") - if self.storage_type == self.STORAGE_TYPE_GPG: - fname += ".gpg" + if self.filename: + fname = archive_name_from_filename(self.filename) + else: + fname = "{:07}.pdf".format(self.pk) return os.path.join( settings.ARCHIVE_DIR, diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py index 435143792..9bb022e3f 100755 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -13,7 +13,7 @@ from rest_framework.reverse import reverse from .. import index, matching from ..file_handling import delete_empty_directories, generate_filename, \ - create_source_path_directory + create_source_path_directory, archive_name_from_filename from ..models import Document, Tag @@ -175,13 +175,40 @@ def cleanup_document_deletion(sender, instance, using, **kwargs): if os.path.isfile(f): try: os.unlink(f) + logging.getLogger(__name__).debug( + f"Deleted file {f}.") except OSError as e: logging.getLogger(__name__).warning( f"While deleting document {instance.file_name}, the file " f"{f} could not be deleted: {e}" ) - delete_empty_directories(os.path.dirname(instance.source_path)) + delete_empty_directories( + os.path.dirname(instance.source_path), + root=settings.ORIGINALS_DIR + ) + + delete_empty_directories( + os.path.dirname(instance.archive_path), + root=settings.ARCHIVE_DIR + ) + + +def validate_move(instance, old_path, new_path): + if not os.path.isfile(old_path): + # Can't do anything if the old file does not exist anymore. + logging.getLogger(__name__).fatal( + f"Document {str(instance)}: File {old_path} has gone.") + return False + + if os.path.isfile(new_path): + # Can't do anything if the new file already exists. Skip updating file. + logging.getLogger(__name__).warning( + f"Document {str(instance)}: Cannot rename file " + f"since target path {new_path} already exists.") + return False + + return True @receiver(models.signals.m2m_changed, sender=Document.tags.through) @@ -189,55 +216,90 @@ def cleanup_document_deletion(sender, instance, using, **kwargs): def update_filename_and_move_files(sender, instance, **kwargs): if not instance.filename: - # Can't update the filename if there is not filename to begin with - # This happens after the consumer creates a new document. - # The PK needs to be set first by saving the document once. When this - # happens, the file is not yet in the ORIGINALS_DIR, and thus can't be - # renamed anyway. In all other cases, instance.filename will be set. + # Can't update the filename if there is no filename to begin with + # This happens when the consumer creates a new document. + # The document is modified and saved multiple times, and only after + # everything is done (i.e., the generated filename is final), + # filename will be set to the location where the consumer has put + # the file. + # + # This will in turn cause this logic to move the file where it belongs. return old_filename = instance.filename - old_path = instance.source_path new_filename = generate_filename(instance) if new_filename == instance.filename: # Don't do anything if its the same. return - new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) + old_source_path = instance.source_path + new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename) - if not os.path.isfile(old_path): - # Can't do anything if the old file does not exist anymore. - logging.getLogger(__name__).fatal( - f"Document {str(instance)}: File {old_path} has gone.") + if not validate_move(instance, old_source_path, new_source_path): return - if os.path.isfile(new_path): - # Can't do anything if the new file already exists. Skip updating file. - logging.getLogger(__name__).warning( - f"Document {str(instance)}: Cannot rename file " - f"since target path {new_path} already exists.") - return + # archive files are optional, archive checksum tells us if we have one, + # since this is None for documents without archived files. + if instance.archive_checksum: + new_archive_filename = archive_name_from_filename(new_filename) + old_archive_path = instance.archive_path + new_archive_path = os.path.join(settings.ARCHIVE_DIR, + new_archive_filename) - create_source_path_directory(new_path) + if not validate_move(instance, old_archive_path, new_archive_path): + return + + create_source_path_directory(new_archive_path) + else: + old_archive_path = None + new_archive_path = None + + create_source_path_directory(new_source_path) try: - os.rename(old_path, new_path) + os.rename(old_source_path, new_source_path) + if instance.archive_checksum: + os.rename(old_archive_path, new_archive_path) instance.filename = new_filename # Don't save here to prevent infinite recursion. Document.objects.filter(pk=instance.pk).update(filename=new_filename) logging.getLogger(__name__).debug( - f"Moved file {old_path} to {new_path}.") + f"Moved file {old_source_path} to {new_source_path}.") + + logging.getLogger(__name__).debug( + f"Moved file {old_archive_path} to {new_archive_path}.") except OSError as e: instance.filename = old_filename + # this happens when we can't move a file. If that's the case for the + # archive file, we try our best to revert the changes. + try: + os.rename(new_source_path, old_source_path) + os.rename(new_archive_path, old_archive_path) + except: + # This is fine, since: + # A: if we managed to move source from A to B, we will also manage + # to move it from B to A. If not, we have a serious issue + # that's going to get caught by the santiy checker. + # all files remain in place and will never be overwritten, + # so this is not the end of the world. + # B: if moving the orignal file failed, nothing has changed anyway. + pass except DatabaseError as e: - os.rename(new_path, old_path) + os.rename(new_source_path, old_source_path) + if instance.archive_checksum: + os.rename(new_archive_path, old_archive_path) instance.filename = old_filename - if not os.path.isfile(old_path): - delete_empty_directories(os.path.dirname(old_path)) + if not os.path.isfile(old_source_path): + delete_empty_directories(os.path.dirname(old_source_path), + root=settings.ORIGINALS_DIR) + + if old_archive_path and not os.path.isfile(old_archive_path): + delete_empty_directories(os.path.dirname(old_archive_path), + root=settings.ARCHIVE_DIR) def set_log_entry(sender, document=None, logging_group=None, **kwargs): diff --git a/src/documents/tasks.py b/src/documents/tasks.py index cd47892be..65d767efc 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -13,8 +13,8 @@ from documents.sanity_checker import SanityFailedError def index_optimize(): ix = index.open_index() - with AsyncWriter(ix) as writer: - writer.commit(optimize=True) + writer = AsyncWriter(ix) + writer.commit(optimize=True) def index_reindex(): diff --git a/src/documents/tests/test_api.py b/src/documents/tests/test_api.py index df5775145..7f57d090b 100644 --- a/src/documents/tests/test_api.py +++ b/src/documents/tests/test_api.py @@ -110,10 +110,12 @@ class DocumentApiTest(DirectoriesMixin, APITestCase): with open(filename, "wb") as f: f.write(content) - doc = Document.objects.create(title="none", filename=os.path.basename(filename), + filename = os.path.basename(filename) + + doc = Document.objects.create(title="none", filename=filename, mime_type="application/pdf") - with open(os.path.join(self.dirs.archive_dir, "{:07d}.pdf".format(doc.pk)), "wb") as f: + with open(doc.archive_path, "wb") as f: f.write(content_archive) response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) diff --git a/src/documents/tests/test_file_handling.py b/src/documents/tests/test_file_handling.py index d799384e7..71f6cc4db 100644 --- a/src/documents/tests/test_file_handling.py +++ b/src/documents/tests/test_file_handling.py @@ -321,7 +321,7 @@ class TestDate(TestCase): Path(os.path.join(tmp, "notempty", "file")).touch() os.makedirs(os.path.join(tmp, "notempty", "empty")) - delete_empty_directories(os.path.join(tmp, "notempty", "empty")) + delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR) self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) self.assertEqual(os.path.isfile( os.path.join(tmp, "notempty", "file")), True)