filename handling for archive files.

This commit is contained in:
jonaswinkler 2020-11-30 21:38:21 +01:00
parent aaa6599283
commit 8a5c782425
7 changed files with 111 additions and 35 deletions

View File

@ -193,6 +193,8 @@ class Consumer(LoggingMixin):
# After everything is in the database, copy the files into # After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction. # place. If this fails, we'll also rollback the transaction.
# TODO: not required, since this is done by the file handling
# logic
create_source_path_directory(document.source_path) create_source_path_directory(document.source_path)
self._write(document.storage_type, self._write(document.storage_type,

View File

@ -11,10 +11,13 @@ def create_source_path_directory(source_path):
# TODO: also make this work for archive dir # TODO: also make this work for archive dir
def delete_empty_directories(directory): def delete_empty_directories(directory, root):
if not os.path.isdir(directory):
return
# Go up in the directory hierarchy and try to delete all directories # Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory) directory = os.path.normpath(directory)
root = os.path.normpath(settings.ORIGINALS_DIR) root = os.path.normpath(root)
if not directory.startswith(root + os.path.sep): if not directory.startswith(root + os.path.sep):
# don't do anything outside our originals folder. # don't do anything outside our originals folder.
@ -102,3 +105,8 @@ def generate_filename(doc):
filename += ".gpg" filename += ".gpg"
return filename return filename
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"

View File

@ -11,6 +11,7 @@ from django.db import models
from django.utils import timezone from django.utils import timezone
from django.utils.text import slugify from django.utils.text import slugify
from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension from documents.parsers import get_default_file_extension
@ -233,9 +234,10 @@ class Document(models.Model):
@property @property
def archive_path(self): def archive_path(self):
fname = "{:07}{}".format(self.pk, ".pdf") if self.filename:
if self.storage_type == self.STORAGE_TYPE_GPG: fname = archive_name_from_filename(self.filename)
fname += ".gpg" else:
fname = "{:07}.pdf".format(self.pk)
return os.path.join( return os.path.join(
settings.ARCHIVE_DIR, settings.ARCHIVE_DIR,

View File

@ -13,7 +13,7 @@ from rest_framework.reverse import reverse
from .. import index, matching from .. import index, matching
from ..file_handling import delete_empty_directories, generate_filename, \ from ..file_handling import delete_empty_directories, generate_filename, \
create_source_path_directory create_source_path_directory, archive_name_from_filename
from ..models import Document, Tag from ..models import Document, Tag
@ -175,13 +175,40 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
if os.path.isfile(f): if os.path.isfile(f):
try: try:
os.unlink(f) os.unlink(f)
logging.getLogger(__name__).debug(
f"Deleted file {f}.")
except OSError as e: except OSError as e:
logging.getLogger(__name__).warning( logging.getLogger(__name__).warning(
f"While deleting document {instance.file_name}, the file " f"While deleting document {instance.file_name}, the file "
f"{f} could not be deleted: {e}" f"{f} could not be deleted: {e}"
) )
delete_empty_directories(os.path.dirname(instance.source_path)) delete_empty_directories(
os.path.dirname(instance.source_path),
root=settings.ORIGINALS_DIR
)
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
def validate_move(instance, old_path, new_path):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal(
f"Document {str(instance)}: File {old_path} has gone.")
return False
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
logging.getLogger(__name__).warning(
f"Document {str(instance)}: Cannot rename file "
f"since target path {new_path} already exists.")
return False
return True
@receiver(models.signals.m2m_changed, sender=Document.tags.through) @receiver(models.signals.m2m_changed, sender=Document.tags.through)
@ -189,55 +216,90 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
def update_filename_and_move_files(sender, instance, **kwargs): def update_filename_and_move_files(sender, instance, **kwargs):
if not instance.filename: if not instance.filename:
# Can't update the filename if there is not filename to begin with # Can't update the filename if there is no filename to begin with
# This happens after the consumer creates a new document. # This happens when the consumer creates a new document.
# The PK needs to be set first by saving the document once. When this # The document is modified and saved multiple times, and only after
# happens, the file is not yet in the ORIGINALS_DIR, and thus can't be # everything is done (i.e., the generated filename is final),
# renamed anyway. In all other cases, instance.filename will be set. # filename will be set to the location where the consumer has put
# the file.
#
# This will in turn cause this logic to move the file where it belongs.
return return
old_filename = instance.filename old_filename = instance.filename
old_path = instance.source_path
new_filename = generate_filename(instance) new_filename = generate_filename(instance)
if new_filename == instance.filename: if new_filename == instance.filename:
# Don't do anything if its the same. # Don't do anything if its the same.
return return
new_path = os.path.join(settings.ORIGINALS_DIR, new_filename) old_source_path = instance.source_path
new_source_path = os.path.join(settings.ORIGINALS_DIR, new_filename)
if not os.path.isfile(old_path): if not validate_move(instance, old_source_path, new_source_path):
# Can't do anything if the old file does not exist anymore.
logging.getLogger(__name__).fatal(
f"Document {str(instance)}: File {old_path} has gone.")
return return
if os.path.isfile(new_path): # archive files are optional, archive checksum tells us if we have one,
# Can't do anything if the new file already exists. Skip updating file. # since this is None for documents without archived files.
logging.getLogger(__name__).warning( if instance.archive_checksum:
f"Document {str(instance)}: Cannot rename file " new_archive_filename = archive_name_from_filename(new_filename)
f"since target path {new_path} already exists.") old_archive_path = instance.archive_path
return new_archive_path = os.path.join(settings.ARCHIVE_DIR,
new_archive_filename)
create_source_path_directory(new_path) if not validate_move(instance, old_archive_path, new_archive_path):
return
create_source_path_directory(new_archive_path)
else:
old_archive_path = None
new_archive_path = None
create_source_path_directory(new_source_path)
try: try:
os.rename(old_path, new_path) os.rename(old_source_path, new_source_path)
if instance.archive_checksum:
os.rename(old_archive_path, new_archive_path)
instance.filename = new_filename instance.filename = new_filename
# Don't save here to prevent infinite recursion. # Don't save here to prevent infinite recursion.
Document.objects.filter(pk=instance.pk).update(filename=new_filename) Document.objects.filter(pk=instance.pk).update(filename=new_filename)
logging.getLogger(__name__).debug( logging.getLogger(__name__).debug(
f"Moved file {old_path} to {new_path}.") f"Moved file {old_source_path} to {new_source_path}.")
logging.getLogger(__name__).debug(
f"Moved file {old_archive_path} to {new_archive_path}.")
except OSError as e: except OSError as e:
instance.filename = old_filename instance.filename = old_filename
# this happens when we can't move a file. If that's the case for the
# archive file, we try our best to revert the changes.
try:
os.rename(new_source_path, old_source_path)
os.rename(new_archive_path, old_archive_path)
except:
# This is fine, since:
# A: if we managed to move source from A to B, we will also manage
# to move it from B to A. If not, we have a serious issue
# that's going to get caught by the santiy checker.
# all files remain in place and will never be overwritten,
# so this is not the end of the world.
# B: if moving the orignal file failed, nothing has changed anyway.
pass
except DatabaseError as e: except DatabaseError as e:
os.rename(new_path, old_path) os.rename(new_source_path, old_source_path)
if instance.archive_checksum:
os.rename(new_archive_path, old_archive_path)
instance.filename = old_filename instance.filename = old_filename
if not os.path.isfile(old_path): if not os.path.isfile(old_source_path):
delete_empty_directories(os.path.dirname(old_path)) delete_empty_directories(os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR)
if old_archive_path and not os.path.isfile(old_archive_path):
delete_empty_directories(os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR)
def set_log_entry(sender, document=None, logging_group=None, **kwargs): def set_log_entry(sender, document=None, logging_group=None, **kwargs):

View File

@ -13,8 +13,8 @@ from documents.sanity_checker import SanityFailedError
def index_optimize(): def index_optimize():
ix = index.open_index() ix = index.open_index()
with AsyncWriter(ix) as writer: writer = AsyncWriter(ix)
writer.commit(optimize=True) writer.commit(optimize=True)
def index_reindex(): def index_reindex():

View File

@ -110,10 +110,12 @@ class DocumentApiTest(DirectoriesMixin, APITestCase):
with open(filename, "wb") as f: with open(filename, "wb") as f:
f.write(content) f.write(content)
doc = Document.objects.create(title="none", filename=os.path.basename(filename), filename = os.path.basename(filename)
doc = Document.objects.create(title="none", filename=filename,
mime_type="application/pdf") mime_type="application/pdf")
with open(os.path.join(self.dirs.archive_dir, "{:07d}.pdf".format(doc.pk)), "wb") as f: with open(doc.archive_path, "wb") as f:
f.write(content_archive) f.write(content_archive)
response = self.client.get('/api/documents/{}/download/'.format(doc.pk)) response = self.client.get('/api/documents/{}/download/'.format(doc.pk))

View File

@ -321,7 +321,7 @@ class TestDate(TestCase):
Path(os.path.join(tmp, "notempty", "file")).touch() Path(os.path.join(tmp, "notempty", "file")).touch()
os.makedirs(os.path.join(tmp, "notempty", "empty")) os.makedirs(os.path.join(tmp, "notempty", "empty"))
delete_empty_directories(os.path.join(tmp, "notempty", "empty")) delete_empty_directories(os.path.join(tmp, "notempty", "empty"), root=settings.ORIGINALS_DIR)
self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True) self.assertEqual(os.path.isdir(os.path.join(tmp, "notempty")), True)
self.assertEqual(os.path.isfile( self.assertEqual(os.path.isfile(
os.path.join(tmp, "notempty", "file")), True) os.path.join(tmp, "notempty", "file")), True)