From fca8576d80cc99312b15b4d512cd528e618039d5 Mon Sep 17 00:00:00 2001
From: jonaswinkler
Date: Tue, 9 Feb 2021 19:46:19 +0100
Subject: [PATCH] archive filenames are now stored in the database and checked
 for collisions, just like original filenames; unified method for archive
 version checking

---
 src/documents/consumer.py                     |   7 +-
 src/documents/file_handling.py                |  31 +--
 .../management/commands/document_archiver.py  |  14 +-
 .../management/commands/document_exporter.py  |   2 +-
 .../migrations/1012_fix_archive_files.py      | 177 +++++++++++++-----
 src/documents/models.py                       |  33 +++-
 src/documents/sanity_checker.py               |   2 +-
 src/documents/signals/handlers.py             |  56 +++---
 src/documents/views.py                        |  12 +-
 9 files changed, 229 insertions(+), 105 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 3baedb5c3..acb3ad33f 100755
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -292,8 +292,7 @@ class Consumer(LoggingMixin):
                 # After everything is in the database, copy the files into
                 # place. If this fails, we'll also rollback the transaction.
                 with FileLock(settings.MEDIA_LOCK):
-                    document.filename = generate_unique_filename(
-                        document, settings.ORIGINALS_DIR)
+                    document.filename = generate_unique_filename(document)
                     create_source_path_directory(document.source_path)
 
                     self._write(document.storage_type,
@@ -303,6 +302,10 @@ class Consumer(LoggingMixin):
                                 thumbnail, document.thumbnail_path)
 
                     if archive_path and os.path.isfile(archive_path):
+                        document.archive_filename = generate_unique_filename(
+                            document,
+                            archive_filename=True
+                        )
                         create_source_path_directory(document.archive_path)
                         self._write(document.storage_type,
                                     archive_path, document.archive_path)
diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py
index 64858de78..535aa3d2c 100644
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@@ -79,12 +79,20 @@ def many_to_dictionary(field):
     return mydictionary
 
 
-def generate_unique_filename(doc, root):
+def generate_unique_filename(doc, archive_filename=False):
+    if archive_filename:
+        old_filename = doc.archive_filename
+        root = settings.ARCHIVE_DIR
+    else:
+        old_filename = doc.filename
+        root = settings.ORIGINALS_DIR
+
     counter = 0
 
     while True:
-        new_filename = generate_filename(doc, counter)
-        if new_filename == doc.filename:
+        new_filename = generate_filename(
+            doc, counter, archive_filename=archive_filename)
+        if new_filename == old_filename:
             # still the same as before.
             return new_filename

@@ -94,7 +102,7 @@
     return new_filename
 
 
-def generate_filename(doc, counter=0, append_gpg=True):
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
     path = ""
 
     try:
@@ -148,21 +156,16 @@
         f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
 
     counter_str = f"_{counter:02}" if counter else ""
+
+    filetype_str = ".pdf" if archive_filename else doc.file_type
+
     if len(path) > 0:
-        filename = f"{path}{counter_str}{doc.file_type}"
+        filename = f"{path}{counter_str}{filetype_str}"
     else:
-        filename = f"{doc.pk:07}{counter_str}{doc.file_type}"
+        filename = f"{doc.pk:07}{counter_str}{filetype_str}"
 
     # Append .gpg for encrypted files
     if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
         filename += ".gpg"
 
     return filename
-
-
-def archive_name_from_filename(filename):
-    name, ext = os.path.splitext(filename)
-    if ext == ".pdf":
-        return filename
-    else:
-        return filename + ".pdf"
diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py
index d2ff9c8c2..fe8c8b530 100644
--- a/src/documents/management/commands/document_archiver.py
+++ b/src/documents/management/commands/document_archiver.py
@@ -16,7 +16,8 @@ from whoosh.writing import AsyncWriter
 
 from documents.models import Document
 from ... import index
-from ...file_handling import create_source_path_directory
+from ...file_handling import create_source_path_directory, \
+    generate_unique_filename
 from ...parsers import get_parser_class_for_mime_type
 
 
@@ -39,13 +40,16 @@ def handle_document(document_id):
         with transaction.atomic():
             with open(parser.get_archive_path(), 'rb') as f:
                 checksum = hashlib.md5(f.read()).hexdigest()
-            # i'm going to save first so that in case the file move
+            # I'm going to save first so that in case the file move
             # fails, the database is rolled back.
-            # we also don't use save() since that triggers the filehandling
+            # We also don't use save() since that triggers the filehandling
             # logic, and we don't want that yet (file not yet in place)
+            document.archive_filename = generate_unique_filename(
+                document, archive_filename=True)
             Document.objects.filter(pk=document.pk).update(
                 archive_checksum=checksum,
-                content=parser.get_text()
+                content=parser.get_text(),
+                archive_filename=document.archive_filename
             )
             with FileLock(settings.MEDIA_LOCK):
                 create_source_path_directory(document.archive_path)
@@ -101,7 +105,7 @@ class Command(BaseCommand):
         document_ids = list(map(
             lambda doc: doc.id,
             filter(
-                lambda d: overwrite or not d.archive_checksum,
+                lambda d: overwrite or not d.has_archive_version,
                 documents
             )
         ))
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index 1505b0856..d8f5861a5 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -139,7 +139,7 @@ class Command(BaseCommand):
             thumbnail_target = os.path.join(self.target, thumbnail_name)
             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
 
-            if os.path.exists(document.archive_path):
+            if document.has_archive_version:
                 archive_name = base_name + "-archive.pdf"
                 archive_target = os.path.join(self.target, archive_name)
                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
diff --git a/src/documents/migrations/1012_fix_archive_files.py b/src/documents/migrations/1012_fix_archive_files.py
index f75ae431d..5db503391 100644
--- a/src/documents/migrations/1012_fix_archive_files.py
+++ b/src/documents/migrations/1012_fix_archive_files.py
@@ -1,43 +1,27 @@
 # Generated by Django 3.1.6 on 2021-02-07 22:26
+import datetime
 import hashlib
 import logging
 import os
 import shutil
 
+import pathvalidate
 from django.conf import settings
-from django.db import migrations
+from django.db import migrations, models
+from django.template.defaultfilters import slugify
+from documents.file_handling import defaultdictNoStr, many_to_dictionary
 
 logger = logging.getLogger("paperless.migrations")
 
 
-def archive_name_from_filename_old(filename):
+def archive_name_from_filename(filename):
     return os.path.splitext(filename)[0] + ".pdf"
 
 
 def archive_path_old(doc):
     if doc.filename:
-        fname = archive_name_from_filename_old(doc.filename)
-    else:
-        fname = "{:07}.pdf".format(doc.pk)
-
-    return os.path.join(
-        settings.ARCHIVE_DIR,
-        fname
-    )
-
-
-def archive_name_from_filename_new(filename):
-    name, ext = os.path.splitext(filename)
-    if ext == ".pdf":
-        return filename
-    else:
-        return filename + ".pdf"
-
-
-def archive_path_new(doc):
-    if doc.filename:
-        fname = archive_name_from_filename_new(doc.filename)
+        fname = archive_name_from_filename(doc.filename)
     else:
         fname = "{:07}.pdf".format(doc.pk)
 
@@ -50,6 +34,16 @@
 STORAGE_TYPE_GPG = "gpg"
 
 
+def archive_path_new(doc):
+    if doc.archive_filename is not None:
+        return os.path.join(
+            settings.ARCHIVE_DIR,
+            str(doc.archive_filename)
+        )
+    else:
+        return None
+
+
 def source_path(doc):
     if doc.filename:
         fname = str(doc.filename)
@@ -64,6 +58,98 @@ def source_path(doc):
     )
 
 
+def generate_unique_filename(doc, archive_filename=False):
+    if archive_filename:
+        old_filename = doc.archive_filename
+        root = settings.ARCHIVE_DIR
+    else:
+        old_filename = doc.filename
+        root = settings.ORIGINALS_DIR
+
+    counter = 0
+
+    while True:
+        new_filename = generate_filename(
+            doc, counter, archive_filename=archive_filename)
+        if new_filename == old_filename:
+            # still the same as before.
+            return new_filename
+
+        if os.path.exists(os.path.join(root, new_filename)):
+            counter += 1
+        else:
+            return new_filename
+
+
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
+    path = ""
+
+    try:
+        if settings.PAPERLESS_FILENAME_FORMAT is not None:
+            tags = defaultdictNoStr(lambda: slugify(None),
+                                    many_to_dictionary(doc.tags))
+
+            tag_list = pathvalidate.sanitize_filename(
+                ",".join(sorted(
+                    [tag.name for tag in doc.tags.all()]
+                )),
+                replacement_text="-"
+            )
+
+            if doc.correspondent:
+                correspondent = pathvalidate.sanitize_filename(
+                    doc.correspondent.name, replacement_text="-"
+                )
+            else:
+                correspondent = "none"
+
+            if doc.document_type:
+                document_type = pathvalidate.sanitize_filename(
+                    doc.document_type.name, replacement_text="-"
+                )
+            else:
+                document_type = "none"
+
+            path = settings.PAPERLESS_FILENAME_FORMAT.format(
+                title=pathvalidate.sanitize_filename(
+                    doc.title, replacement_text="-"),
+                correspondent=correspondent,
+                document_type=document_type,
+                created=datetime.date.isoformat(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=f"{doc.created.month:02}" if doc.created else "none",  # NOQA: E501
+                created_day=f"{doc.created.day:02}" if doc.created else "none",
+                added=datetime.date.isoformat(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=f"{doc.added.month:02}" if doc.added else "none",
+                added_day=f"{doc.added.day:02}" if doc.added else "none",
+                tags=tags,
+                tag_list=tag_list
+            ).strip()
+
+            path = path.strip(os.sep)
+
+    except (ValueError, KeyError, IndexError):
+        logger.warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
+
+    counter_str = f"_{counter:02}" if counter else ""
+
+    filetype_str = ".pdf" if archive_filename else doc.file_type
+
+    if len(path) > 0:
+        filename = f"{path}{counter_str}{filetype_str}"
+    else:
+        filename = f"{doc.pk:07}{counter_str}{filetype_str}"
+
+    # Append .gpg for encrypted files
+    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
+        filename += ".gpg"
+
+    return filename
+
+
 def move_old_to_new_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")
 
@@ -74,18 +160,12 @@
     # check for documents that have incorrect archive versions
     for doc in Document.objects.filter(archive_checksum__isnull=False):
         old_path = archive_path_old(doc)
-        new_path = archive_path_new(doc)
 
         if not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document of {doc.filename} does not exist at: "
                f"{old_path}")
 
-        if old_path != new_path and os.path.isfile(new_path):
-            raise ValueError(
-                f"Need to move {old_path} to {new_path}, but target file "
-                f"already exists")
-
         if old_path in old_archive_path_to_id:
             affected_document_ids.add(doc.id)
             affected_document_ids.add(old_archive_path_to_id[old_path])
@@ -103,22 +183,19 @@
                 f"document {doc.filename} has an invalid archived document, "
                 f"but no parsers are available. Cannot migrate.")
Cannot migrate.") - # move files for doc in Document.objects.filter(archive_checksum__isnull=False): - old_path = archive_path_old(doc) - new_path = archive_path_new(doc) if doc.id in affected_document_ids: + old_path = archive_path_old(doc) # remove affected archive versions if os.path.isfile(old_path): os.unlink(old_path) else: - # move unaffected archive versions - if old_path != new_path and os.path.isfile(old_path) and not os.path.isfile(new_path): - logger.debug( - f"Moving {old_path} to {new_path}" - ) - shutil.move(old_path, new_path) + # Set archive path for unaffected files + doc.archive_filename = archive_path_old(doc) + Document.objects.filter(id=doc.id).update( + archive_filename=doc.archive_filename + ) # regenerate archive documents for doc_id in affected_document_ids: @@ -135,14 +212,16 @@ def move_old_to_new_locations(apps, schema_editor): try: parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename)) doc.content = parser.get_text() - if parser.archive_path and os.path.isfile(parser.archive_path): - with open(parser.archive_path, "rb") as f: + + if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()): + doc.archive_filename = generate_unique_filename( + doc, archive_filename=True) + with open(parser.get_archive_path(), "rb") as f: doc.archive_checksum = hashlib.md5(f.read()).hexdigest() - shutil.copy2(parser.archive_path, archive_path_new(doc)) + os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True) + shutil.copy2(parser.get_archive_path(), archive_path_new(doc)) else: doc.archive_checksum = None - if os.path.isfile(archive_path_new(doc)): - os.unlink(archive_path_new(doc)) doc.save() except ParseError: logger.exception( @@ -187,8 +266,18 @@ class Migration(migrations.Migration): ] operations = [ + migrations.AddField( + model_name='document', + name='archive_filename', + field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'), + ), + migrations.AlterField( + model_name='document', + name='filename', + field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'), + ), migrations.RunPython( move_old_to_new_locations, move_new_to_old_locations - ) + ), ] diff --git a/src/documents/models.py b/src/documents/models.py index 86878dd7e..47433724a 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -16,7 +16,6 @@ from django.utils.timezone import is_aware from django.utils.translation import gettext_lazy as _ -from documents.file_handling import archive_name_from_filename from documents.parsers import get_default_file_extension @@ -208,10 +207,21 @@ class Document(models.Model): max_length=1024, editable=False, default=None, + unique=True, null=True, help_text=_("Current filename in storage") ) + archive_filename = models.FilePathField( + _("archive filename"), + max_length=1024, + editable=False, + default=None, + unique=True, + null=True, + help_text=_("Current archive filename in storage") + ) + archive_serial_number = models.IntegerField( _("archive serial number"), blank=True, @@ -256,16 +266,19 @@ class Document(models.Model): return open(self.source_path, "rb") @property - def archive_path(self): - if self.filename: - fname = archive_name_from_filename(self.filename) - else: - fname = "{:07}.pdf".format(self.pk) + def has_archive_version(self): + return self.archive_filename is not 
+
+    @property
+    def archive_path(self):
+        if self.has_archive_version:
+            return os.path.join(
+                settings.ARCHIVE_DIR,
+                str(self.archive_filename)
+            )
+        else:
+            return None
 
-        return os.path.join(
-            settings.ARCHIVE_DIR,
-            fname
-        )
 
     @property
     def archive_file(self):
diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py
index b8fd73f98..0b385e81b 100644
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@@ -88,7 +88,7 @@ def check_sanity():
             ))
 
         # Check sanity of the archive file.
-        if doc.archive_checksum:
+        if doc.has_archive_version:
            if not os.path.isfile(doc.archive_path):
                messages.append(SanityError(
                    f"Archived version of document {doc.pk} does not exist."
diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py
index 848bcb900..5d46dc431 100755
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -14,7 +14,7 @@ from filelock import FileLock
 
 from .. import index, matching
 from ..file_handling import delete_empty_directories, \
-    create_source_path_directory, archive_name_from_filename, \
+    create_source_path_directory, \
     generate_unique_filename
 from ..models import Document, Tag
 
@@ -148,18 +148,18 @@ def set_tags(sender,
 @receiver(models.signals.post_delete, sender=Document)
 def cleanup_document_deletion(sender, instance, using, **kwargs):
     with FileLock(settings.MEDIA_LOCK):
-        for f in (instance.source_path,
-                  instance.archive_path,
-                  instance.thumbnail_path):
-            if os.path.isfile(f):
+        for filename in (instance.source_path,
+                         instance.archive_path,
+                         instance.thumbnail_path):
+            if filename and os.path.isfile(filename):
                 try:
-                    os.unlink(f)
+                    os.unlink(filename)
                     logger.debug(
-                        f"Deleted file {f}.")
+                        f"Deleted file {filename}.")
                 except OSError as e:
                     logger.warning(
                         f"While deleting document {str(instance)}, the file "
-                        f"{f} could not be deleted: {e}"
+                        f"{filename} could not be deleted: {e}"
                     )
 
         delete_empty_directories(
@@ -167,10 +167,11 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
             root=settings.ORIGINALS_DIR
         )
 
-        delete_empty_directories(
-            os.path.dirname(instance.archive_path),
-            root=settings.ARCHIVE_DIR
-        )
+        if instance.has_archive_version:
+            delete_empty_directories(
+                os.path.dirname(instance.archive_path),
+                root=settings.ARCHIVE_DIR
+            )
 
 
 def validate_move(instance, old_path, new_path):
@@ -207,8 +208,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
     with FileLock(settings.MEDIA_LOCK):
         old_filename = instance.filename
-        new_filename = generate_unique_filename(
-            instance, settings.ORIGINALS_DIR)
+        new_filename = generate_unique_filename(instance)
 
         if new_filename == instance.filename:
             # Don't do anything if its the same.
 
@@ -222,8 +222,11 @@
 
         # archive files are optional, archive checksum tells us if we have one,
        # since this is None for documents without archived files.
-        if instance.archive_checksum:
-            new_archive_filename = archive_name_from_filename(new_filename)
+        if instance.has_archive_version:
+            old_archive_filename = instance.archive_filename
+            new_archive_filename = generate_unique_filename(
+                instance, archive_filename=True
+            )
             old_archive_path = instance.archive_path
             new_archive_path = os.path.join(settings.ARCHIVE_DIR,
                                             new_archive_filename)
@@ -233,6 +236,8 @@
 
             create_source_path_directory(new_archive_path)
         else:
+            old_archive_filename = None
+            new_archive_filename = None
             old_archive_path = None
             new_archive_path = None
 
@@ -240,22 +245,28 @@
 
         try:
             os.rename(old_source_path, new_source_path)
-            if instance.archive_checksum:
-                os.rename(old_archive_path, new_archive_path)
             instance.filename = new_filename
+
+            if instance.has_archive_version:
+                os.rename(old_archive_path, new_archive_path)
+                instance.archive_filename = new_archive_filename
+
             # Don't save() here to prevent infinite recursion.
             Document.objects.filter(pk=instance.pk).update(
-                filename=new_filename)
+                filename=instance.filename,
+                archive_filename=instance.archive_filename,
+            )
 
         except OSError as e:
             instance.filename = old_filename
+            instance.archive_filename = old_archive_filename
             # this happens when we can't move a file. If that's the case for
             # the archive file, we try our best to revert the changes.
             # no need to save the instance, the update() has not happened yet.
             try:
                 os.rename(new_source_path, old_source_path)
-                os.rename(new_archive_path, old_archive_path)
+                if instance.has_archive_version:
+                    os.rename(new_archive_path, old_archive_path)
             except Exception as e:
                 # This is fine, since:
                 # A: if we managed to move source from A to B, we will also
@@ -271,9 +282,10 @@
                 # since moving them once succeeded, it's very likely going to
                 # succeed again.
                 os.rename(new_source_path, old_source_path)
-                if instance.archive_checksum:
+                if instance.has_archive_version:
                     os.rename(new_archive_path, old_archive_path)
                 instance.filename = old_filename
+                instance.archive_filename = old_archive_filename
                 # again, no need to save the instance, since the actual update()
                 # operation failed.
@@ -283,7 +295,7 @@
             delete_empty_directories(os.path.dirname(old_source_path),
                                      root=settings.ORIGINALS_DIR)
 
-        if old_archive_path and not os.path.isfile(old_archive_path):
+        if instance.has_archive_version and not os.path.isfile(old_archive_path):  # NOQA: E501
             delete_empty_directories(os.path.dirname(old_archive_path),
                                      root=settings.ARCHIVE_DIR)
 
diff --git a/src/documents/views.py b/src/documents/views.py
index 5ab4ca9df..3a840567d 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -192,7 +192,7 @@ class DocumentViewSet(RetrieveModelMixin,
     def file_response(self, pk, request, disposition):
         doc = Document.objects.get(id=pk)
 
-        if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501
+        if not self.original_requested(request) and doc.has_archive_version:  # NOQA: E501
             file_handle = doc.archive_file
             filename = doc.get_public_filename(archive=True)
             mime_type = 'application/pdf'
@@ -237,18 +237,18 @@ class DocumentViewSet(RetrieveModelMixin,
             "original_size": os.stat(doc.source_path).st_size,
             "original_mime_type": doc.mime_type,
             "media_filename": doc.filename,
-            "has_archive_version": os.path.isfile(doc.archive_path),
+            "has_archive_version": doc.has_archive_version,
             "original_metadata": self.get_metadata(
-                doc.source_path, doc.mime_type)
+                doc.source_path, doc.mime_type),
+            "archive_checksum": doc.archive_checksum,
+            "archive_media_filename": doc.archive_filename
         }
 
-        if doc.archive_checksum and os.path.isfile(doc.archive_path):
-            meta['archive_checksum'] = doc.archive_checksum
-            meta['archive_size'] = os.stat(doc.archive_path).st_size,
+        if doc.has_archive_version:
+            meta['archive_size'] = os.stat(doc.archive_path).st_size
             meta['archive_metadata'] = self.get_metadata(
                 doc.archive_path, "application/pdf")
         else:
-            meta['archive_checksum'] = None
             meta['archive_size'] = None
             meta['archive_metadata'] = None
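
Note on the naming scheme: the collision handling that this patch unifies can be illustrated standalone. The sketch below is a minimal approximation, not the real code — plain dicts and module constants stand in for the Django Document model and settings, and generate_filename is reduced to title plus extension. It mirrors how the patched generate_unique_filename serves both namespaces, counting up _01, _02, ... until the candidate either matches the name the document already owns or is free on disk.

    import os

    # Hypothetical stand-ins for settings.ORIGINALS_DIR / settings.ARCHIVE_DIR
    ORIGINALS_DIR = "/tmp/paperless/originals"
    ARCHIVE_DIR = "/tmp/paperless/archive"


    def generate_filename(doc, counter=0, archive_filename=False):
        # Archive versions are always PDFs; originals keep their own extension.
        filetype_str = ".pdf" if archive_filename else doc["file_type"]
        counter_str = f"_{counter:02}" if counter else ""
        return f"{doc['title']}{counter_str}{filetype_str}"


    def generate_unique_filename(doc, archive_filename=False):
        # One routine for both namespaces: only the stored name and the
        # root directory differ.
        if archive_filename:
            old_filename = doc["archive_filename"]
            root = ARCHIVE_DIR
        else:
            old_filename = doc["filename"]
            root = ORIGINALS_DIR

        counter = 0
        while True:
            new_filename = generate_filename(
                doc, counter, archive_filename=archive_filename)
            if new_filename == old_filename:
                # Unchanged; keep the name the document already has.
                return new_filename
            if os.path.exists(os.path.join(root, new_filename)):
                counter += 1  # collision with another file; try next suffix
            else:
                return new_filename


    doc = {"title": "invoice", "file_type": ".odt",
           "filename": None, "archive_filename": None}
    doc["filename"] = generate_unique_filename(doc)
    doc["archive_filename"] = generate_unique_filename(doc, archive_filename=True)
    print(doc["filename"], doc["archive_filename"])  # invoice.odt invoice.pdf

Because the archive filename now lives in its own unique database column instead of being derived from filename at runtime, two originals with the same stem but different extensions (scan.odt, scan.pdf) can no longer silently map to the same archived scan.pdf; the collision loop hands the second document scan_01.pdf instead.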