From fca8576d80cc99312b15b4d512cd528e618039d5 Mon Sep 17 00:00:00 2001
From: jonaswinkler
Date: Tue, 9 Feb 2021 19:46:19 +0100
Subject: [PATCH] archive filenames are now stored in the database and checked
 for collisions, just like original filenames; unified method for archive
 version checking

---
 src/documents/consumer.py                     |   7 +-
 src/documents/file_handling.py                |  31 +--
 .../management/commands/document_archiver.py  |  14 +-
 .../management/commands/document_exporter.py  |   2 +-
 .../migrations/1012_fix_archive_files.py      | 177 +++++++++++++-----
 src/documents/models.py                       |  33 +++-
 src/documents/sanity_checker.py               |   2 +-
 src/documents/signals/handlers.py             |  56 +++---
 src/documents/views.py                        |  12 +-
 9 files changed, 229 insertions(+), 105 deletions(-)

diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 3baedb5c3..acb3ad33f 100755
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -292,8 +292,7 @@ class Consumer(LoggingMixin):
                 # After everything is in the database, copy the files into
                 # place. If this fails, we'll also rollback the transaction.
                 with FileLock(settings.MEDIA_LOCK):
-                    document.filename = generate_unique_filename(
-                        document, settings.ORIGINALS_DIR)
+                    document.filename = generate_unique_filename(document)
                     create_source_path_directory(document.source_path)
 
                     self._write(document.storage_type,
@@ -303,6 +302,10 @@ class Consumer(LoggingMixin):
                                 thumbnail, document.thumbnail_path)
 
                     if archive_path and os.path.isfile(archive_path):
+                        document.archive_filename = generate_unique_filename(
+                            document,
+                            archive_filename=True
+                        )
                         create_source_path_directory(document.archive_path)
                         self._write(document.storage_type,
                                     archive_path, document.archive_path)
diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py
index 64858de78..535aa3d2c 100644
--- a/src/documents/file_handling.py
+++ b/src/documents/file_handling.py
@@ -79,12 +79,20 @@ def many_to_dictionary(field):
     return mydictionary
 
 
-def generate_unique_filename(doc, root):
+def generate_unique_filename(doc, archive_filename=False):
+    if archive_filename:
+        old_filename = doc.archive_filename
+        root = settings.ARCHIVE_DIR
+    else:
+        old_filename = doc.filename
+        root = settings.ORIGINALS_DIR
+
     counter = 0
 
     while True:
-        new_filename = generate_filename(doc, counter)
-        if new_filename == doc.filename:
+        new_filename = generate_filename(
+            doc, counter, archive_filename=archive_filename)
+        if new_filename == old_filename:
             # still the same as before.
             return new_filename

@@ -94,7 +102,7 @@
     return new_filename
 
 
-def generate_filename(doc, counter=0, append_gpg=True):
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
     path = ""
 
     try:
@@ -148,21 +156,16 @@
         f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
 
     counter_str = f"_{counter:02}" if counter else ""
+
+    filetype_str = ".pdf" if archive_filename else doc.file_type
+
     if len(path) > 0:
-        filename = f"{path}{counter_str}{doc.file_type}"
+        filename = f"{path}{counter_str}{filetype_str}"
     else:
-        filename = f"{doc.pk:07}{counter_str}{doc.file_type}"
+        filename = f"{doc.pk:07}{counter_str}{filetype_str}"
 
     # Append .gpg for encrypted files
     if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
         filename += ".gpg"
 
     return filename
-
-
-def archive_name_from_filename(filename):
-    name, ext = os.path.splitext(filename)
-    if ext == ".pdf":
-        return filename
-    else:
-        return filename + ".pdf"
diff --git a/src/documents/management/commands/document_archiver.py b/src/documents/management/commands/document_archiver.py
index d2ff9c8c2..fe8c8b530 100644
--- a/src/documents/management/commands/document_archiver.py
+++ b/src/documents/management/commands/document_archiver.py
@@ -16,7 +16,8 @@ from whoosh.writing import AsyncWriter
 
 from documents.models import Document
 from ... import index
-from ...file_handling import create_source_path_directory
+from ...file_handling import create_source_path_directory, \
+    generate_unique_filename
 from ...parsers import get_parser_class_for_mime_type
 
 
@@ -39,13 +40,16 @@ def handle_document(document_id):
         with transaction.atomic():
             with open(parser.get_archive_path(), 'rb') as f:
                 checksum = hashlib.md5(f.read()).hexdigest()
-            # i'm going to save first so that in case the file move
+            # I'm going to save first so that in case the file move
             # fails, the database is rolled back.
-            # we also don't use save() since that triggers the filehandling
+            # We also don't use save() since that triggers the filehandling
             # logic, and we don't want that yet (file not yet in place)
+            document.archive_filename = generate_unique_filename(
+                document, archive_filename=True)
             Document.objects.filter(pk=document.pk).update(
                 archive_checksum=checksum,
-                content=parser.get_text()
+                content=parser.get_text(),
+                archive_filename=document.archive_filename
             )
             with FileLock(settings.MEDIA_LOCK):
                 create_source_path_directory(document.archive_path)
@@ -101,7 +105,7 @@ class Command(BaseCommand):
         document_ids = list(map(
             lambda doc: doc.id,
             filter(
-                lambda d: overwrite or not d.archive_checksum,
+                lambda d: overwrite or not d.has_archive_version,
                 documents
             )
         ))
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index 1505b0856..d8f5861a5 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -139,7 +139,7 @@ class Command(BaseCommand):
             thumbnail_target = os.path.join(self.target, thumbnail_name)
             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
 
-            if os.path.exists(document.archive_path):
+            if document.has_archive_version:
                 archive_name = base_name + "-archive.pdf"
                 archive_target = os.path.join(self.target, archive_name)
                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
diff --git a/src/documents/migrations/1012_fix_archive_files.py b/src/documents/migrations/1012_fix_archive_files.py
index f75ae431d..5db503391 100644
--- a/src/documents/migrations/1012_fix_archive_files.py
+++ b/src/documents/migrations/1012_fix_archive_files.py
@@ -1,43 +1,27 @@
 # Generated by Django 3.1.6 on 2021-02-07 22:26
+import datetime
 import hashlib
 import logging
 import os
 import shutil
 
+import pathvalidate
 from django.conf import settings
-from django.db import migrations
+from django.db import migrations, models
+from django.template.defaultfilters import slugify
+from documents.file_handling import defaultdictNoStr, many_to_dictionary
 
 logger = logging.getLogger("paperless.migrations")
 
 
-def archive_name_from_filename_old(filename):
+def archive_name_from_filename(filename):
     return os.path.splitext(filename)[0] + ".pdf"
 
 
 def archive_path_old(doc):
     if doc.filename:
-        fname = archive_name_from_filename_old(doc.filename)
-    else:
-        fname = "{:07}.pdf".format(doc.pk)
-
-    return os.path.join(
-        settings.ARCHIVE_DIR,
-        fname
-    )
-
-
-def archive_name_from_filename_new(filename):
-    name, ext = os.path.splitext(filename)
-    if ext == ".pdf":
-        return filename
-    else:
-        return filename + ".pdf"
-
-
-def archive_path_new(doc):
-    if doc.filename:
-        fname = archive_name_from_filename_new(doc.filename)
+        fname = archive_name_from_filename(doc.filename)
     else:
         fname = "{:07}.pdf".format(doc.pk)
 
@@ -50,6 +34,16 @@
 STORAGE_TYPE_GPG = "gpg"
 
 
+def archive_path_new(doc):
+    if doc.archive_filename is not None:
+        return os.path.join(
+            settings.ARCHIVE_DIR,
+            str(doc.archive_filename)
+        )
+    else:
+        return None
+
+
 def source_path(doc):
     if doc.filename:
         fname = str(doc.filename)
@@ -64,6 +58,98 @@ def source_path(doc):
     )
 
 
+def generate_unique_filename(doc, archive_filename=False):
+    if archive_filename:
+        old_filename = doc.archive_filename
+        root = settings.ARCHIVE_DIR
+    else:
+        old_filename = doc.filename
+        root = settings.ORIGINALS_DIR
+
+    counter = 0
+
+    while True:
+        new_filename = generate_filename(
+            doc, counter, archive_filename=archive_filename)
+        if new_filename == old_filename:
+            # still the same as before.
+            return new_filename
+
+        if os.path.exists(os.path.join(root, new_filename)):
+            counter += 1
+        else:
+            return new_filename
+
+
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
+    path = ""
+
+    try:
+        if settings.PAPERLESS_FILENAME_FORMAT is not None:
+            tags = defaultdictNoStr(lambda: slugify(None),
+                                    many_to_dictionary(doc.tags))
+
+            tag_list = pathvalidate.sanitize_filename(
+                ",".join(sorted(
+                    [tag.name for tag in doc.tags.all()]
+                )),
+                replacement_text="-"
+            )
+
+            if doc.correspondent:
+                correspondent = pathvalidate.sanitize_filename(
+                    doc.correspondent.name, replacement_text="-"
+                )
+            else:
+                correspondent = "none"
+
+            if doc.document_type:
+                document_type = pathvalidate.sanitize_filename(
+                    doc.document_type.name, replacement_text="-"
+                )
+            else:
+                document_type = "none"
+
+            path = settings.PAPERLESS_FILENAME_FORMAT.format(
+                title=pathvalidate.sanitize_filename(
+                    doc.title, replacement_text="-"),
+                correspondent=correspondent,
+                document_type=document_type,
+                created=datetime.date.isoformat(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=f"{doc.created.month:02}" if doc.created else "none",  # NOQA: E501
+                created_day=f"{doc.created.day:02}" if doc.created else "none",
+                added=datetime.date.isoformat(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=f"{doc.added.month:02}" if doc.added else "none",
+                added_day=f"{doc.added.day:02}" if doc.added else "none",
+                tags=tags,
+                tag_list=tag_list
+            ).strip()
+
+            path = path.strip(os.sep)
+
+    except (ValueError, KeyError, IndexError):
+        logger.warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
+
+    counter_str = f"_{counter:02}" if counter else ""
+
+    filetype_str = ".pdf" if archive_filename else doc.file_type
+
+    if len(path) > 0:
+        filename = f"{path}{counter_str}{filetype_str}"
+    else:
+        filename = f"{doc.pk:07}{counter_str}{filetype_str}"
+
+    # Append .gpg for encrypted files
+    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
+        filename += ".gpg"
+
+    return filename
+
+
 def move_old_to_new_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")
 
@@ -74,18 +160,12 @@
     # check for documents that have incorrect archive versions
     for doc in Document.objects.filter(archive_checksum__isnull=False):
         old_path = archive_path_old(doc)
-        new_path = archive_path_new(doc)
 
         if not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document of {doc.filename} does not exist at: "
                f"{old_path}")
 
-        if old_path != new_path and os.path.isfile(new_path):
-            raise ValueError(
-                f"Need to move {old_path} to {new_path}, but target file "
-                f"already exists")
-
         if old_path in old_archive_path_to_id:
             affected_document_ids.add(doc.id)
             affected_document_ids.add(old_archive_path_to_id[old_path])
@@ -103,22 +183,19 @@
                 f"document {doc.filename} has an invalid archived document, "
                 f"but no parsers are available. Cannot migrate.")
Cannot migrate.") - # move files for doc in Document.objects.filter(archive_checksum__isnull=False): - old_path = archive_path_old(doc) - new_path = archive_path_new(doc) if doc.id in affected_document_ids: + old_path = archive_path_old(doc) # remove affected archive versions if os.path.isfile(old_path): os.unlink(old_path) else: - # move unaffected archive versions - if old_path != new_path and os.path.isfile(old_path) and not os.path.isfile(new_path): - logger.debug( - f"Moving {old_path} to {new_path}" - ) - shutil.move(old_path, new_path) + # Set archive path for unaffected files + doc.archive_filename = archive_path_old(doc) + Document.objects.filter(id=doc.id).update( + archive_filename=doc.archive_filename + ) # regenerate archive documents for doc_id in affected_document_ids: @@ -135,14 +212,16 @@ def move_old_to_new_locations(apps, schema_editor): try: parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename)) doc.content = parser.get_text() - if parser.archive_path and os.path.isfile(parser.archive_path): - with open(parser.archive_path, "rb") as f: + + if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()): + doc.archive_filename = generate_unique_filename( + doc, archive_filename=True) + with open(parser.get_archive_path(), "rb") as f: doc.archive_checksum = hashlib.md5(f.read()).hexdigest() - shutil.copy2(parser.archive_path, archive_path_new(doc)) + os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True) + shutil.copy2(parser.get_archive_path(), archive_path_new(doc)) else: doc.archive_checksum = None - if os.path.isfile(archive_path_new(doc)): - os.unlink(archive_path_new(doc)) doc.save() except ParseError: logger.exception( @@ -187,8 +266,18 @@ class Migration(migrations.Migration): ] operations = [ + migrations.AddField( + model_name='document', + name='archive_filename', + field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'), + ), + migrations.AlterField( + model_name='document', + name='filename', + field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'), + ), migrations.RunPython( move_old_to_new_locations, move_new_to_old_locations - ) + ), ] diff --git a/src/documents/models.py b/src/documents/models.py index 86878dd7e..47433724a 100755 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -16,7 +16,6 @@ from django.utils.timezone import is_aware from django.utils.translation import gettext_lazy as _ -from documents.file_handling import archive_name_from_filename from documents.parsers import get_default_file_extension @@ -208,10 +207,21 @@ class Document(models.Model): max_length=1024, editable=False, default=None, + unique=True, null=True, help_text=_("Current filename in storage") ) + archive_filename = models.FilePathField( + _("archive filename"), + max_length=1024, + editable=False, + default=None, + unique=True, + null=True, + help_text=_("Current archive filename in storage") + ) + archive_serial_number = models.IntegerField( _("archive serial number"), blank=True, @@ -256,16 +266,19 @@ class Document(models.Model): return open(self.source_path, "rb") @property - def archive_path(self): - if self.filename: - fname = archive_name_from_filename(self.filename) - else: - fname = "{:07}.pdf".format(self.pk) + def has_archive_version(self): + return self.archive_filename is not 
+
+    @property
+    def archive_path(self):
+        if self.has_archive_version:
+            return os.path.join(
+                settings.ARCHIVE_DIR,
+                str(self.archive_filename)
+            )
+        else:
+            return None
 
-        return os.path.join(
-            settings.ARCHIVE_DIR,
-            fname
-        )
 
     @property
     def archive_file(self):
diff --git a/src/documents/sanity_checker.py b/src/documents/sanity_checker.py
index b8fd73f98..0b385e81b 100644
--- a/src/documents/sanity_checker.py
+++ b/src/documents/sanity_checker.py
@@ -88,7 +88,7 @@ def check_sanity():
             ))
 
         # Check sanity of the archive file.
-        if doc.archive_checksum:
+        if doc.has_archive_version:
            if not os.path.isfile(doc.archive_path):
                messages.append(SanityError(
                    f"Archived version of document {doc.pk} does not exist."
diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py
index 848bcb900..5d46dc431 100755
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -14,7 +14,7 @@ from filelock import FileLock
 
 from .. import index, matching
 from ..file_handling import delete_empty_directories, \
-    create_source_path_directory, archive_name_from_filename, \
+    create_source_path_directory, \
     generate_unique_filename
 from ..models import Document, Tag
 
@@ -148,18 +148,18 @@ def set_tags(sender,
 @receiver(models.signals.post_delete, sender=Document)
 def cleanup_document_deletion(sender, instance, using, **kwargs):
     with FileLock(settings.MEDIA_LOCK):
-        for f in (instance.source_path,
-                  instance.archive_path,
-                  instance.thumbnail_path):
-            if os.path.isfile(f):
+        for filename in (instance.source_path,
+                         instance.archive_path,
+                         instance.thumbnail_path):
+            if filename and os.path.isfile(filename):
                 try:
-                    os.unlink(f)
+                    os.unlink(filename)
                     logger.debug(
-                        f"Deleted file {f}.")
+                        f"Deleted file {filename}.")
                 except OSError as e:
                     logger.warning(
                         f"While deleting document {str(instance)}, the file "
-                        f"{f} could not be deleted: {e}"
+                        f"{filename} could not be deleted: {e}"
                     )
 
         delete_empty_directories(
@@ -167,10 +167,11 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
             root=settings.ORIGINALS_DIR
         )
 
-        delete_empty_directories(
-            os.path.dirname(instance.archive_path),
-            root=settings.ARCHIVE_DIR
-        )
+        if instance.has_archive_version:
+            delete_empty_directories(
+                os.path.dirname(instance.archive_path),
+                root=settings.ARCHIVE_DIR
+            )
 
 
 def validate_move(instance, old_path, new_path):
@@ -207,8 +208,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
     with FileLock(settings.MEDIA_LOCK):
         old_filename = instance.filename
-        new_filename = generate_unique_filename(
-            instance, settings.ORIGINALS_DIR)
+        new_filename = generate_unique_filename(instance)
 
         if new_filename == instance.filename:
             # Don't do anything if its the same.
 
@@ -222,8 +222,11 @@
 
         # archive files are optional, archive checksum tells us if we have one,
        # since this is None for documents without archived files.
-        if instance.archive_checksum:
-            new_archive_filename = archive_name_from_filename(new_filename)
+        if instance.has_archive_version:
+            old_archive_filename = instance.archive_filename
+            new_archive_filename = generate_unique_filename(
+                instance, archive_filename=True
+            )
             old_archive_path = instance.archive_path
             new_archive_path = os.path.join(settings.ARCHIVE_DIR,
                                             new_archive_filename)
@@ -233,6 +236,8 @@
 
             create_source_path_directory(new_archive_path)
         else:
+            old_archive_filename = None
+            new_archive_filename = None
             old_archive_path = None
             new_archive_path = None
 
@@ -240,22 +245,28 @@
 
         try:
             os.rename(old_source_path, new_source_path)
-            if instance.archive_checksum:
-                os.rename(old_archive_path, new_archive_path)
             instance.filename = new_filename
+
+            if instance.has_archive_version:
+                os.rename(old_archive_path, new_archive_path)
+                instance.archive_filename = new_archive_filename
+
             # Don't save() here to prevent infinite recursion.
             Document.objects.filter(pk=instance.pk).update(
-                filename=new_filename)
+                filename=instance.filename,
+                archive_filename=instance.archive_filename,
+            )
 
         except OSError as e:
             instance.filename = old_filename
+            instance.archive_filename = old_archive_filename
             # this happens when we can't move a file. If that's the case for
             # the archive file, we try our best to revert the changes.
             # no need to save the instance, the update() has not happened yet.
             try:
                 os.rename(new_source_path, old_source_path)
-                os.rename(new_archive_path, old_archive_path)
+                if instance.has_archive_version:
+                    os.rename(new_archive_path, old_archive_path)
             except Exception as e:
                 # This is fine, since:
                 # A: if we managed to move source from A to B, we will also
@@ -271,9 +282,10 @@
                 # since moving them once succeeded, it's very likely going to
                 # succeed again.
                 os.rename(new_source_path, old_source_path)
-                if instance.archive_checksum:
+                if instance.has_archive_version:
                     os.rename(new_archive_path, old_archive_path)
                 instance.filename = old_filename
+                instance.archive_filename = old_archive_filename
                 # again, no need to save the instance, since the actual update()
                 # operation failed.
@@ -283,7 +295,7 @@
             delete_empty_directories(os.path.dirname(old_source_path),
                                      root=settings.ORIGINALS_DIR)
 
-        if old_archive_path and not os.path.isfile(old_archive_path):
+        if instance.has_archive_version and not os.path.isfile(old_archive_path):  # NOQA: E501
             delete_empty_directories(os.path.dirname(old_archive_path),
                                      root=settings.ARCHIVE_DIR)
 
diff --git a/src/documents/views.py b/src/documents/views.py
index 5ab4ca9df..3a840567d 100755
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -192,7 +192,7 @@ class DocumentViewSet(RetrieveModelMixin,
     def file_response(self, pk, request, disposition):
         doc = Document.objects.get(id=pk)
 
-        if not self.original_requested(request) and os.path.isfile(doc.archive_path):  # NOQA: E501
+        if not self.original_requested(request) and doc.has_archive_version:  # NOQA: E501
             file_handle = doc.archive_file
             filename = doc.get_public_filename(archive=True)
             mime_type = 'application/pdf'
@@ -237,18 +237,18 @@ class DocumentViewSet(RetrieveModelMixin,
             "original_size": os.stat(doc.source_path).st_size,
             "original_mime_type": doc.mime_type,
             "media_filename": doc.filename,
-            "has_archive_version": os.path.isfile(doc.archive_path),
+            "has_archive_version": doc.has_archive_version,
             "original_metadata": self.get_metadata(
-                doc.source_path, doc.mime_type)
+                doc.source_path, doc.mime_type),
+            "archive_checksum": doc.archive_checksum,
+            "archive_media_filename": doc.archive_filename
         }
 
-        if doc.archive_checksum and os.path.isfile(doc.archive_path):
-            meta['archive_checksum'] = doc.archive_checksum
-            meta['archive_size'] = os.stat(doc.archive_path).st_size,
+        if doc.has_archive_version:
+            meta['archive_size'] = os.stat(doc.archive_path).st_size
             meta['archive_metadata'] = self.get_metadata(
                 doc.archive_path, "application/pdf")
         else:
-            meta['archive_checksum'] = None
             meta['archive_size'] = None
             meta['archive_metadata'] = None
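
Note on the naming scheme: the collision handling that this patch unifies can be illustrated standalone. The sketch below is a minimal approximation, not the real code — plain dicts and module constants stand in for the Django Document model and settings, and generate_filename is reduced to title plus extension. It mirrors how the patched generate_unique_filename serves both namespaces, counting up _01, _02, ... until the candidate either matches the name the document already owns or is free on disk.

    import os

    # Hypothetical stand-ins for settings.ORIGINALS_DIR / settings.ARCHIVE_DIR
    ORIGINALS_DIR = "/tmp/paperless/originals"
    ARCHIVE_DIR = "/tmp/paperless/archive"


    def generate_filename(doc, counter=0, archive_filename=False):
        # Archive versions are always PDFs; originals keep their own extension.
        filetype_str = ".pdf" if archive_filename else doc["file_type"]
        counter_str = f"_{counter:02}" if counter else ""
        return f"{doc['title']}{counter_str}{filetype_str}"


    def generate_unique_filename(doc, archive_filename=False):
        # One routine for both namespaces: only the stored name and the
        # root directory differ.
        if archive_filename:
            old_filename = doc["archive_filename"]
            root = ARCHIVE_DIR
        else:
            old_filename = doc["filename"]
            root = ORIGINALS_DIR

        counter = 0
        while True:
            new_filename = generate_filename(
                doc, counter, archive_filename=archive_filename)
            if new_filename == old_filename:
                # Unchanged; keep the name the document already has.
                return new_filename
            if os.path.exists(os.path.join(root, new_filename)):
                counter += 1  # collision with another file; try next suffix
            else:
                return new_filename


    doc = {"title": "invoice", "file_type": ".odt",
           "filename": None, "archive_filename": None}
    doc["filename"] = generate_unique_filename(doc)
    doc["archive_filename"] = generate_unique_filename(doc, archive_filename=True)
    print(doc["filename"], doc["archive_filename"])  # invoice.odt invoice.pdf

Because the archive filename now lives in its own unique database column instead of being derived from filename at runtime, two originals with the same stem but different extensions (scan.odt, scan.pdf) can no longer silently map to the same archived scan.pdf; the collision loop hands the second document scan_01.pdf instead.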