archive filenames are now stored in the database and checked for collisions just like original filenames; added a unified method for archive version checking

jonaswinkler 2021-02-09 19:46:19 +01:00
parent 05f59e7d5e
commit fca8576d80
9 changed files with 229 additions and 105 deletions

View File

@@ -292,8 +292,7 @@ class Consumer(LoggingMixin):
# After everything is in the database, copy the files into
# place. If this fails, we'll also rollback the transaction.
with FileLock(settings.MEDIA_LOCK):
document.filename = generate_unique_filename(
document, settings.ORIGINALS_DIR)
document.filename = generate_unique_filename(document)
create_source_path_directory(document.source_path)
self._write(document.storage_type,
@@ -303,6 +302,10 @@ class Consumer(LoggingMixin):
thumbnail, document.thumbnail_path)
if archive_path and os.path.isfile(archive_path):
document.archive_filename = generate_unique_filename(
document,
archive_filename=True
)
create_source_path_directory(document.archive_path)
self._write(document.storage_type,
archive_path, document.archive_path)
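Note on the hunk above: the consumer now reserves a unique archive filename inside the same transaction and media lock as the original filename, before any file is copied into place. A minimal sketch of that ordering follows; it is not the project's code — generate_unique_filename and write_file are placeholder helpers, and a configured Django project plus the filelock package are assumed.

# Sketch only; placeholder helpers, not the actual consumer.
from django.db import transaction
from filelock import FileLock


def store_document(document, original_path, archive_path, media_lock,
                   generate_unique_filename, write_file):
    with transaction.atomic():
        document.save()  # database rows exist before any file is touched
        with FileLock(media_lock):
            # Reserve collision-free names for both variants, then copy the
            # files into place.  Any failure here propagates and rolls the
            # whole transaction back.
            document.filename = generate_unique_filename(document)
            write_file(original_path, document.source_path)
            if archive_path:
                document.archive_filename = generate_unique_filename(
                    document, archive_filename=True)
                write_file(archive_path, document.archive_path)
        document.save()  # persist the filenames chosen above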

View File

@@ -79,12 +79,20 @@ def many_to_dictionary(field):
return mydictionary
def generate_unique_filename(doc, root):
def generate_unique_filename(doc, archive_filename=False):
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
counter = 0
while True:
new_filename = generate_filename(doc, counter)
if new_filename == doc.filename:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
if new_filename == old_filename:
# still the same as before.
return new_filename
@@ -94,7 +102,7 @@ def generate_unique_filename(doc, root):
return new_filename
def generate_filename(doc, counter=0, append_gpg=True):
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
@@ -148,21 +156,16 @@ def generate_filename(doc, counter=0, append_gpg=True):
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{doc.file_type}"
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{doc.file_type}"
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
def archive_name_from_filename(filename):
name, ext = os.path.splitext(filename)
if ext == ".pdf":
return filename
else:
return filename + ".pdf"
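The collision handling in generate_unique_filename is a counter-suffix loop: build a candidate name, keep it if it is the name the document already owns, bump the counter while the candidate exists on disk. A self-contained simplification of that loop (no PAPERLESS_FILENAME_FORMAT template, no .gpg suffix; function and parameter names are invented for illustration):

# Illustration only, not the project's implementation.
import os


def unique_name(directory, stem, extension, current=None):
    """Return stem{_NN}extension, skipping names already taken on disk."""
    counter = 0
    while True:
        suffix = f"_{counter:02}" if counter else ""
        candidate = f"{stem}{suffix}{extension}"
        if candidate == current:
            # The document already owns this name; keep it.
            return candidate
        if os.path.exists(os.path.join(directory, candidate)):
            counter += 1
        else:
            return candidate


# The archive variant differs only in root directory and extension, e.g.:
#   unique_name(ARCHIVE_DIR, stem, ".pdf", current=doc.archive_filename)
#   unique_name(ORIGINALS_DIR, stem, doc.file_type, current=doc.filename)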

View File

@@ -16,7 +16,8 @@ from whoosh.writing import AsyncWriter
from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory
from ...file_handling import create_source_path_directory, \
generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
@@ -39,13 +40,16 @@ def handle_document(document_id):
with transaction.atomic():
with open(parser.get_archive_path(), 'rb') as f:
checksum = hashlib.md5(f.read()).hexdigest()
# i'm going to save first so that in case the file move
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# we also don't use save() since that triggers the filehandling
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document, archive_filename=True)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text()
content=parser.get_text(),
archive_filename=document.archive_filename
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
@@ -101,7 +105,7 @@ class Command(BaseCommand):
document_ids = list(map(
lambda doc: doc.id,
filter(
lambda d: overwrite or not d.archive_checksum,
lambda d: overwrite or not d.has_archive_version,
documents
)
))
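The archiver hunk records the new archive filename together with checksum and content via a queryset update() — deliberately bypassing save() and its file-handling signal — before the archive file is moved, so a failed move rolls the database back. A rough sketch of that pattern, assuming a configured Django project; document_model, doc and the paths are placeholders:

# Sketch of "update the row first, move the file second"; not the command's code.
import hashlib
import shutil

from django.db import transaction


def attach_archive(document_model, doc, archive_tmp_path, archive_dest_path):
    # doc.archive_filename is assumed to already hold a unique name.
    with open(archive_tmp_path, "rb") as f:
        checksum = hashlib.md5(f.read()).hexdigest()

    with transaction.atomic():
        # update() skips save() and therefore the filename-handling signal,
        # which must not run while the file is not yet in place.
        document_model.objects.filter(pk=doc.pk).update(
            archive_checksum=checksum,
            archive_filename=doc.archive_filename,
        )
        # If the move fails, the exception propagates and the transaction is
        # rolled back, so the row never points at a missing archive file.
        shutil.move(archive_tmp_path, archive_dest_path)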

View File

@@ -139,7 +139,7 @@ class Command(BaseCommand):
thumbnail_target = os.path.join(self.target, thumbnail_name)
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
if os.path.exists(document.archive_path):
if document.has_archive_version:
archive_name = base_name + "-archive.pdf"
archive_target = os.path.join(self.target, archive_name)
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name

View File

@@ -1,43 +1,27 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
import pathvalidate
from django.conf import settings
from django.db import migrations
from django.db import migrations, models
from django.template.defaultfilters import slugify
from documents.file_handling import defaultdictNoStr, many_to_dictionary
logger = logging.getLogger("paperless.migrations")
def archive_name_from_filename_old(filename):
def archive_name_from_filename(filename):
return os.path.splitext(filename)[0] + ".pdf"
def archive_path_old(doc):
if doc.filename:
fname = archive_name_from_filename_old(doc.filename)
else:
fname = "{:07}.pdf".format(doc.pk)
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
def archive_name_from_filename_new(filename):
name, ext = os.path.splitext(filename)
if ext == ".pdf":
return filename
else:
return filename + ".pdf"
def archive_path_new(doc):
if doc.filename:
fname = archive_name_from_filename_new(doc.filename)
fname = archive_name_from_filename(doc.filename)
else:
fname = "{:07}.pdf".format(doc.pk)
@@ -50,6 +34,16 @@ def archive_path_new(doc):
STORAGE_TYPE_GPG = "gpg"
def archive_path_new(doc):
if doc.archive_filename is not None:
return os.path.join(
settings.ARCHIVE_DIR,
str(doc.archive_filename)
)
else:
return None
def source_path(doc):
if doc.filename:
fname = str(doc.filename)
@@ -64,6 +58,98 @@ def source_path(doc):
)
def generate_unique_filename(doc, archive_filename=False):
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
counter = 0
while True:
new_filename = generate_filename(
doc, counter, archive_filename=archive_filename)
if new_filename == old_filename:
# still the same as before.
return new_filename
if os.path.exists(os.path.join(root, new_filename)):
counter += 1
else:
return new_filename
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None),
many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted(
[tag.name for tag in doc.tags.all()]
)),
replacement_text="-"
)
if doc.correspondent:
correspondent = pathvalidate.sanitize_filename(
doc.correspondent.name, replacement_text="-"
)
else:
correspondent = "none"
if doc.document_type:
document_type = pathvalidate.sanitize_filename(
doc.document_type.name, replacement_text="-"
)
else:
document_type = "none"
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(
doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(doc.created),
created_year=doc.created.year if doc.created else "none",
created_month=f"{doc.created.month:02}" if doc.created else "none", # NOQA: E501
created_day=f"{doc.created.day:02}" if doc.created else "none",
added=datetime.date.isoformat(doc.added),
added_year=doc.added.year if doc.added else "none",
added_month=f"{doc.added.month:02}" if doc.added else "none",
added_day=f"{doc.added.day:02}" if doc.added else "none",
tags=tags,
tag_list=tag_list
).strip()
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
filename += ".gpg"
return filename
def move_old_to_new_locations(apps, schema_editor):
Document = apps.get_model("documents", "Document")
@@ -74,18 +160,12 @@ def move_old_to_new_locations(apps, schema_editor):
# check for documents that have incorrect archive versions
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
new_path = archive_path_new(doc)
if not os.path.isfile(old_path):
raise ValueError(
f"Archived document of {doc.filename} does not exist at: "
f"{old_path}")
if old_path != new_path and os.path.isfile(new_path):
raise ValueError(
f"Need to move {old_path} to {new_path}, but target file "
f"already exists")
if old_path in old_archive_path_to_id:
affected_document_ids.add(doc.id)
affected_document_ids.add(old_archive_path_to_id[old_path])
@@ -103,22 +183,19 @@ def move_old_to_new_locations(apps, schema_editor):
f"document {doc.filename} has an invalid archived document, "
f"but no parsers are available. Cannot migrate.")
# move files
for doc in Document.objects.filter(archive_checksum__isnull=False):
old_path = archive_path_old(doc)
new_path = archive_path_new(doc)
if doc.id in affected_document_ids:
old_path = archive_path_old(doc)
# remove affected archive versions
if os.path.isfile(old_path):
os.unlink(old_path)
else:
# move unaffected archive versions
if old_path != new_path and os.path.isfile(old_path) and not os.path.isfile(new_path):
logger.debug(
f"Moving {old_path} to {new_path}"
)
shutil.move(old_path, new_path)
# Set archive path for unaffected files
doc.archive_filename = archive_path_old(doc)
Document.objects.filter(id=doc.id).update(
archive_filename=doc.archive_filename
)
# regenerate archive documents
for doc_id in affected_document_ids:
@@ -135,14 +212,16 @@ def move_old_to_new_locations(apps, schema_editor):
try:
parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
doc.content = parser.get_text()
if parser.archive_path and os.path.isfile(parser.archive_path):
with open(parser.archive_path, "rb") as f:
if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
doc.archive_filename = generate_unique_filename(
doc, archive_filename=True)
with open(parser.get_archive_path(), "rb") as f:
doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
shutil.copy2(parser.archive_path, archive_path_new(doc))
os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
else:
doc.archive_checksum = None
if os.path.isfile(archive_path_new(doc)):
os.unlink(archive_path_new(doc))
doc.save()
except ParseError:
logger.exception(
@@ -187,8 +266,18 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AddField(
model_name='document',
name='archive_filename',
field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
),
migrations.AlterField(
model_name='document',
name='filename',
field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
),
migrations.RunPython(
move_old_to_new_locations,
move_new_to_old_locations
)
),
]
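The migration above duplicates generate_filename/generate_unique_filename because RunPython steps operate on historical models and should not import current application code. Stripped to its skeleton, a data migration of this shape looks roughly like the following; the app label, dependency and field options are placeholders, not the real migration:

# Generic RunPython skeleton for illustration; not the migration shown above.
from django.db import migrations, models


def forwards(apps, schema_editor):
    Document = apps.get_model("documents", "Document")  # historical model
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        # ... compute doc.archive_filename and move files on disk here ...
        Document.objects.filter(id=doc.id).update(
            archive_filename=doc.archive_filename)


def backwards(apps, schema_editor):
    # Undo the filesystem and database changes made by forwards().
    pass


class Migration(migrations.Migration):

    dependencies = [
        ("documents", "0001_initial"),  # placeholder dependency
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="archive_filename",
            field=models.FilePathField(default=None, editable=False,
                                       max_length=1024, null=True,
                                       unique=True),
        ),
        migrations.RunPython(forwards, backwards),
    ]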

View File

@@ -16,7 +16,6 @@ from django.utils.timezone import is_aware
from django.utils.translation import gettext_lazy as _
from documents.file_handling import archive_name_from_filename
from documents.parsers import get_default_file_extension
@@ -208,10 +207,21 @@ class Document(models.Model):
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text=_("Current filename in storage")
)
archive_filename = models.FilePathField(
_("archive filename"),
max_length=1024,
editable=False,
default=None,
unique=True,
null=True,
help_text=_("Current archive filename in storage")
)
archive_serial_number = models.IntegerField(
_("archive serial number"),
blank=True,
@@ -256,16 +266,19 @@ class Document(models.Model):
return open(self.source_path, "rb")
@property
def archive_path(self):
if self.filename:
fname = archive_name_from_filename(self.filename)
else:
fname = "{:07}.pdf".format(self.pk)
def has_archive_version(self):
return self.archive_filename is not None
@property
def archive_path(self):
if self.has_archive_version:
return os.path.join(
settings.ARCHIVE_DIR,
str(self.archive_filename)
)
else:
return None
return os.path.join(
settings.ARCHIVE_DIR,
fname
)
@property
def archive_file(self):
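With archive_path now derived from archive_filename and returning None when no archive exists, callers are expected to check has_archive_version first. A hypothetical helper, for illustration only:

# Illustrative usage, not part of the model.
def open_best_version(doc):
    if doc.has_archive_version:
        return open(doc.archive_path, "rb")  # archive_path is a real path here
    return open(doc.source_path, "rb")       # fall back to the original file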

View File

@@ -88,7 +88,7 @@ def check_sanity():
))
# Check sanity of the archive file.
if doc.archive_checksum:
if doc.has_archive_version:
if not os.path.isfile(doc.archive_path):
messages.append(SanityError(
f"Archived version of document {doc.pk} does not exist."

View File

@@ -14,7 +14,7 @@ from filelock import FileLock
from .. import index, matching
from ..file_handling import delete_empty_directories, \
create_source_path_directory, archive_name_from_filename, \
create_source_path_directory, \
generate_unique_filename
from ..models import Document, Tag
@@ -148,18 +148,18 @@ def set_tags(sender,
@receiver(models.signals.post_delete, sender=Document)
def cleanup_document_deletion(sender, instance, using, **kwargs):
with FileLock(settings.MEDIA_LOCK):
for f in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
if os.path.isfile(f):
for filename in (instance.source_path,
instance.archive_path,
instance.thumbnail_path):
if filename and os.path.isfile(filename):
try:
os.unlink(f)
os.unlink(filename)
logger.debug(
f"Deleted file {f}.")
f"Deleted file {filename}.")
except OSError as e:
logger.warning(
f"While deleting document {str(instance)}, the file "
f"{f} could not be deleted: {e}"
f"{filename} could not be deleted: {e}"
)
delete_empty_directories(
@@ -167,10 +167,11 @@ def cleanup_document_deletion(sender, instance, using, **kwargs):
root=settings.ORIGINALS_DIR
)
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
if instance.has_archive_version:
delete_empty_directories(
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR
)
def validate_move(instance, old_path, new_path):
@@ -207,8 +208,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
with FileLock(settings.MEDIA_LOCK):
old_filename = instance.filename
new_filename = generate_unique_filename(
instance, settings.ORIGINALS_DIR)
new_filename = generate_unique_filename(instance)
if new_filename == instance.filename:
# Don't do anything if its the same.
@@ -222,8 +222,11 @@ def update_filename_and_move_files(sender, instance, **kwargs):
# archive files are optional, archive checksum tells us if we have one,
# since this is None for documents without archived files.
if instance.archive_checksum:
new_archive_filename = archive_name_from_filename(new_filename)
if instance.has_archive_version:
old_archive_filename = instance.archive_filename
new_archive_filename = generate_unique_filename(
instance, archive_filename=True
)
old_archive_path = instance.archive_path
new_archive_path = os.path.join(settings.ARCHIVE_DIR,
new_archive_filename)
@@ -233,6 +236,8 @@ def update_filename_and_move_files(sender, instance, **kwargs):
create_source_path_directory(new_archive_path)
else:
old_archive_filename = None
new_archive_filename = None
old_archive_path = None
new_archive_path = None
@@ -240,22 +245,28 @@ def update_filename_and_move_files(sender, instance, **kwargs):
try:
os.rename(old_source_path, new_source_path)
if instance.archive_checksum:
os.rename(old_archive_path, new_archive_path)
instance.filename = new_filename
if instance.has_archive_version:
os.rename(old_archive_path, new_archive_path)
instance.archive_filename = new_archive_filename
# Don't save() here to prevent infinite recursion.
Document.objects.filter(pk=instance.pk).update(
filename=new_filename)
filename=instance.filename,
archive_filename=instance.archive_filename,
)
except OSError as e:
instance.filename = old_filename
instance.archive_filename = old_archive_filename
# this happens when we can't move a file. If that's the case for
# the archive file, we try our best to revert the changes.
# no need to save the instance, the update() has not happened yet.
try:
os.rename(new_source_path, old_source_path)
os.rename(new_archive_path, old_archive_path)
if instance.has_archive_version:
os.rename(new_archive_path, old_archive_path)
except Exception as e:
# This is fine, since:
# A: if we managed to move source from A to B, we will also
@@ -271,9 +282,10 @@ def update_filename_and_move_files(sender, instance, **kwargs):
# since moving them once succeeded, it's very likely going to
# succeed again.
os.rename(new_source_path, old_source_path)
if instance.archive_checksum:
if instance.has_archive_version:
os.rename(new_archive_path, old_archive_path)
instance.filename = old_filename
instance.archive_filename = old_archive_filename
# again, no need to save the instance, since the actual update()
# operation failed.
@@ -283,7 +295,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
delete_empty_directories(os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR)
if old_archive_path and not os.path.isfile(old_archive_path):
if instance.has_archive_version and not os.path.isfile(old_archive_path): # NOQA: E501
delete_empty_directories(os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR)
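The rename block above moves the original and, when present, the archive file, and reverts the first rename if the second one fails so the not-yet-updated database still matches the disk. The core of that rollback, stripped of the project specifics (hypothetical helper, not the actual signal handler):

# Stand-alone sketch of rename-with-rollback; not the handler's code.
import os


def rename_pair(old_source, new_source, old_archive=None, new_archive=None):
    os.rename(old_source, new_source)
    try:
        if old_archive and new_archive:
            os.rename(old_archive, new_archive)
    except OSError:
        # Moving the archive failed: put the original back so the database,
        # which has not been updated yet, still points at existing files.
        os.rename(new_source, old_source)
        raise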

View File

@@ -192,7 +192,7 @@ class DocumentViewSet(RetrieveModelMixin,
def file_response(self, pk, request, disposition):
doc = Document.objects.get(id=pk)
if not self.original_requested(request) and os.path.isfile(doc.archive_path): # NOQA: E501
if not self.original_requested(request) and doc.has_archive_version: # NOQA: E501
file_handle = doc.archive_file
filename = doc.get_public_filename(archive=True)
mime_type = 'application/pdf'
@@ -237,18 +237,18 @@ class DocumentViewSet(RetrieveModelMixin,
"original_size": os.stat(doc.source_path).st_size,
"original_mime_type": doc.mime_type,
"media_filename": doc.filename,
"has_archive_version": os.path.isfile(doc.archive_path),
"has_archive_version": doc.has_archive_version,
"original_metadata": self.get_metadata(
doc.source_path, doc.mime_type)
doc.source_path, doc.mime_type),
"archive_checksum": doc.archive_checksum,
"archive_media_filename": doc.archive_filename
}
if doc.archive_checksum and os.path.isfile(doc.archive_path):
meta['archive_checksum'] = doc.archive_checksum
if doc.has_archive_version:
meta['archive_size'] = os.stat(doc.archive_path).st_size
meta['archive_metadata'] = self.get_metadata(
doc.archive_path, "application/pdf")
else:
meta['archive_checksum'] = None
meta['archive_size'] = None
meta['archive_metadata'] = None
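The metadata endpoint now reports has_archive_version from the model property and always returns archive_checksum and archive_media_filename, filling size and parsed metadata only when an archive exists. A sketch of assembling such a payload; the field names follow the hunk above, while get_metadata and everything else are placeholders:

# Illustration of the conditional payload; not the view's code.
import os


def build_metadata(doc, get_metadata):
    meta = {
        "media_filename": doc.filename,
        "has_archive_version": doc.has_archive_version,
        "archive_checksum": doc.archive_checksum,
        "archive_media_filename": doc.archive_filename,
    }
    if doc.has_archive_version:
        meta["archive_size"] = os.stat(doc.archive_path).st_size
        meta["archive_metadata"] = get_metadata(doc.archive_path,
                                                "application/pdf")
    else:
        meta["archive_size"] = None
        meta["archive_metadata"] = None
    return meta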