archive filenames are now stored in the database and checked for collisions just like original filenames; unified method for archive version checking

Author: jonaswinkler
Date:   2021-02-09 19:46:19 +01:00
Parent: 05f59e7d5e
Commit: fca8576d80

9 changed files with 229 additions and 105 deletions
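The collision handling described in the commit message appends a two-digit counter to a generated name until it no longer clashes with a file already on disk. A minimal standalone sketch of that scheme (the function and directory names here are illustrative, not from the commit):

import os

def unique_name(base, ext, directory):
    # Try "base.ext", then "base_01.ext", "base_02.ext", ...,
    # mirroring the counter loop in generate_unique_filename below.
    counter = 0
    while True:
        suffix = f"_{counter:02}" if counter else ""
        candidate = f"{base}{suffix}{ext}"
        if not os.path.exists(os.path.join(directory, candidate)):
            return candidate
        counter += 1

With "invoice.pdf" already present in the directory, unique_name("invoice", ".pdf", ".") returns "invoice_01.pdf".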


@@ -1,43 +1,27 @@
 # Generated by Django 3.1.6 on 2021-02-07 22:26
+import datetime
 import hashlib
 import logging
 import os
 import shutil
+import pathvalidate
 from django.conf import settings
-from django.db import migrations
+from django.db import migrations, models
+from django.template.defaultfilters import slugify
+from documents.file_handling import defaultdictNoStr, many_to_dictionary
 
 logger = logging.getLogger("paperless.migrations")
 
 
-def archive_name_from_filename_old(filename):
+def archive_name_from_filename(filename):
     return os.path.splitext(filename)[0] + ".pdf"
 
 
 def archive_path_old(doc):
     if doc.filename:
-        fname = archive_name_from_filename_old(doc.filename)
-    else:
-        fname = "{:07}.pdf".format(doc.pk)
-
-    return os.path.join(
-        settings.ARCHIVE_DIR,
-        fname
-    )
-
-
-def archive_name_from_filename_new(filename):
-    name, ext = os.path.splitext(filename)
-    if ext == ".pdf":
-        return filename
-    else:
-        return filename + ".pdf"
-
-
-def archive_path_new(doc):
-    if doc.filename:
-        fname = archive_name_from_filename_new(doc.filename)
+        fname = archive_name_from_filename(doc.filename)
     else:
         fname = "{:07}.pdf".format(doc.pk)
@@ -50,6 +34,16 @@ def archive_path_new(doc):
 
 STORAGE_TYPE_GPG = "gpg"
 
 
+def archive_path_new(doc):
+    if doc.archive_filename is not None:
+        return os.path.join(
+            settings.ARCHIVE_DIR,
+            str(doc.archive_filename)
+        )
+    else:
+        return None
+
+
 def source_path(doc):
     if doc.filename:
         fname = str(doc.filename)
@@ -64,6 +58,98 @@ def source_path(doc):
     )
 
 
+def generate_unique_filename(doc, archive_filename=False):
+    if archive_filename:
+        old_filename = doc.archive_filename
+        root = settings.ARCHIVE_DIR
+    else:
+        old_filename = doc.filename
+        root = settings.ORIGINALS_DIR
+
+    counter = 0
+
+    while True:
+        new_filename = generate_filename(
+            doc, counter, archive_filename=archive_filename)
+        if new_filename == old_filename:
+            # still the same as before.
+            return new_filename
+
+        if os.path.exists(os.path.join(root, new_filename)):
+            counter += 1
+        else:
+            return new_filename
+
+
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
+    path = ""
+
+    try:
+        if settings.PAPERLESS_FILENAME_FORMAT is not None:
+            tags = defaultdictNoStr(lambda: slugify(None),
+                                    many_to_dictionary(doc.tags))
+
+            tag_list = pathvalidate.sanitize_filename(
+                ",".join(sorted(
+                    [tag.name for tag in doc.tags.all()]
+                )),
+                replacement_text="-"
+            )
+
+            if doc.correspondent:
+                correspondent = pathvalidate.sanitize_filename(
+                    doc.correspondent.name, replacement_text="-"
+                )
+            else:
+                correspondent = "none"
+
+            if doc.document_type:
+                document_type = pathvalidate.sanitize_filename(
+                    doc.document_type.name, replacement_text="-"
+                )
+            else:
+                document_type = "none"
+
+            path = settings.PAPERLESS_FILENAME_FORMAT.format(
+                title=pathvalidate.sanitize_filename(
+                    doc.title, replacement_text="-"),
+                correspondent=correspondent,
+                document_type=document_type,
+                created=datetime.date.isoformat(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=f"{doc.created.month:02}" if doc.created else "none",  # NOQA: E501
+                created_day=f"{doc.created.day:02}" if doc.created else "none",
+                added=datetime.date.isoformat(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=f"{doc.added.month:02}" if doc.added else "none",
+                added_day=f"{doc.added.day:02}" if doc.added else "none",
+                tags=tags,
+                tag_list=tag_list
+            ).strip()
+
+        path = path.strip(os.sep)
+
+    except (ValueError, KeyError, IndexError):
+        logger.warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
+
+    counter_str = f"_{counter:02}" if counter else ""
+
+    filetype_str = ".pdf" if archive_filename else doc.file_type
+
+    if len(path) > 0:
+        filename = f"{path}{counter_str}{filetype_str}"
+    else:
+        filename = f"{doc.pk:07}{counter_str}{filetype_str}"
+
+    # Append .gpg for encrypted files
+    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
+        filename += ".gpg"
+
+    return filename
+
+
 def move_old_to_new_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")
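Annotation (not part of the diff): given a PAPERLESS_FILENAME_FORMAT built from the placeholders above, generate_filename produces a relative path and generate_unique_filename bumps the counter on a clash; archive filenames always get a .pdf extension, while originals keep doc.file_type. With made-up example values:

PAPERLESS_FILENAME_FORMAT = "{created_year}/{correspondent}/{title}"
# document created 2021-02-09, correspondent "ACME", title "quarterly-report"
# generate_filename(doc, archive_filename=True)  -> "2021/ACME/quarterly-report.pdf"
# next collision-free candidate (counter=1)      -> "2021/ACME/quarterly-report_01.pdf"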
@@ -74,18 +160,12 @@ def move_old_to_new_locations(apps, schema_editor):
     # check for documents that have incorrect archive versions
     for doc in Document.objects.filter(archive_checksum__isnull=False):
         old_path = archive_path_old(doc)
-        new_path = archive_path_new(doc)
 
         if not os.path.isfile(old_path):
             raise ValueError(
                 f"Archived document of {doc.filename} does not exist at: "
                 f"{old_path}")
 
-        if old_path != new_path and os.path.isfile(new_path):
-            raise ValueError(
-                f"Need to move {old_path} to {new_path}, but target file "
-                f"already exists")
-
         if old_path in old_archive_path_to_id:
             affected_document_ids.add(doc.id)
             affected_document_ids.add(old_archive_path_to_id[old_path])
@@ -103,22 +183,19 @@ def move_old_to_new_locations(apps, schema_editor):
                 f"document {doc.filename} has an invalid archived document, "
                 f"but no parsers are available. Cannot migrate.")
 
     # move files
     for doc in Document.objects.filter(archive_checksum__isnull=False):
-        old_path = archive_path_old(doc)
-        new_path = archive_path_new(doc)
-
         if doc.id in affected_document_ids:
+            old_path = archive_path_old(doc)
             # remove affected archive versions
             if os.path.isfile(old_path):
                 os.unlink(old_path)
         else:
-            # move unaffected archive versions
-            if old_path != new_path and os.path.isfile(old_path) and not os.path.isfile(new_path):
-                logger.debug(
-                    f"Moving {old_path} to {new_path}"
-                )
-                shutil.move(old_path, new_path)
+            # Set archive path for unaffected files
+            doc.archive_filename = archive_path_old(doc)
+            Document.objects.filter(id=doc.id).update(
+                archive_filename=doc.archive_filename
+            )
 
     # regenerate archive documents
     for doc_id in affected_document_ids:
@@ -135,14 +212,16 @@ def move_old_to_new_locations(apps, schema_editor):
         try:
             parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
             doc.content = parser.get_text()
 
-            if parser.archive_path and os.path.isfile(parser.archive_path):
-                with open(parser.archive_path, "rb") as f:
+            if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
+                doc.archive_filename = generate_unique_filename(
+                    doc, archive_filename=True)
+                with open(parser.get_archive_path(), "rb") as f:
                     doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
-                shutil.copy2(parser.archive_path, archive_path_new(doc))
+                os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
+                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
             else:
                 doc.archive_checksum = None
-                if os.path.isfile(archive_path_new(doc)):
-                    os.unlink(archive_path_new(doc))
 
             doc.save()
         except ParseError:
             logger.exception(
@@ -187,8 +266,18 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
+        migrations.AddField(
+            model_name='document',
+            name='archive_filename',
+            field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='filename',
+            field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
+        ),
         migrations.RunPython(
             move_old_to_new_locations,
             move_new_to_old_locations
-        )
+        ),
     ]
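A hypothetical post-migration sanity check (not part of the commit; assumes a Django shell in a paperless environment): every document with an archive checksum should now carry a unique, database-backed archive_filename that resolves under ARCHIVE_DIR.

import os
from django.conf import settings
from documents.models import Document

for doc in Document.objects.filter(archive_checksum__isnull=False):
    # The new database-backed field replaces the old filename-derived path.
    assert doc.archive_filename is not None
    assert os.path.isfile(
        os.path.join(settings.ARCHIVE_DIR, str(doc.archive_filename)))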