Mirror of https://github.com/paperless-ngx/paperless-ngx.git (synced 2025-08-10 00:18:57 +00:00)
Archive filenames are now stored in the database and checked for collisions, just like original filenames; unified the method for archive version checking.
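In short: the archive document's filename is no longer derived from the original's filename at access time. It is generated once, persisted in a new unique archive_filename column, and passed through the same counter-based collision check that original filenames already get. A minimal standalone sketch of that collision strategy (simplified from generate_unique_filename in the diff below; next_free_name and its parameters are illustrative, not part of the codebase):

    import os

    def next_free_name(stem, ext, root, current=None):
        # Try "stem.ext", then "stem_01.ext", "stem_02.ext", ... until the
        # candidate either equals the name this document already owns or
        # does not yet exist on disk.
        counter = 0
        while True:
            suffix = f"_{counter:02}" if counter else ""
            candidate = f"{stem}{suffix}{ext}"
            if candidate == current:
                return candidate  # unchanged, keep the existing name
            if os.path.exists(os.path.join(root, candidate)):
                counter += 1  # taken by another document, bump the counter
            else:
                return candidate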
@@ -1,43 +1,27 @@
 # Generated by Django 3.1.6 on 2021-02-07 22:26
 import datetime
 import hashlib
 import logging
 import os
 import shutil
 
 import pathvalidate
 from django.conf import settings
-from django.db import migrations
+from django.db import migrations, models
+from django.template.defaultfilters import slugify
+
+from documents.file_handling import defaultdictNoStr, many_to_dictionary
+
+logger = logging.getLogger("paperless.migrations")
 
 
-def archive_name_from_filename_old(filename):
+def archive_name_from_filename(filename):
     return os.path.splitext(filename)[0] + ".pdf"
 
 
 def archive_path_old(doc):
     if doc.filename:
-        fname = archive_name_from_filename_old(doc.filename)
-    else:
-        fname = "{:07}.pdf".format(doc.pk)
-
-    return os.path.join(
-        settings.ARCHIVE_DIR,
-        fname
-    )
-
-
-def archive_name_from_filename_new(filename):
-    name, ext = os.path.splitext(filename)
-    if ext == ".pdf":
-        return filename
-    else:
-        return filename + ".pdf"
-
-
-def archive_path_new(doc):
-    if doc.filename:
-        fname = archive_name_from_filename_new(doc.filename)
+        fname = archive_name_from_filename(doc.filename)
     else:
         fname = "{:07}.pdf".format(doc.pk)
 
@@ -50,6 +34,16 @@ def archive_path_new(doc):
 STORAGE_TYPE_GPG = "gpg"
 
 
+def archive_path_new(doc):
+    if doc.archive_filename is not None:
+        return os.path.join(
+            settings.ARCHIVE_DIR,
+            str(doc.archive_filename)
+        )
+    else:
+        return None
+
+
 def source_path(doc):
     if doc.filename:
         fname = str(doc.filename)
@@ -64,6 +58,98 @@ def source_path(doc):
     )
 
 
+def generate_unique_filename(doc, archive_filename=False):
+    if archive_filename:
+        old_filename = doc.archive_filename
+        root = settings.ARCHIVE_DIR
+    else:
+        old_filename = doc.filename
+        root = settings.ORIGINALS_DIR
+
+    counter = 0
+
+    while True:
+        new_filename = generate_filename(
+            doc, counter, archive_filename=archive_filename)
+        if new_filename == old_filename:
+            # still the same as before.
+            return new_filename
+
+        if os.path.exists(os.path.join(root, new_filename)):
+            counter += 1
+        else:
+            return new_filename
+
+
+def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
+    path = ""
+
+    try:
+        if settings.PAPERLESS_FILENAME_FORMAT is not None:
+            tags = defaultdictNoStr(lambda: slugify(None),
+                                    many_to_dictionary(doc.tags))
+
+            tag_list = pathvalidate.sanitize_filename(
+                ",".join(sorted(
+                    [tag.name for tag in doc.tags.all()]
+                )),
+                replacement_text="-"
+            )
+
+            if doc.correspondent:
+                correspondent = pathvalidate.sanitize_filename(
+                    doc.correspondent.name, replacement_text="-"
+                )
+            else:
+                correspondent = "none"
+
+            if doc.document_type:
+                document_type = pathvalidate.sanitize_filename(
+                    doc.document_type.name, replacement_text="-"
+                )
+            else:
+                document_type = "none"
+
+            path = settings.PAPERLESS_FILENAME_FORMAT.format(
+                title=pathvalidate.sanitize_filename(
+                    doc.title, replacement_text="-"),
+                correspondent=correspondent,
+                document_type=document_type,
+                created=datetime.date.isoformat(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=f"{doc.created.month:02}" if doc.created else "none",  # NOQA: E501
+                created_day=f"{doc.created.day:02}" if doc.created else "none",
+                added=datetime.date.isoformat(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=f"{doc.added.month:02}" if doc.added else "none",
+                added_day=f"{doc.added.day:02}" if doc.added else "none",
+                tags=tags,
+                tag_list=tag_list
+            ).strip()
+
+            path = path.strip(os.sep)
+
+    except (ValueError, KeyError, IndexError):
+        logger.warning(
+            f"Invalid PAPERLESS_FILENAME_FORMAT: "
+            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")
+
+    counter_str = f"_{counter:02}" if counter else ""
+
+    filetype_str = ".pdf" if archive_filename else doc.file_type
+
+    if len(path) > 0:
+        filename = f"{path}{counter_str}{filetype_str}"
+    else:
+        filename = f"{doc.pk:07}{counter_str}{filetype_str}"
+
+    # Append .gpg for encrypted files
+    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
+        filename += ".gpg"
+
+    return filename
+
+
 def move_old_to_new_locations(apps, schema_editor):
     Document = apps.get_model("documents", "Document")
 
@@ -74,18 +160,12 @@ def move_old_to_new_locations(apps, schema_editor):
     # check for documents that have incorrect archive versions
     for doc in Document.objects.filter(archive_checksum__isnull=False):
         old_path = archive_path_old(doc)
-        new_path = archive_path_new(doc)
 
         if not os.path.isfile(old_path):
             raise ValueError(
                 f"Archived document of {doc.filename} does not exist at: "
                 f"{old_path}")
 
-        if old_path != new_path and os.path.isfile(new_path):
-            raise ValueError(
-                f"Need to move {old_path} to {new_path}, but target file "
-                f"already exists")
-
         if old_path in old_archive_path_to_id:
             affected_document_ids.add(doc.id)
             affected_document_ids.add(old_archive_path_to_id[old_path])
@@ -103,22 +183,19 @@ def move_old_to_new_locations(apps, schema_editor):
             f"document {doc.filename} has an invalid archived document, "
             f"but no parsers are available. Cannot migrate.")
 
     # move files
     for doc in Document.objects.filter(archive_checksum__isnull=False):
-        old_path = archive_path_old(doc)
-        new_path = archive_path_new(doc)
-
         if doc.id in affected_document_ids:
+            old_path = archive_path_old(doc)
             # remove affected archive versions
             if os.path.isfile(old_path):
                 os.unlink(old_path)
         else:
-            # move unaffected archive versions
-            if old_path != new_path and os.path.isfile(old_path) and not os.path.isfile(new_path):
-                logger.debug(
-                    f"Moving {old_path} to {new_path}"
-                )
-                shutil.move(old_path, new_path)
+            # Set archive path for unaffected files
+            doc.archive_filename = archive_path_old(doc)
+            Document.objects.filter(id=doc.id).update(
+                archive_filename=doc.archive_filename
+            )
 
     # regenerate archive documents
     for doc_id in affected_document_ids:
@@ -135,14 +212,16 @@ def move_old_to_new_locations(apps, schema_editor):
         try:
             parser.parse(source_path(doc), doc.mime_type, os.path.basename(doc.filename))
             doc.content = parser.get_text()
-            if parser.archive_path and os.path.isfile(parser.archive_path):
-                with open(parser.archive_path, "rb") as f:
+
+            if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
+                doc.archive_filename = generate_unique_filename(
+                    doc, archive_filename=True)
+                with open(parser.get_archive_path(), "rb") as f:
                     doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
-                shutil.copy2(parser.archive_path, archive_path_new(doc))
+                os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
+                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
             else:
                 doc.archive_checksum = None
                 if os.path.isfile(archive_path_new(doc)):
                     os.unlink(archive_path_new(doc))
             doc.save()
         except ParseError:
             logger.exception(
@@ -187,8 +266,18 @@ class Migration(migrations.Migration):
     ]
 
     operations = [
+        migrations.AddField(
+            model_name='document',
+            name='archive_filename',
+            field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
+        ),
+        migrations.AlterField(
+            model_name='document',
+            name='filename',
+            field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
+        ),
         migrations.RunPython(
             move_old_to_new_locations,
             move_new_to_old_locations
-        )
+        ),
     ]
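Why collisions occur at all: the legacy naming rule kept at the top of the migration (archive_name_from_filename) strips the original's extension and appends ".pdf", so two originals that differ only in extension map to the same archive path; the old_archive_path_to_id bookkeeping above detects exactly those documents and regenerates their archive versions. A quick illustration, using the rule as it appears in the diff:

    import os

    def archive_name_from_filename(filename):
        # legacy behaviour: drop the extension, append ".pdf"
        return os.path.splitext(filename)[0] + ".pdf"

    # two distinct originals collide on one archive name
    assert archive_name_from_filename("scan.jpg") == "scan.pdf"
    assert archive_name_from_filename("scan.png") == "scan.pdf"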