migration for #511
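
This migration renames existing archive files from the old naming scheme, which replaced the source file's extension with ".pdf", to the new scheme, which appends ".pdf" unless the source is already a PDF, and regenerates the archive files of documents whose archives collided under the old scheme.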
src/documents/migrations/1012_fix_archive_files.py (new file, 181 lines)
@@ -0,0 +1,181 @@
# Generated by Django 3.1.6 on 2021-02-07 22:26
import hashlib
import logging
import os
import shutil

from django.conf import settings
from django.db import migrations


logger = logging.getLogger("paperless.migrations")


def archive_name_from_filename_old(filename):
    return os.path.splitext(filename)[0] + ".pdf"


def archive_path_old(doc):
    if doc.filename:
        fname = archive_name_from_filename_old(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(
        settings.ARCHIVE_DIR,
        fname
    )
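
# The old scheme maps distinct source files onto the same archive name:
#   archive_name_from_filename_old("scan.jpg") -> "scan.pdf"
#   archive_name_from_filename_old("scan.png") -> "scan.pdf"  (collision)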


def archive_name_from_filename_new(filename):
    name, ext = os.path.splitext(filename)
    if ext == ".pdf":
        return filename
    else:
        return filename + ".pdf"


def archive_path_new(doc):
    if doc.filename:
        fname = archive_name_from_filename_new(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(
        settings.ARCHIVE_DIR,
        fname
    )
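
# The new scheme keeps distinct source names distinct:
#   archive_name_from_filename_new("scan.jpg") -> "scan.jpg.pdf"
#   archive_name_from_filename_new("scan.pdf") -> "scan.pdf"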


STORAGE_TYPE_GPG = "gpg"


def source_path(doc):
    if doc.filename:
        fname = str(doc.filename)
    else:
        fname = "{:07}{}".format(doc.pk, doc.file_type)
        if doc.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"  # pragma: no cover

    return os.path.join(
        settings.ORIGINALS_DIR,
        fname
    )
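
# These path helpers are copied into the migration rather than imported from
# documents.models, because apps.get_model() returns historical versions of
# the models that carry none of the custom methods defined on Document.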


def move_old_to_new_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()

    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)

        if not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document of {doc.filename} does not exist at: "
                f"{old_path}")

        if old_path in old_archive_path_to_id:
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id
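
    # old_archive_path_to_id maps each old-style archive path to the first
    # document that claimed it; a second hit marks both documents as affected.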

    # check that we can regenerate these archive versions
    for doc_id in affected_document_ids:
        from documents.parsers import get_parser_class_for_mime_type

        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise Exception(
                f"document {doc.filename} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")

    # move files
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        new_path = archive_path_new(doc)

        # Skip documents whose shared old archive file was already moved away
        # earlier in this loop; their archives are regenerated below.
        if (old_path != new_path and os.path.isfile(old_path)
                and not os.path.isfile(new_path)):
            logger.debug(
                f"Moving {old_path} to {new_path}"
            )
            shutil.move(old_path, new_path)

    # regenerate archive documents
    for doc_id in affected_document_ids:
        from documents.parsers import get_parser_class_for_mime_type, \
            DocumentParser, \
            ParseError

        doc = Document.objects.get(id=doc_id)
        logger.info(
            f"Regenerating archive document for {doc.filename}"
        )
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        parser: DocumentParser = parser_class(None, None)
        try:
            parser.parse(source_path(doc), doc.mime_type,
                         os.path.basename(doc.filename))
            doc.content = parser.get_text()
            if parser.archive_path and os.path.isfile(parser.archive_path):
                with open(parser.archive_path, "rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                shutil.copy2(parser.archive_path, archive_path_new(doc))
            else:
                doc.archive_checksum = None
                if os.path.isfile(archive_path_new(doc)):
                    os.unlink(archive_path_new(doc))
            doc.save()
        except ParseError:
            logger.exception(
                f"Unable to regenerate archive document for {doc.filename}"
            )
        finally:
            parser.cleanup()
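
# Only documents flagged as colliding are regenerated above; each freshly
# parsed archive overwrites whichever single file survived the move step.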


def move_new_to_old_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.")
        old_archive_paths.add(old_archive_path)
        if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists."
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
        shutil.move(new_archive_path, old_archive_path)
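
# The reverse migration refuses to run (ValueError above) whenever two
# documents would map back onto the same old-style archive name, since the
# second move would overwrite the first file.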


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1011_auto_20210101_2340'),
    ]

    operations = [
        migrations.RunPython(
            move_old_to_new_locations,
            move_new_to_old_locations
        )
    ]
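
# To apply or revert this data migration by hand, the standard Django commands
# work (run from the directory containing manage.py):
#   python3 manage.py migrate documents 1012_fix_archive_files
#   python3 manage.py migrate documents 1011_auto_20210101_2340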