mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
427 lines
16 KiB
Python
427 lines
16 KiB
Python
import json
|
|
import logging
|
|
import os
|
|
from contextlib import contextmanager
|
|
from pathlib import Path
|
|
|
|
import tqdm
|
|
from django.conf import settings
|
|
from django.contrib.auth.models import Permission
|
|
from django.contrib.auth.models import User
|
|
from django.contrib.contenttypes.models import ContentType
|
|
from django.core.exceptions import FieldDoesNotExist
|
|
from django.core.management import call_command
|
|
from django.core.management.base import BaseCommand
|
|
from django.core.management.base import CommandError
|
|
from django.core.serializers.base import DeserializationError
|
|
from django.db import IntegrityError
|
|
from django.db import transaction
|
|
from django.db.models.signals import m2m_changed
|
|
from django.db.models.signals import post_save
|
|
from filelock import FileLock
|
|
|
|
from documents.file_handling import create_source_path_directory
|
|
from documents.management.commands.mixins import CryptMixin
|
|
from documents.models import Correspondent
|
|
from documents.models import CustomField
|
|
from documents.models import CustomFieldInstance
|
|
from documents.models import Document
|
|
from documents.models import DocumentType
|
|
from documents.models import Note
|
|
from documents.models import Tag
|
|
from documents.parsers import run_convert
|
|
from documents.settings import EXPORTER_ARCHIVE_NAME
|
|
from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME
|
|
from documents.settings import EXPORTER_FILE_NAME
|
|
from documents.settings import EXPORTER_THUMBNAIL_NAME
|
|
from documents.signals.handlers import update_filename_and_move_files
|
|
from documents.utils import copy_file_with_basic_stats
|
|
from paperless import version
|
|
|
|
if settings.AUDIT_LOG_ENABLED:
|
|
from auditlog.registry import auditlog
|
|
|
|
|
|
@contextmanager
|
|
def disable_signal(sig, receiver, sender):
|
|
try:
|
|
sig.disconnect(receiver=receiver, sender=sender)
|
|
yield
|
|
finally:
|
|
sig.connect(receiver=receiver, sender=sender)
|
|
|
|
|
|
class Command(CryptMixin, BaseCommand):
|
|
help = (
|
|
"Using a manifest.json file, load the data from there, and import the "
|
|
"documents it refers to."
|
|
)
|
|
|
|
def add_arguments(self, parser):
|
|
parser.add_argument("source")
|
|
|
|
parser.add_argument(
|
|
"--no-progress-bar",
|
|
default=False,
|
|
action="store_true",
|
|
help="If set, the progress bar will not be shown",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--data-only",
|
|
default=False,
|
|
action="store_true",
|
|
help="If set, only the database will be exported, not files",
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--passphrase",
|
|
help="If provided, is used to sensitive fields in the export",
|
|
)
|
|
|
|
def pre_check(self) -> None:
|
|
"""
|
|
Runs some initial checks against the state of the install and source, including:
|
|
- Does the target exist?
|
|
- Can we access the target?
|
|
- Does the target have a manifest file?
|
|
- Are there existing files in the document folders?
|
|
- Are there existing users or documents in the database?
|
|
"""
|
|
|
|
def pre_check_maybe_not_empty():
|
|
# Skip this check if operating only on the database
|
|
# We can expect data to exist in that case
|
|
if not self.data_only:
|
|
for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
|
|
if document_dir.exists() and document_dir.is_dir():
|
|
for entry in document_dir.glob("**/*"):
|
|
if entry.is_dir():
|
|
continue
|
|
self.stdout.write(
|
|
self.style.WARNING(
|
|
f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation",
|
|
),
|
|
)
|
|
break
|
|
# But existing users or other data still matters in a data only
|
|
if (
|
|
User.objects.exclude(username__in=["consumer", "AnonymousUser"]).count()
|
|
!= 0
|
|
):
|
|
self.stdout.write(
|
|
self.style.WARNING(
|
|
"Found existing user(s), this might indicate a non-empty installation",
|
|
),
|
|
)
|
|
if Document.objects.count() != 0:
|
|
self.stdout.write(
|
|
self.style.WARNING(
|
|
"Found existing documents(s), this might indicate a non-empty installation",
|
|
),
|
|
)
|
|
|
|
def pre_check_manifest_exists():
|
|
if not (self.source / "manifest.json").exists():
|
|
raise CommandError(
|
|
"That directory doesn't appear to contain a manifest.json file.",
|
|
)
|
|
|
|
if not self.source.exists():
|
|
raise CommandError("That path doesn't exist")
|
|
|
|
if not os.access(self.source, os.R_OK):
|
|
raise CommandError("That path doesn't appear to be readable")
|
|
|
|
pre_check_maybe_not_empty()
|
|
pre_check_manifest_exists()
|
|
|
|
def load_manifest_files(self) -> None:
|
|
"""
|
|
Loads manifest data from the various JSON files for parsing and loading the database
|
|
"""
|
|
main_manifest_path = self.source / "manifest.json"
|
|
|
|
with main_manifest_path.open() as infile:
|
|
self.manifest = json.load(infile)
|
|
self.manifest_paths.append(main_manifest_path)
|
|
|
|
for file in Path(self.source).glob("**/*-manifest.json"):
|
|
with file.open() as infile:
|
|
self.manifest += json.load(infile)
|
|
self.manifest_paths.append(file)
|
|
|
|
def load_metadata(self) -> None:
|
|
"""
|
|
Loads either just the version information or the version information and extra data
|
|
|
|
Must account for the old style of export as well, with just version.json
|
|
"""
|
|
version_path = self.source / "version.json"
|
|
metadata_path = self.source / "metadata.json"
|
|
if not version_path.exists() and not metadata_path.exists():
|
|
self.stdout.write(
|
|
self.style.NOTICE("No version.json or metadata.json file located"),
|
|
)
|
|
return
|
|
|
|
if metadata_path.exists():
|
|
with metadata_path.open() as infile:
|
|
data = json.load(infile)
|
|
self.version = data["version"]
|
|
if not self.passphrase and EXPORTER_CRYPTO_SETTINGS_NAME in data:
|
|
raise CommandError(
|
|
"No passphrase was given, but this export contains encrypted fields",
|
|
)
|
|
elif EXPORTER_CRYPTO_SETTINGS_NAME in data:
|
|
self.load_crypt_params(data)
|
|
elif version_path.exists():
|
|
with version_path.open() as infile:
|
|
self.version = json.load(infile)["version"]
|
|
|
|
if self.version and self.version != version.__full_version_str__:
|
|
self.stdout.write(
|
|
self.style.WARNING(
|
|
"Version mismatch: "
|
|
f"Currently {version.__full_version_str__},"
|
|
f" importing {self.version}."
|
|
" Continuing, but import may fail.",
|
|
),
|
|
)
|
|
|
|
def load_data_to_database(self) -> None:
|
|
"""
|
|
As the name implies, loads data from the JSON file(s) into the database
|
|
"""
|
|
try:
|
|
with transaction.atomic():
|
|
# delete these since pk can change, re-created from import
|
|
ContentType.objects.all().delete()
|
|
Permission.objects.all().delete()
|
|
for manifest_path in self.manifest_paths:
|
|
call_command("loaddata", manifest_path)
|
|
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
|
|
self.stdout.write(self.style.ERROR("Database import failed"))
|
|
if (
|
|
self.version is not None
|
|
and self.version != version.__full_version_str__
|
|
): # pragma: no cover
|
|
self.stdout.write(
|
|
self.style.ERROR(
|
|
"Version mismatch: "
|
|
f"Currently {version.__full_version_str__},"
|
|
f" importing {self.version}",
|
|
),
|
|
)
|
|
raise e
|
|
else:
|
|
self.stdout.write(
|
|
self.style.ERROR("No version information present"),
|
|
)
|
|
raise e
|
|
|
|
def handle(self, *args, **options):
|
|
logging.getLogger().handlers[0].level = logging.ERROR
|
|
|
|
self.source = Path(options["source"]).resolve()
|
|
self.data_only: bool = options["data_only"]
|
|
self.no_progress_bar: bool = options["no_progress_bar"]
|
|
self.passphrase: str | None = options.get("passphrase")
|
|
self.version: str | None = None
|
|
self.salt: str | None = None
|
|
self.manifest_paths = []
|
|
self.manifest = []
|
|
|
|
self.pre_check()
|
|
|
|
self.load_metadata()
|
|
|
|
self.load_manifest_files()
|
|
|
|
self.check_manifest_validity()
|
|
|
|
self.decrypt_secret_fields()
|
|
|
|
with (
|
|
disable_signal(
|
|
post_save,
|
|
receiver=update_filename_and_move_files,
|
|
sender=Document,
|
|
),
|
|
disable_signal(
|
|
m2m_changed,
|
|
receiver=update_filename_and_move_files,
|
|
sender=Document.tags.through,
|
|
),
|
|
):
|
|
if settings.AUDIT_LOG_ENABLED:
|
|
auditlog.unregister(Document)
|
|
auditlog.unregister(Correspondent)
|
|
auditlog.unregister(Tag)
|
|
auditlog.unregister(DocumentType)
|
|
auditlog.unregister(Note)
|
|
auditlog.unregister(CustomField)
|
|
auditlog.unregister(CustomFieldInstance)
|
|
|
|
# Fill up the database with whatever is in the manifest
|
|
self.load_data_to_database()
|
|
|
|
if not self.data_only:
|
|
self._import_files_from_manifest()
|
|
else:
|
|
self.stdout.write(self.style.NOTICE("Data only import completed"))
|
|
|
|
self.stdout.write("Updating search index...")
|
|
call_command(
|
|
"document_index",
|
|
"reindex",
|
|
no_progress_bar=self.no_progress_bar,
|
|
)
|
|
|
|
def check_manifest_validity(self):
|
|
"""
|
|
Attempts to verify the manifest is valid. Namely checking the files
|
|
referred to exist and the files can be read from
|
|
"""
|
|
|
|
def check_document_validity(document_record: dict):
|
|
if EXPORTER_FILE_NAME not in document_record:
|
|
raise CommandError(
|
|
"The manifest file contains a record which does not "
|
|
"refer to an actual document file.",
|
|
)
|
|
|
|
doc_file = document_record[EXPORTER_FILE_NAME]
|
|
doc_path: Path = self.source / doc_file
|
|
if not doc_path.exists():
|
|
raise CommandError(
|
|
f'The manifest file refers to "{doc_file}" which does not '
|
|
"appear to be in the source directory.",
|
|
)
|
|
try:
|
|
with doc_path.open(mode="rb"):
|
|
pass
|
|
except Exception as e:
|
|
raise CommandError(
|
|
f"Failed to read from original file {doc_path}",
|
|
) from e
|
|
|
|
if EXPORTER_ARCHIVE_NAME in document_record:
|
|
archive_file = document_record[EXPORTER_ARCHIVE_NAME]
|
|
doc_archive_path: Path = self.source / archive_file
|
|
if not doc_archive_path.exists():
|
|
raise CommandError(
|
|
f"The manifest file refers to {archive_file} which "
|
|
f"does not appear to be in the source directory.",
|
|
)
|
|
try:
|
|
with doc_archive_path.open(mode="rb"):
|
|
pass
|
|
except Exception as e:
|
|
raise CommandError(
|
|
f"Failed to read from archive file {doc_archive_path}",
|
|
) from e
|
|
|
|
self.stdout.write("Checking the manifest")
|
|
for record in self.manifest:
|
|
# Only check if the document files exist if this is not data only
|
|
# We don't care about documents for a data only import
|
|
if not self.data_only and record["model"] == "documents.document":
|
|
check_document_validity(record)
|
|
|
|
def _import_files_from_manifest(self):
|
|
settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
|
|
settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
|
|
settings.ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
self.stdout.write("Copy files into paperless...")
|
|
|
|
manifest_documents = list(
|
|
filter(lambda r: r["model"] == "documents.document", self.manifest),
|
|
)
|
|
|
|
for record in tqdm.tqdm(manifest_documents, disable=self.no_progress_bar):
|
|
document = Document.objects.get(pk=record["pk"])
|
|
|
|
doc_file = record[EXPORTER_FILE_NAME]
|
|
document_path = os.path.join(self.source, doc_file)
|
|
|
|
if EXPORTER_THUMBNAIL_NAME in record:
|
|
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
|
|
thumbnail_path = Path(os.path.join(self.source, thumb_file)).resolve()
|
|
else:
|
|
thumbnail_path = None
|
|
|
|
if EXPORTER_ARCHIVE_NAME in record:
|
|
archive_file = record[EXPORTER_ARCHIVE_NAME]
|
|
archive_path = os.path.join(self.source, archive_file)
|
|
else:
|
|
archive_path = None
|
|
|
|
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
|
|
|
|
with FileLock(settings.MEDIA_LOCK):
|
|
if os.path.isfile(document.source_path):
|
|
raise FileExistsError(document.source_path)
|
|
|
|
create_source_path_directory(document.source_path)
|
|
|
|
copy_file_with_basic_stats(document_path, document.source_path)
|
|
|
|
if thumbnail_path:
|
|
if thumbnail_path.suffix in {".png", ".PNG"}:
|
|
run_convert(
|
|
density=300,
|
|
scale="500x5000>",
|
|
alpha="remove",
|
|
strip=True,
|
|
trim=False,
|
|
auto_orient=True,
|
|
input_file=f"{thumbnail_path}[0]",
|
|
output_file=str(document.thumbnail_path),
|
|
)
|
|
else:
|
|
copy_file_with_basic_stats(
|
|
thumbnail_path,
|
|
document.thumbnail_path,
|
|
)
|
|
|
|
if archive_path:
|
|
create_source_path_directory(document.archive_path)
|
|
# TODO: this assumes that the export is valid and
|
|
# archive_filename is present on all documents with
|
|
# archived files
|
|
copy_file_with_basic_stats(archive_path, document.archive_path)
|
|
|
|
document.save()
|
|
|
|
def decrypt_secret_fields(self) -> None:
|
|
"""
|
|
The converse decryption of some fields out of the export before importing to database
|
|
"""
|
|
if self.passphrase:
|
|
# Salt has been loaded from metadata.json at this point, so it cannot be None
|
|
self.setup_crypto(passphrase=self.passphrase, salt=self.salt)
|
|
|
|
had_at_least_one_record = False
|
|
|
|
for crypt_config in self.CRYPT_FIELDS:
|
|
importer_model = crypt_config["model_name"]
|
|
crypt_fields = crypt_config["fields"]
|
|
for record in filter(
|
|
lambda x: x["model"] == importer_model,
|
|
self.manifest,
|
|
):
|
|
had_at_least_one_record = True
|
|
for field in crypt_fields:
|
|
record["fields"][field] = self.decrypt_string(
|
|
value=record["fields"][field],
|
|
)
|
|
|
|
if had_at_least_one_record:
|
|
# It's annoying, but the DB is loaded from the JSON directly
|
|
# Maybe could change that in the future?
|
|
(self.source / "manifest.json").write_text(
|
|
json.dumps(self.manifest, indent=2, ensure_ascii=False),
|
|
)
|