Feature: Allow encrypting sensitive fields in export (#6927)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2026-01-24 22:39:02 -06:00 · 2024-06-09 07:41:18 -07:00
parent 6ddb62bf3f
commit d9002005b1
8 changed files with 583 additions and 120 deletions
--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@@ -3,6 +3,7 @@ import logging
 import os
 from contextlib import contextmanager
 from pathlib import Path
+from typing import Optional

 import tqdm
 from django.conf import settings
@@ -21,6 +22,7 @@ from django.db.models.signals import post_save
 from filelock import FileLock

 from documents.file_handling import create_source_path_directory
+from documents.management.commands.mixins import CryptMixin
 from documents.models import Correspondent
 from documents.models import CustomField
 from documents.models import CustomFieldInstance
@@ -30,6 +32,7 @@ from documents.models import Note
 from documents.models import Tag
 from documents.parsers import run_convert
 from documents.settings import EXPORTER_ARCHIVE_NAME
+from documents.settings import EXPORTER_CRYPTO_SETTINGS_NAME
 from documents.settings import EXPORTER_FILE_NAME
 from documents.settings import EXPORTER_THUMBNAIL_NAME
 from documents.signals.handlers import update_filename_and_move_files
@@ -49,7 +52,7 @@ def disable_signal(sig, receiver, sender):
        sig.connect(receiver=receiver, sender=sender)


-class Command(BaseCommand):
+class Command(CryptMixin, BaseCommand):
    help = (
        "Using a manifest.json file, load the data from there, and import the "
        "documents it refers to."
@@ -72,92 +75,173 @@ class Command(BaseCommand):
            help="If set, only the database will be exported, not files",
        )

+        parser.add_argument(
+            "--passphrase",
+            help="If provided, is used to sensitive fields in the export",
+        )
+
    def pre_check(self) -> None:
        """
-        Runs some initial checks against the source directory, including looking for
-        common mistakes like having files still and users other than expected
+        Runs some initial checks against the state of the install and source, including:
+        - Does the target exist?
+        - Can we access the target?
+        - Does the target have a manifest file?
+        - Are there existing files in the document folders?
+        - Are there existing users or documents in the database?
        """

+        def pre_check_maybe_not_empty():
+            # Skip this check if operating only on the database
+            # We can expect data to exist in that case
+            if not self.data_only:
+                for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
+                    if document_dir.exists() and document_dir.is_dir():
+                        for entry in document_dir.glob("**/*"):
+                            if entry.is_dir():
+                                continue
+                            self.stdout.write(
+                                self.style.WARNING(
+                                    f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation",
+                                ),
+                            )
+                            break
+            # But existing users or other data still matters in a data only
+            if (
+                User.objects.exclude(username__in=["consumer", "AnonymousUser"]).count()
+                != 0
+            ):
+                self.stdout.write(
+                    self.style.WARNING(
+                        "Found existing user(s), this might indicate a non-empty installation",
+                    ),
+                )
+            if Document.objects.count() != 0:
+                self.stdout.write(
+                    self.style.WARNING(
+                        "Found existing documents(s), this might indicate a non-empty installation",
+                    ),
+                )
+
+        def pre_check_manifest_exists():
+            if not (self.source / "manifest.json").exists():
+                raise CommandError(
+                    "That directory doesn't appear to contain a manifest.json file.",
+                )
+
        if not self.source.exists():
            raise CommandError("That path doesn't exist")

        if not os.access(self.source, os.R_OK):
            raise CommandError("That path doesn't appear to be readable")

-        # Skip this check if operating only on the database
-        # We can expect data to exist in that case
-        if not self.data_only:
-            for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
-                if document_dir.exists() and document_dir.is_dir():
-                    for entry in document_dir.glob("**/*"):
-                        if entry.is_dir():
-                            continue
-                        self.stdout.write(
-                            self.style.WARNING(
-                                f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation",
-                            ),
-                        )
-                        break
-        if (
-            User.objects.exclude(username__in=["consumer", "AnonymousUser"]).count()
-            != 0
-        ):
+        pre_check_maybe_not_empty()
+        pre_check_manifest_exists()
+
+    def load_manifest_files(self) -> None:
+        """
+        Loads manifest data from the various JSON files for parsing and loading the database
+        """
+        main_manifest_path = self.source / "manifest.json"
+
+        with main_manifest_path.open() as infile:
+            self.manifest = json.load(infile)
+        self.manifest_paths.append(main_manifest_path)
+
+        for file in Path(self.source).glob("**/*-manifest.json"):
+            with file.open() as infile:
+                self.manifest += json.load(infile)
+            self.manifest_paths.append(file)
+
+    def load_metadata(self) -> None:
+        """
+        Loads either just the version information or the version information and extra data
+
+        Must account for the old style of export as well, with just version.json
+        """
+        version_path = self.source / "version.json"
+        metadata_path = self.source / "metadata.json"
+        if not version_path.exists() and not metadata_path.exists():
+            self.stdout.write(
+                self.style.NOTICE("No version.json or metadata.json file located"),
+            )
+            return
+
+        if version_path.exists():
+            with version_path.open() as infile:
+                self.version = json.load(infile)["version"]
+        elif metadata_path.exists():
+            with metadata_path.open() as infile:
+                data = json.load(infile)
+                self.version = data["version"]
+                if not self.passphrase and EXPORTER_CRYPTO_SETTINGS_NAME in data:
+                    raise CommandError(
+                        "No passphrase was given, but this export contains encrypted fields",
+                    )
+                elif EXPORTER_CRYPTO_SETTINGS_NAME in data:
+                    self.load_crypt_params(data)
+
+        if self.version and self.version != version.__full_version_str__:
            self.stdout.write(
                self.style.WARNING(
-                    "Found existing user(s), this might indicate a non-empty installation",
-                ),
-            )
-        if Document.objects.count() != 0:
-            self.stdout.write(
-                self.style.WARNING(
-                    "Found existing documents(s), this might indicate a non-empty installation",
+                    "Version mismatch: "
+                    f"Currently {version.__full_version_str__},"
+                    f" importing {self.version}."
+                    " Continuing, but import may fail.",
                ),
            )

+    def load_data_to_database(self) -> None:
+        """
+        As the name implies, loads data from the JSON file(s) into the database
+        """
+        try:
+            with transaction.atomic():
+                # delete these since pk can change, re-created from import
+                ContentType.objects.all().delete()
+                Permission.objects.all().delete()
+                for manifest_path in self.manifest_paths:
+                    call_command("loaddata", manifest_path)
+        except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
+            self.stdout.write(self.style.ERROR("Database import failed"))
+            if (
+                self.version is not None
+                and self.version != version.__full_version_str__
+            ):  # pragma: no cover
+                self.stdout.write(
+                    self.style.ERROR(
+                        "Version mismatch: "
+                        f"Currently {version.__full_version_str__},"
+                        f" importing {self.version}",
+                    ),
+                )
+                raise e
+            else:
+                self.stdout.write(
+                    self.style.ERROR("No version information present"),
+                )
+                raise e
+
    def handle(self, *args, **options):
        logging.getLogger().handlers[0].level = logging.ERROR

        self.source = Path(options["source"]).resolve()
        self.data_only: bool = options["data_only"]
        self.no_progress_bar: bool = options["no_progress_bar"]
+        self.passphrase: str | None = options.get("passphrase")
+        self.version: Optional[str] = None
+        self.salt: Optional[str] = None
+        self.manifest_paths = []
+        self.manifest = []

        self.pre_check()

-        manifest_paths = []
+        self.load_metadata()

-        main_manifest_path = self.source / "manifest.json"
+        self.load_manifest_files()

-        self._check_manifest_exists(main_manifest_path)
+        self.check_manifest_validity()

-        with main_manifest_path.open() as infile:
-            self.manifest = json.load(infile)
-        manifest_paths.append(main_manifest_path)
-
-        for file in Path(self.source).glob("**/*-manifest.json"):
-            with file.open() as infile:
-                self.manifest += json.load(infile)
-            manifest_paths.append(file)
-
-        version_path = self.source / "version.json"
-        if version_path.exists():
-            with version_path.open() as infile:
-                self.version = json.load(infile)["version"]
-            # Provide an initial warning if needed to the user
-            if self.version != version.__full_version_str__:
-                self.stdout.write(
-                    self.style.WARNING(
-                        "Version mismatch: "
-                        f"Currently {version.__full_version_str__},"
-                        f" importing {self.version}."
-                        " Continuing, but import may fail.",
-                    ),
-                )
-
-        else:
-            self.stdout.write(self.style.NOTICE("No version.json file located"))
-
-        if not self.data_only:
-            self._check_manifest_files_valid()
+        self.decrypt_secret_fields()

        with (
            disable_signal(
@@ -181,32 +265,7 @@ class Command(BaseCommand):
                auditlog.unregister(CustomFieldInstance)

            # Fill up the database with whatever is in the manifest
-            try:
-                with transaction.atomic():
-                    # delete these since pk can change, re-created from import
-                    ContentType.objects.all().delete()
-                    Permission.objects.all().delete()
-                    for manifest_path in manifest_paths:
-                        call_command("loaddata", manifest_path)
-            except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
-                self.stdout.write(self.style.ERROR("Database import failed"))
-                if (
-                    self.version is not None
-                    and self.version != version.__full_version_str__
-                ):
-                    self.stdout.write(
-                        self.style.ERROR(
-                            "Version mismatch: "
-                            f"Currently {version.__full_version_str__},"
-                            f" importing {self.version}",
-                        ),
-                    )
-                    raise e
-                else:
-                    self.stdout.write(
-                        self.style.ERROR("No version information present"),
-                    )
-                    raise e
+            self.load_data_to_database()

            if not self.data_only:
                self._import_files_from_manifest()
@@ -220,30 +279,20 @@ class Command(BaseCommand):
            no_progress_bar=self.no_progress_bar,
        )

-    @staticmethod
-    def _check_manifest_exists(path: Path):
-        if not path.exists():
-            raise CommandError(
-                "That directory doesn't appear to contain a manifest.json file.",
-            )
-
-    def _check_manifest_files_valid(self):
+    def check_manifest_validity(self):
        """
        Attempts to verify the manifest is valid.  Namely checking the files
        referred to exist and the files can be read from
        """
-        self.stdout.write("Checking the manifest")
-        for record in self.manifest:
-            if record["model"] != "documents.document":
-                continue

-            if EXPORTER_FILE_NAME not in record:
+        def check_document_validity(document_record: dict):
+            if EXPORTER_FILE_NAME not in document_record:
                raise CommandError(
                    "The manifest file contains a record which does not "
                    "refer to an actual document file.",
                )

-            doc_file = record[EXPORTER_FILE_NAME]
+            doc_file = document_record[EXPORTER_FILE_NAME]
            doc_path: Path = self.source / doc_file
            if not doc_path.exists():
                raise CommandError(
@@ -258,8 +307,8 @@ class Command(BaseCommand):
                    f"Failed to read from original file {doc_path}",
                ) from e

-            if EXPORTER_ARCHIVE_NAME in record:
-                archive_file = record[EXPORTER_ARCHIVE_NAME]
+            if EXPORTER_ARCHIVE_NAME in document_record:
+                archive_file = document_record[EXPORTER_ARCHIVE_NAME]
                doc_archive_path: Path = self.source / archive_file
                if not doc_archive_path.exists():
                    raise CommandError(
@@ -274,6 +323,13 @@ class Command(BaseCommand):
                        f"Failed to read from archive file {doc_archive_path}",
                    ) from e

+        self.stdout.write("Checking the manifest")
+        for record in self.manifest:
+            # Only check if the document files exist if this is not data only
+            # We don't care about documents for a data only import
+            if not self.data_only and record["model"] == "documents.document":
+                check_document_validity(record)
+
    def _import_files_from_manifest(self):
        settings.ORIGINALS_DIR.mkdir(parents=True, exist_ok=True)
        settings.THUMBNAIL_DIR.mkdir(parents=True, exist_ok=True)
@@ -339,3 +395,33 @@ class Command(BaseCommand):
                    copy_file_with_basic_stats(archive_path, document.archive_path)

            document.save()
+
+    def decrypt_secret_fields(self) -> None:
+        """
+        The converse decryption of some fields out of the export before importing to database
+        """
+        if self.passphrase:
+            # Salt has been loaded from metadata.json at this point, so it cannot be None
+            self.setup_crypto(passphrase=self.passphrase, salt=self.salt)
+
+            had_at_least_one_record = False
+
+            for crypt_config in self.CRYPT_FIELDS:
+                importer_model = crypt_config["model_name"]
+                crypt_fields = crypt_config["fields"]
+                for record in filter(
+                    lambda x: x["model"] == importer_model,
+                    self.manifest,
+                ):
+                    had_at_least_one_record = True
+                    for field in crypt_fields:
+                        record["fields"][field] = self.decrypt_string(
+                            value=record["fields"][field],
+                        )
+
+            if had_at_least_one_record:
+                # It's annoying, but the DB is loaded from the JSON directly
+                # Maybe could change that in the future?
+                (self.source / "manifest.json").write_text(
+                    json.dumps(self.manifest, indent=2, ensure_ascii=False),
+                )