From 4cb4bd13ad774616ff5ec5704723715be9e99966 Mon Sep 17 00:00:00 2001 From: Matthieu Helleboid Date: Sat, 14 Jan 2023 11:54:54 +0100 Subject: [PATCH] add split-manifest option to administration exporter --- docs/administration.md | 10 ++++++--- .../management/commands/document_exporter.py | 22 ++++++++++++++++++- .../management/commands/document_importer.py | 22 +++++++++++++++---- .../tests/test_management_exporter.py | 20 +++++++++++++++++ 4 files changed, 66 insertions(+), 8 deletions(-) diff --git a/docs/administration.md b/docs/administration.md index 7cbcf95b9..024e4ea11 100644 --- a/docs/administration.md +++ b/docs/administration.md @@ -227,9 +227,10 @@ is not a TTY" errors. For example: `docker-compose exec -T webserver document_exporter ../export` ``` -document_exporter target [-c] [-f] [-p] [-d] [-na] [-nt] +document_exporter target [-sm] [-c] [-f] [-p] [-d] [-na] [-nt] optional arguments: +-sm, --split-manifest -c, --compare-checksums -f, --use-filename-format -p, --use-filename-prefix @@ -243,6 +244,9 @@ optional arguments: documents, thumbnails and a `manifest.json` file. The manifest contains all metadata from the database (correspondents, tags, etc). +If `-sm` or `--split-manifest` is provided, information about each document +will be placed in individual JSON files. + When you use the provided docker compose script, specify `../export` as the target. This path inside the container is automatically mounted on your host on the folder `export`. @@ -279,8 +283,8 @@ paperless to use `PAPERLESS_FILENAME_FORMAT` for exported filenames instead, specify `--use-filename-format`. If `-p` or `--use-filename-format` is provided, Files will be exported -in dedicated folders according to their nature: `archive`, `originals` -or `thumbnails` +in dedicated folders according to their nature: `archive`, `originals`, +`thumbnails` or `json` !!! 
warning diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index b6070b93d..12828a7a0 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -53,6 +53,14 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument("target") + parser.add_argument( + "-sm", + "--split-manifest", + default=False, + action="store_true", + help="Export document information in individual manifest json files.", + ) + parser.add_argument( "-c", "--compare-checksums", @@ -125,6 +133,7 @@ class Command(BaseCommand): def __init__(self, *args, **kwargs): BaseCommand.__init__(self, *args, **kwargs) self.target: Path = None + self.split_manifest = None self.files_in_export_dir: Set[Path] = set() self.exported_files: List[Path] = [] self.compare_checksums = False @@ -137,6 +146,7 @@ class Command(BaseCommand): def handle(self, *args, **options): self.target = Path(options["target"]).resolve() + self.split_manifest = options["split_manifest"] self.compare_checksums = options["compare_checksums"] self.use_filename_format = options["use_filename_format"] self.use_filename_prefix = options["use_filename_prefix"] @@ -217,7 +227,8 @@ class Command(BaseCommand): documents = Document.objects.order_by("id") document_map = {d.pk: d for d in documents} document_manifest = json.loads(serializers.serialize("json", documents)) - manifest += document_manifest + if not self.split_manifest: + manifest += document_manifest manifest += json.loads( serializers.serialize("json", MailAccount.objects.all()), @@ -334,6 +345,15 @@ class Command(BaseCommand): archive_target, ) + if self.split_manifest: + manifest_name = base_name + "-manifest.json" + if self.use_filename_prefix: + manifest_name = os.path.join("json", manifest_name) + manifest_name = os.path.join(self.target, manifest_name) + os.makedirs(os.path.dirname(manifest_name), exist_ok=True) 
+ with open(manifest_name, "w") as f: + json.dump([document_manifest[index]], f, indent=2) + # 4.1 write manifest to target folder manifest_path = (self.target / Path("manifest.json")).resolve() manifest_path.write_text(json.dumps(manifest, indent=2)) diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index b62159d54..873a5841d 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -72,11 +72,24 @@ class Command(BaseCommand): if not os.access(self.source, os.R_OK): raise CommandError("That path doesn't appear to be readable") - manifest_path = os.path.normpath(os.path.join(self.source, "manifest.json")) - self._check_manifest_exists(manifest_path) + manifest_paths = [] - with open(manifest_path) as f: + main_manifest_path = os.path.normpath( + os.path.join(self.source, "manifest.json"), + ) + self._check_manifest_exists(main_manifest_path) + + with open(main_manifest_path) as f: self.manifest = json.load(f) + manifest_paths.append(main_manifest_path) + + for root, dirs, files in os.walk(self.source): + for file in files: + if file.endswith("-manifest.json"): + doc_manifest_path = os.path.normpath(os.path.join(root, file)) + with open(doc_manifest_path) as f: + self.manifest += json.load(f) + manifest_paths.append(doc_manifest_path) version_path = os.path.normpath(os.path.join(self.source, "version.json")) if os.path.exists(version_path): @@ -109,7 +122,8 @@ class Command(BaseCommand): ): # Fill up the database with whatever is in the manifest try: - call_command("loaddata", manifest_path) + for manifest_path in manifest_paths: + call_command("loaddata", manifest_path) except (FieldDoesNotExist, DeserializationError) as e: self.stdout.write(self.style.ERROR("Database import failed")) if ( diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index 
8bf8def39..9f75772a6 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -104,6 +104,7 @@ class TestExportImport(DirectoriesMixin, TestCase): delete=False, no_archive=False, no_thumbnail=False, + split_manifest=False, ): args = ["document_exporter", self.target] if use_filename_format: @@ -116,6 +117,8 @@ class TestExportImport(DirectoriesMixin, TestCase): args += ["--no-archive"] if no_thumbnail: args += ["--no-thumbnail"] + if split_manifest: + args += ["--split-manifest"] call_command(*args) @@ -563,3 +566,20 @@ class TestExportImport(DirectoriesMixin, TestCase): with paperless_environment() as dirs: call_command("document_importer", self.target) self.assertEqual(Document.objects.count(), 4) + + def test_split_manifest(self): + shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) + shutil.copytree( + os.path.join(os.path.dirname(__file__), "samples", "documents"), + os.path.join(self.dirs.media_dir, "documents"), + ) + + manifest = self._do_export(split_manifest=True) + has_document = False + for element in manifest: + has_document = has_document or element["model"] == "documents.document" + self.assertFalse(has_document) + + with paperless_environment() as dirs: + call_command("document_importer", self.target) + self.assertEqual(Document.objects.count(), 4)