Enhancement: Add --compare-json option to document_exporter to write json files only if changed (#8261)

This commit is contained in:
Kevin Doren 2024-11-19 07:20:24 -08:00 committed by GitHub
parent f0e71330ac
commit 827121808a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 63 additions and 20 deletions

View File

@ -241,6 +241,7 @@ document_exporter target [-c] [-d] [-f] [-na] [-nt] [-p] [-sm] [-z]
optional arguments:
-c, --compare-checksums
-cj, --compare-json
-d, --delete
-f, --use-filename-format
-na, --no-archive
@ -269,7 +270,8 @@ only export changed and added files. Paperless determines whether a file
has changed by inspecting the file attributes "date/time modified" and
"size". If that does not work out for you, specify `-c` or
`--compare-checksums` and paperless will attempt to compare file
checksums instead. This is slower.
checksums instead. This is slower. The manifest and metadata json files
are always updated, unless `-cj` or `--compare-json` is specified.
Paperless will not remove any existing files in the export directory. If
you want paperless to also remove files that do not belong to the

View File

@ -82,6 +82,18 @@ class Command(CryptMixin, BaseCommand):
),
)
parser.add_argument(
"-cj",
"--compare-json",
default=False,
action="store_true",
help=(
"Compare json file checksums when determining whether to "
"export a json file or not (manifest or metadata). "
"If not specified, the file is always exported."
),
)
parser.add_argument(
"-d",
"--delete",
@ -178,6 +190,7 @@ class Command(CryptMixin, BaseCommand):
self.target = Path(options["target"]).resolve()
self.split_manifest: bool = options["split_manifest"]
self.compare_checksums: bool = options["compare_checksums"]
self.compare_json: bool = options["compare_json"]
self.use_filename_format: bool = options["use_filename_format"]
self.use_folder_prefix: bool = options["use_folder_prefix"]
self.delete: bool = options["delete"]
@ -343,12 +356,11 @@ class Command(CryptMixin, BaseCommand):
manifest_dict["custom_field_instances"],
),
)
manifest_name.write_text(
json.dumps(content, indent=2, ensure_ascii=False),
encoding="utf-8",
self.check_and_write_json(
content,
manifest_name,
)
if manifest_name in self.files_in_export_dir:
self.files_in_export_dir.remove(manifest_name)
# These were exported already
if self.split_manifest:
@ -361,12 +373,10 @@ class Command(CryptMixin, BaseCommand):
for key in manifest_dict:
manifest.extend(manifest_dict[key])
manifest_path = (self.target / "manifest.json").resolve()
manifest_path.write_text(
json.dumps(manifest, indent=2, ensure_ascii=False),
encoding="utf-8",
self.check_and_write_json(
manifest,
manifest_path,
)
if manifest_path in self.files_in_export_dir:
self.files_in_export_dir.remove(manifest_path)
# 4.2 write version information to target folder
extra_metadata_path = (self.target / "metadata.json").resolve()
@ -378,16 +388,11 @@ class Command(CryptMixin, BaseCommand):
# Django stores most of these in the field itself, we store them once here
if self.passphrase:
metadata.update(self.get_crypt_params())
extra_metadata_path.write_text(
json.dumps(
metadata,
indent=2,
ensure_ascii=False,
),
encoding="utf-8",
self.check_and_write_json(
metadata,
extra_metadata_path,
)
if extra_metadata_path in self.files_in_export_dir:
self.files_in_export_dir.remove(extra_metadata_path)
if self.delete:
# 5. Remove files which we did not explicitly export in this run
@ -516,6 +521,35 @@ class Command(CryptMixin, BaseCommand):
archive_target,
)
def check_and_write_json(
self,
content: list[dict] | dict,
target: Path,
):
"""
Writes the source content to the target json file.
If --compare-json arg was used, don't write to target file if
the file exists and checksum is identical to content checksum.
This preserves the file timestamps when no changes are made.
"""
target = target.resolve()
perform_write = True
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)
if self.compare_json:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
src_str = json.dumps(content, indent=2, ensure_ascii=False)
src_checksum = hashlib.md5(src_str.encode("utf-8")).hexdigest()
if src_checksum == target_checksum:
perform_write = False
if perform_write:
target.write_text(
json.dumps(content, indent=2, ensure_ascii=False),
encoding="utf-8",
)
def check_and_copy(
self,
source: Path,

View File

@ -153,6 +153,7 @@ class TestExportImport(
*,
use_filename_format=False,
compare_checksums=False,
compare_json=False,
delete=False,
no_archive=False,
no_thumbnail=False,
@ -165,6 +166,8 @@ class TestExportImport(
args += ["--use-filename-format"]
if compare_checksums:
args += ["--compare-checksums"]
if compare_json:
args += ["--compare-json"]
if delete:
args += ["--delete"]
if no_archive:
@ -340,6 +343,10 @@ class TestExportImport(
self.assertNotEqual(st_mtime_1, st_mtime_2)
self.assertNotEqual(st_mtime_2, st_mtime_3)
self._do_export(compare_json=True)
st_mtime_4 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime
self.assertEqual(st_mtime_3, st_mtime_4)
def test_update_export_changed_checksum(self):
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(