diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index a7a17f124..e2313e86a 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -1,15 +1,21 @@ +import hashlib import json import os import shutil import time +import tqdm +from django.conf import settings from django.core import serializers from django.core.management.base import BaseCommand, CommandError +from django.db import transaction +from filelock import FileLock from documents.models import Document, Correspondent, Tag, DocumentType from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \ EXPORTER_ARCHIVE_NAME from paperless.db import GnuPG +from ...file_handling import generate_filename, delete_empty_directories from ...mixins import Renderable @@ -24,13 +30,36 @@ class Command(Renderable, BaseCommand): def add_arguments(self, parser): parser.add_argument("target") + parser.add_argument( + "--compare-checksums", + default=False, + action="store_true", + help="Compare file checksums when determining whether to export " + "a file or not. If not specified, file size and time " + "modified is used instead." + ) + + parser.add_argument( + "--use-filename-format", + default=False, + action="store_true", + help="Use PAPERLESS_FILENAME_FORMAT for storing files in the " + "export directory, if configured." 
+ ) + def __init__(self, *args, **kwargs): BaseCommand.__init__(self, *args, **kwargs) self.target = None + self.files_in_export_dir = [] + self.exported_files = [] + self.compare_checksums = False + self.use_filename_format = False def handle(self, *args, **options): self.target = options["target"] + self.compare_checksums = options['compare_checksums'] + self.use_filename_format = options['use_filename_format'] if not os.path.exists(self.target): raise CommandError("That path doesn't exist") @@ -38,52 +67,75 @@ class Command(Renderable, BaseCommand): if not os.access(self.target, os.W_OK): raise CommandError("That path doesn't appear to be writable") - if os.listdir(self.target): - raise CommandError("That directory is not empty.") - - self.dump() + with FileLock(settings.MEDIA_LOCK): + self.dump() def dump(self): + # 1. Take a snapshot of what files exist in the current export folder + for root, dirs, files in os.walk(self.target): + self.files_in_export_dir.extend( + map(lambda f: os.path.abspath(os.path.join(root, f)), files) + ) - documents = Document.objects.all() - document_map = {d.pk: d for d in documents} - manifest = json.loads(serializers.serialize("json", documents)) + # 2. Create manifest, containing all correspondents, types, tags and + # documents + with transaction.atomic(): + manifest = json.loads( + serializers.serialize("json", Correspondent.objects.all())) - for index, document_dict in enumerate(manifest): + manifest += json.loads(serializers.serialize( + "json", Tag.objects.all())) - # Force output to unencrypted as that will be the current state. - # The importer will make the decision to encrypt or not. 
- manifest[index]["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501 + manifest += json.loads(serializers.serialize( + "json", DocumentType.objects.all())) + + documents = Document.objects.order_by("id") + document_map = {d.pk: d for d in documents} + document_manifest = json.loads( + serializers.serialize("json", documents)) + manifest += document_manifest + + # 3. Export files from each document + for index, document_dict in tqdm.tqdm(enumerate(document_manifest), + total=len(document_manifest)): + # 3.1. store files unencrypted + document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501 document = document_map[document_dict["pk"]] - print(f"Exporting: {document}") - + # 3.2. generate a unique filename filename_counter = 0 while True: - original_name = document.get_public_filename( - counter=filename_counter) - original_target = os.path.join(self.target, original_name) + if self.use_filename_format: + base_name = generate_filename( + document, counter=filename_counter) + else: + base_name = document.get_public_filename( + counter=filename_counter) - if not os.path.exists(original_target): + if base_name not in self.exported_files: + self.exported_files.append(base_name) break else: filename_counter += 1 - thumbnail_name = original_name + "-thumbnail.png" - thumbnail_target = os.path.join(self.target, thumbnail_name) - + # 3.3. 
write filenames into manifest + original_name = base_name + original_target = os.path.join(self.target, original_name) document_dict[EXPORTER_FILE_NAME] = original_name + + thumbnail_name = base_name + "-thumbnail.png" + thumbnail_target = os.path.join(self.target, thumbnail_name) document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name if os.path.exists(document.archive_path): - archive_name = document.get_public_filename( - archive=True, counter=filename_counter, suffix="_archive") + archive_name = base_name + "-archive.pdf" archive_target = os.path.join(self.target, archive_name) document_dict[EXPORTER_ARCHIVE_NAME] = archive_name else: archive_target = None + # 3.4. write files to target folder t = int(time.mktime(document.created.timetuple())) if document.storage_type == Document.STORAGE_TYPE_GPG: @@ -100,21 +152,57 @@ class Command(Renderable, BaseCommand): f.write(GnuPG.decrypted(document.archive_path)) os.utime(archive_target, times=(t, t)) else: + self.check_and_copy(document.source_path, + document.checksum, + original_target) - shutil.copy(document.source_path, original_target) - shutil.copy(document.thumbnail_path, thumbnail_target) + self.check_and_copy(document.thumbnail_path, + None, + thumbnail_target) if archive_target: - shutil.copy(document.archive_path, archive_target) + self.check_and_copy(document.archive_path, + document.archive_checksum, + archive_target) - manifest += json.loads( - serializers.serialize("json", Correspondent.objects.all())) + # 4. 
write manifest to target folder + manifest_path = os.path.abspath( + os.path.join(self.target, "manifest.json")) - manifest += json.loads(serializers.serialize( - "json", Tag.objects.all())) - - manifest += json.loads(serializers.serialize( - "json", DocumentType.objects.all())) - - with open(os.path.join(self.target, "manifest.json"), "w") as f: + with open(manifest_path, "w") as f: json.dump(manifest, f, indent=2) + + if manifest_path in self.files_in_export_dir: + self.files_in_export_dir.remove(manifest_path) + + # 5. Remove files which we did not explicitly export in this run + for f in self.files_in_export_dir: + os.remove(f) + + delete_empty_directories(os.path.abspath(os.path.dirname(f)), + os.path.abspath(self.target)) + + def check_and_copy(self, source, source_checksum, target): + if os.path.abspath(target) in self.files_in_export_dir: + self.files_in_export_dir.remove(os.path.abspath(target)) + + perform_copy = False + + if os.path.exists(target): + source_stat = os.stat(source) + target_stat = os.stat(target) + if self.compare_checksums and source_checksum: + with open(target, "rb") as f: + target_checksum = hashlib.md5(f.read()).hexdigest() + perform_copy = target_checksum != source_checksum + elif source_stat.st_mtime != target_stat.st_mtime: + perform_copy = True + elif source_stat.st_size != target_stat.st_size: + perform_copy = True + else: + # Copy if it does not exist + perform_copy = True + + if perform_copy: + os.makedirs(os.path.dirname(target), exist_ok=True) + shutil.copy2(source, target) diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index 6df14a82c..a2e19e3cc 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -148,10 +148,10 @@ class Command(Renderable, BaseCommand): create_source_path_directory(document.source_path) - shutil.copy(document_path, document.source_path) - 
shutil.copy(thumbnail_path, document.thumbnail_path) + shutil.copy2(document_path, document.source_path) + shutil.copy2(thumbnail_path, document.thumbnail_path) if archive_path: create_source_path_directory(document.archive_path) - shutil.copy(archive_path, document.archive_path) + shutil.copy2(archive_path, document.archive_path) document.save() diff --git a/src/documents/tests/samples/documents/originals/0000002.pdf b/src/documents/tests/samples/documents/originals/0000002.pdf new file mode 100644 index 000000000..5e75266ca Binary files /dev/null and b/src/documents/tests/samples/documents/originals/0000002.pdf differ diff --git a/src/documents/tests/samples/documents/originals/0000003.pdf b/src/documents/tests/samples/documents/originals/0000003.pdf new file mode 100644 index 000000000..afbeef5c8 Binary files /dev/null and b/src/documents/tests/samples/documents/originals/0000003.pdf differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000002.png b/src/documents/tests/samples/documents/thumbnails/0000002.png new file mode 100644 index 000000000..a3a768401 Binary files /dev/null and b/src/documents/tests/samples/documents/thumbnails/0000002.png differ diff --git a/src/documents/tests/samples/documents/thumbnails/0000003.png b/src/documents/tests/samples/documents/thumbnails/0000003.png new file mode 100644 index 000000000..a3a768401 Binary files /dev/null and b/src/documents/tests/samples/documents/thumbnails/0000003.png differ diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index d6ab7eadd..d6e7ad6e0 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -3,6 +3,8 @@ import json import os import shutil import tempfile +from pathlib import Path +from unittest import mock from django.core.management import call_command from django.test import TestCase, override_settings @@ -15,49 +17,60 @@ from documents.tests.utils import 
DirectoriesMixin, paperless_environment class TestExportImport(DirectoriesMixin, TestCase): - @override_settings( - PASSPHRASE="test" - ) - def test_exporter(self): + def setUp(self) -> None: + self.target = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, self.target) + + self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf") + self.d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow2", filename="0000002.pdf", mime_type="application/pdf") + self.d3 = Document.objects.create(content="Content", checksum="d38d7ed02e988e072caf924e0f3fcb76", title="wow2", filename="0000003.pdf", mime_type="application/pdf") + self.t1 = Tag.objects.create(name="t") + self.dt1 = DocumentType.objects.create(name="dt") + self.c1 = Correspondent.objects.create(name="c") + + self.d1.tags.add(self.t1) + self.d1.correspondent = self.c1 + self.d1.document_type = self.dt1 + self.d1.save() + super(TestExportImport, self).setUp() + + def _do_export(self, use_filename_format=False, compare_checksums=False): + args = ['document_exporter', self.target] + if use_filename_format: + args += ["--use-filename-format"] + if compare_checksums: + args += ["--compare-checksums"] + + call_command(*args) + + with open(os.path.join(self.target, "manifest.json")) as f: + manifest = json.load(f) + + return manifest + + def test_exporter(self, use_filename_format=False): shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents")) - file = os.path.join(self.dirs.originals_dir, "0000001.pdf") + manifest = self._do_export(use_filename_format=use_filename_format) - d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", 
archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", mime_type="application/pdf") - d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG) - t1 = Tag.objects.create(name="t") - dt1 = DocumentType.objects.create(name="dt") - c1 = Correspondent.objects.create(name="c") + self.assertEqual(len(manifest), 6) + self.assertEqual(len(list(filter(lambda e: e['model'] == 'documents.document', manifest))), 3) - d1.tags.add(t1) - d1.correspondents = c1 - d1.document_type = dt1 - d1.save() - d2.save() - - target = tempfile.mkdtemp() - self.addCleanup(shutil.rmtree, target) - - call_command('document_exporter', target) - - with open(os.path.join(target, "manifest.json")) as f: - manifest = json.load(f) - - self.assertEqual(len(manifest), 5) + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) for element in manifest: if element['model'] == 'documents.document': - fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME]) + fname = os.path.join(self.target, element[document_exporter.EXPORTER_FILE_NAME]) self.assertTrue(os.path.exists(fname)) - self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME]))) + self.assertTrue(os.path.exists(os.path.join(self.target, element[document_exporter.EXPORTER_THUMBNAIL_NAME]))) with open(fname, "rb") as f: checksum = hashlib.md5(f.read()).hexdigest() self.assertEqual(checksum, element['fields']['checksum']) if document_exporter.EXPORTER_ARCHIVE_NAME in element: - fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME]) + fname = os.path.join(self.target, element[document_exporter.EXPORTER_ARCHIVE_NAME]) self.assertTrue(os.path.exists(fname)) with open(fname, "rb") as f: @@ -65,24 +78,93 @@ class TestExportImport(DirectoriesMixin, TestCase): 
self.assertEqual(checksum, element['fields']['archive_checksum']) with paperless_environment() as dirs: - self.assertEqual(Document.objects.count(), 2) + self.assertEqual(Document.objects.count(), 3) Document.objects.all().delete() Correspondent.objects.all().delete() DocumentType.objects.all().delete() Tag.objects.all().delete() self.assertEqual(Document.objects.count(), 0) - call_command('document_importer', target) - self.assertEqual(Document.objects.count(), 2) + call_command('document_importer', self.target) + self.assertEqual(Document.objects.count(), 3) + self.assertEqual(Tag.objects.count(), 1) + self.assertEqual(Correspondent.objects.count(), 1) + self.assertEqual(DocumentType.objects.count(), 1) + self.assertEqual(Document.objects.get(id=self.d1.id).title, "wow1") + self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2") + self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2") messages = check_sanity() # everything is alright after the test self.assertEqual(len(messages), 0, str([str(m) for m in messages])) - @override_settings( - PAPERLESS_FILENAME_FORMAT="{title}" - ) def test_exporter_with_filename_format(self): - self.test_exporter() + shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) + shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents")) + + with override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}/{correspondent}/{title}"): + self.test_exporter(use_filename_format=True) + + def test_update_export_changed_time(self): + shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) + shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents")) + + self._do_export() + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) + + with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m: + self._do_export() + 
m.assert_not_called() + + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) + + Path(self.d1.source_path).touch() + + with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m: + self._do_export() + self.assertEqual(m.call_count, 1) + + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) + + def test_update_export_changed_checksum(self): + shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) + shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents")) + + self._do_export() + + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) + + with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m: + self._do_export() + m.assert_not_called() + + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) + + self.d2.checksum = "asdfasdgf3" + self.d2.save() + + with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m: + self._do_export(compare_checksums=True) + self.assertEqual(m.call_count, 1) + + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) + + @override_settings(PAPERLESS_FILENAME_FORMAT="{title}/{correspondent}") + def test_update_export_changed_location(self): + shutil.rmtree(os.path.join(self.dirs.media_dir, "documents")) + shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents")) + + m = self._do_export(use_filename_format=True) + self.assertTrue(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf"))) + + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) + + self.d1.title = "new_title" + self.d1.save() + self._do_export(use_filename_format=True) + self.assertFalse(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf"))) + self.assertFalse(os.path.isdir(os.path.join(self.target, "wow1"))) + 
self.assertTrue(os.path.isfile(os.path.join(self.target, "new_title", "c.pdf"))) + self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json"))) def test_export_missing_files(self):