new exporter that updates the export in place, fixes

This commit is contained in:
jonaswinkler 2021-01-18 01:15:39 +01:00
parent 08199f09b6
commit a68b858733
7 changed files with 242 additions and 72 deletions

@@ -1,15 +1,21 @@
import hashlib
import json
import os
import shutil
import time

import tqdm
from django.conf import settings
from django.core import serializers
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from filelock import FileLock

from documents.models import Document, Correspondent, Tag, DocumentType
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
    EXPORTER_ARCHIVE_NAME
from paperless.db import GnuPG

from ...file_handling import generate_filename, delete_empty_directories
from ...mixins import Renderable
@@ -24,13 +30,36 @@ class Command(Renderable, BaseCommand):

    def add_arguments(self, parser):
        parser.add_argument("target")

        parser.add_argument(
            "--compare-checksums",
            default=False,
            action="store_true",
            help="Compare file checksums when determining whether to export "
                 "a file or not. If not specified, file size and time "
                 "modified is used instead."
        )

        parser.add_argument(
            "--use-filename-format",
            default=False,
            action="store_true",
            help="Use PAPERLESS_FILENAME_FORMAT for storing files in the "
                 "export directory, if configured."
        )

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
        self.target = None
        self.files_in_export_dir = []
        self.exported_files = []
        self.compare_checksums = False
        self.use_filename_format = False

    def handle(self, *args, **options):
        self.target = options["target"]
        self.compare_checksums = options['compare_checksums']
        self.use_filename_format = options['use_filename_format']

        if not os.path.exists(self.target):
            raise CommandError("That path doesn't exist")
@@ -38,52 +67,75 @@ class Command(Renderable, BaseCommand):

        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

        with FileLock(settings.MEDIA_LOCK):
            self.dump()
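Compared to the old exporter, the target no longer has to be an empty directory: the previous "That directory is not empty" check is gone, and the command instead takes the media lock and updates whatever is already there in place. Both new options are plain store_true flags. A minimal sketch of an invocation, where the target path is a placeholder and call_command mirrors how the new tests below drive the command:

from django.core.management import call_command

# Default run: file size and modification time decide what gets re-copied.
call_command("document_exporter", "/path/to/export")

# New flags from this commit: compare MD5 checksums instead, and lay files out
# according to PAPERLESS_FILENAME_FORMAT (if configured).
call_command("document_exporter", "/path/to/export",
             "--compare-checksums", "--use-filename-format")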
    def dump(self):
        # 1. Take a snapshot of what files exist in the current export folder
        for root, dirs, files in os.walk(self.target):
            self.files_in_export_dir.extend(
                map(lambda f: os.path.abspath(os.path.join(root, f)), files)
            )

        # 2. Create manifest, containing all correspondents, types, tags and
        # documents
        with transaction.atomic():
            manifest = json.loads(
                serializers.serialize("json", Correspondent.objects.all()))

            manifest += json.loads(serializers.serialize(
                "json", Tag.objects.all()))

            manifest += json.loads(serializers.serialize(
                "json", DocumentType.objects.all()))

            documents = Document.objects.order_by("id")
            document_map = {d.pk: d for d in documents}
            document_manifest = json.loads(
                serializers.serialize("json", documents))

            manifest += document_manifest

        # 3. Export files from each document
        for index, document_dict in tqdm.tqdm(enumerate(document_manifest),
                                              total=len(document_manifest)):
            # 3.1. store files unencrypted
            document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED  # NOQA: E501

            document = document_map[document_dict["pk"]]

            # 3.2. generate a unique filename
            filename_counter = 0
            while True:
                if self.use_filename_format:
                    base_name = generate_filename(
                        document, counter=filename_counter)
                else:
                    base_name = document.get_public_filename(
                        counter=filename_counter)

                if base_name not in self.exported_files:
                    self.exported_files.append(base_name)
                    break
                else:
                    filename_counter += 1

            # 3.3. write filenames into manifest
            original_name = base_name
            original_target = os.path.join(self.target, original_name)
            document_dict[EXPORTER_FILE_NAME] = original_name

            thumbnail_name = base_name + "-thumbnail.png"
            thumbnail_target = os.path.join(self.target, thumbnail_name)
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

            if os.path.exists(document.archive_path):
                archive_name = base_name + "-archive.pdf"
                archive_target = os.path.join(self.target, archive_name)
                document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
            else:
                archive_target = None

            # 3.4. write files to target folder
            t = int(time.mktime(document.created.timetuple()))
            if document.storage_type == Document.STORAGE_TYPE_GPG:
@@ -100,21 +152,57 @@ class Command(Renderable, BaseCommand):

                        f.write(GnuPG.decrypted(document.archive_path))
                        os.utime(archive_target, times=(t, t))
            else:
                self.check_and_copy(document.source_path,
                                    document.checksum,
                                    original_target)

                self.check_and_copy(document.thumbnail_path,
                                    None,
                                    thumbnail_target)

                if archive_target:
                    self.check_and_copy(document.archive_path,
                                        document.archive_checksum,
                                        archive_target)

        # 4. write manifest to target folder
        manifest_path = os.path.abspath(
            os.path.join(self.target, "manifest.json"))

        with open(manifest_path, "w") as f:
            json.dump(manifest, f, indent=2)

        if manifest_path in self.files_in_export_dir:
            self.files_in_export_dir.remove(manifest_path)

        # 5. Remove files which we did not explicitly export in this run
        for f in self.files_in_export_dir:
            os.remove(f)

            delete_empty_directories(os.path.abspath(os.path.dirname(f)),
                                     os.path.abspath(self.target))
    def check_and_copy(self, source, source_checksum, target):
        if os.path.abspath(target) in self.files_in_export_dir:
            self.files_in_export_dir.remove(os.path.abspath(target))

        perform_copy = False

        if os.path.exists(target):
            source_stat = os.stat(source)
            target_stat = os.stat(target)
            if self.compare_checksums and source_checksum:
                with open(target, "rb") as f:
                    target_checksum = hashlib.md5(f.read()).hexdigest()
                perform_copy = target_checksum != source_checksum
            elif source_stat.st_mtime != target_stat.st_mtime:
                perform_copy = True
            elif source_stat.st_size != target_stat.st_size:
                perform_copy = True
        else:
            # Copy if it does not exist
            perform_copy = True

        if perform_copy:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copy2(source, target)
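Steps 1 and 5 of dump() are what make the in-place update safe: the command first snapshots every file already present in the target, check_and_copy() crosses off each path it keeps or rewrites (as does the manifest), and whatever remains at the end is deleted, with empty directories pruned via delete_empty_directories. A compact, illustrative sketch of that bookkeeping outside of Django; prune_stale and its arguments are hypothetical names, not part of the command:

import os

def prune_stale(target, kept_paths):
    # Step 1: snapshot what is currently in the export directory.
    present = []
    for root, dirs, files in os.walk(target):
        present.extend(os.path.abspath(os.path.join(root, f)) for f in files)

    # Step 5: anything not written or kept during this run is stale.
    for path in present:
        if path not in kept_paths:
            os.remove(path)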

@@ -148,10 +148,10 @@ class Command(Renderable, BaseCommand):

        create_source_path_directory(document.source_path)

        shutil.copy2(document_path, document.source_path)
        shutil.copy2(thumbnail_path, document.thumbnail_path)
        if archive_path:
            create_source_path_directory(document.archive_path)
            shutil.copy2(archive_path, document.archive_path)

        document.save()
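A note on the copy → copy2 switch in the hunk above (the document importer side of this change): shutil.copy2 also copies file metadata, most importantly the modification time, so documents restored from an export keep the timestamps the exporter wrote. That, presumably, is what keeps the exporter's default size-and-mtime comparison meaningful across an export/import round trip. A small standalone illustration with placeholder paths:

import os
import shutil
import tempfile

tmp = tempfile.mkdtemp()
src = os.path.join(tmp, "source.pdf")
with open(src, "wb") as f:
    f.write(b"data")
os.utime(src, times=(1000000000, 1000000000))  # give the source a distinctive mtime

shutil.copy(src, os.path.join(tmp, "plain.pdf"))   # contents only: mtime becomes "now"
shutil.copy2(src, os.path.join(tmp, "meta.pdf"))   # contents + metadata: mtime preserved

print(os.stat(os.path.join(tmp, "plain.pdf")).st_mtime)  # roughly the current time
print(os.stat(os.path.join(tmp, "meta.pdf")).st_mtime)   # 1000000000.0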

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@@ -3,6 +3,8 @@ import json

import os
import shutil
import tempfile
from pathlib import Path
from unittest import mock

from django.core.management import call_command
from django.test import TestCase, override_settings
@@ -15,49 +17,60 @@ from documents.tests.utils import DirectoriesMixin, paperless_environment

class TestExportImport(DirectoriesMixin, TestCase):

    def setUp(self) -> None:
        self.target = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, self.target)

        self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf")
        self.d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow2", filename="0000002.pdf", mime_type="application/pdf")
        self.d3 = Document.objects.create(content="Content", checksum="d38d7ed02e988e072caf924e0f3fcb76", title="wow2", filename="0000003.pdf", mime_type="application/pdf")

        self.t1 = Tag.objects.create(name="t")
        self.dt1 = DocumentType.objects.create(name="dt")
        self.c1 = Correspondent.objects.create(name="c")

        self.d1.tags.add(self.t1)
        self.d1.correspondent = self.c1
        self.d1.document_type = self.dt1
        self.d1.save()

        super(TestExportImport, self).setUp()

    def _do_export(self, use_filename_format=False, compare_checksums=False):
        args = ['document_exporter', self.target]
        if use_filename_format:
            args += ["--use-filename-format"]
        if compare_checksums:
            args += ["--compare-checksums"]

        call_command(*args)

        with open(os.path.join(self.target, "manifest.json")) as f:
            manifest = json.load(f)

        return manifest

    def test_exporter(self, use_filename_format=False):
        shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
        shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

        manifest = self._do_export(use_filename_format=use_filename_format)

        self.assertEqual(len(manifest), 6)
        self.assertEqual(len(list(filter(lambda e: e['model'] == 'documents.document', manifest))), 3)

        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

        for element in manifest:
            if element['model'] == 'documents.document':
                fname = os.path.join(self.target, element[document_exporter.EXPORTER_FILE_NAME])
                self.assertTrue(os.path.exists(fname))
                self.assertTrue(os.path.exists(os.path.join(self.target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))

                with open(fname, "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                self.assertEqual(checksum, element['fields']['checksum'])

                if document_exporter.EXPORTER_ARCHIVE_NAME in element:
                    fname = os.path.join(self.target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
                    self.assertTrue(os.path.exists(fname))

                    with open(fname, "rb") as f:
@@ -65,24 +78,93 @@ class TestExportImport(DirectoriesMixin, TestCase):

                    self.assertEqual(checksum, element['fields']['archive_checksum'])

        with paperless_environment() as dirs:
            self.assertEqual(Document.objects.count(), 3)
            Document.objects.all().delete()
            Correspondent.objects.all().delete()
            DocumentType.objects.all().delete()
            Tag.objects.all().delete()
            self.assertEqual(Document.objects.count(), 0)

            call_command('document_importer', self.target)

            self.assertEqual(Document.objects.count(), 3)
            self.assertEqual(Tag.objects.count(), 1)
            self.assertEqual(Correspondent.objects.count(), 1)
            self.assertEqual(DocumentType.objects.count(), 1)

            self.assertEqual(Document.objects.get(id=self.d1.id).title, "wow1")
            self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2")
            self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2")

            messages = check_sanity()
            # everything is alright after the test
            self.assertEqual(len(messages), 0, str([str(m) for m in messages]))

    def test_exporter_with_filename_format(self):
        shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
        shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

        with override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}/{correspondent}/{title}"):
            self.test_exporter(use_filename_format=True)
    def test_update_export_changed_time(self):
        shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
        shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

        self._do_export()
        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

        with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
            self._do_export()
            m.assert_not_called()

        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

        Path(self.d1.source_path).touch()

        with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
            self._do_export()
            self.assertEqual(m.call_count, 1)

        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
    def test_update_export_changed_checksum(self):
        shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
        shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

        self._do_export()
        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

        with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
            self._do_export()
            m.assert_not_called()

        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

        self.d2.checksum = "asdfasdgf3"
        self.d2.save()

        with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
            self._do_export(compare_checksums=True)
            self.assertEqual(m.call_count, 1)

        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
    @override_settings(PAPERLESS_FILENAME_FORMAT="{title}/{correspondent}")
    def test_update_export_changed_location(self):
        shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
        shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))

        m = self._do_export(use_filename_format=True)
        self.assertTrue(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf")))

        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

        self.d1.title = "new_title"
        self.d1.save()
        self._do_export(use_filename_format=True)

        self.assertFalse(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf")))
        self.assertFalse(os.path.isdir(os.path.join(self.target, "wow1")))
        self.assertTrue(os.path.isfile(os.path.join(self.target, "new_title", "c.pdf")))
        self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))

    def test_export_missing_files(self):