mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
parent
08199f09b6
commit
a68b858733
@ -1,15 +1,21 @@
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
|
||||
import tqdm
|
||||
from django.conf import settings
|
||||
from django.core import serializers
|
||||
from django.core.management.base import BaseCommand, CommandError
|
||||
from django.db import transaction
|
||||
from filelock import FileLock
|
||||
|
||||
from documents.models import Document, Correspondent, Tag, DocumentType
|
||||
from documents.settings import EXPORTER_FILE_NAME, EXPORTER_THUMBNAIL_NAME, \
|
||||
EXPORTER_ARCHIVE_NAME
|
||||
from paperless.db import GnuPG
|
||||
from ...file_handling import generate_filename, delete_empty_directories
|
||||
from ...mixins import Renderable
|
||||
|
||||
|
||||
@ -24,13 +30,36 @@ class Command(Renderable, BaseCommand):
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument("target")
|
||||
|
||||
parser.add_argument(
|
||||
"--compare-checksums",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Compare file checksums when determining whether to export "
|
||||
"a file or not. If not specified, file size and time "
|
||||
"modified is used instead."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--use-filename-format",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="Use PAPERLESS_FILENAME_FORMAT for storing files in the "
|
||||
"export directory, if configured."
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
BaseCommand.__init__(self, *args, **kwargs)
|
||||
self.target = None
|
||||
self.files_in_export_dir = []
|
||||
self.exported_files = []
|
||||
self.compare_checksums = False
|
||||
self.use_filename_format = False
|
||||
|
||||
def handle(self, *args, **options):
|
||||
|
||||
self.target = options["target"]
|
||||
self.compare_checksums = options['compare_checksums']
|
||||
self.use_filename_format = options['use_filename_format']
|
||||
|
||||
if not os.path.exists(self.target):
|
||||
raise CommandError("That path doesn't exist")
|
||||
@ -38,52 +67,75 @@ class Command(Renderable, BaseCommand):
|
||||
if not os.access(self.target, os.W_OK):
|
||||
raise CommandError("That path doesn't appear to be writable")
|
||||
|
||||
if os.listdir(self.target):
|
||||
raise CommandError("That directory is not empty.")
|
||||
|
||||
self.dump()
|
||||
with FileLock(settings.MEDIA_LOCK):
|
||||
self.dump()
|
||||
|
||||
def dump(self):
|
||||
# 1. Take a snapshot of what files exist in the current export folder
|
||||
for root, dirs, files in os.walk(self.target):
|
||||
self.files_in_export_dir.extend(
|
||||
map(lambda f: os.path.abspath(os.path.join(root, f)), files)
|
||||
)
|
||||
|
||||
documents = Document.objects.all()
|
||||
document_map = {d.pk: d for d in documents}
|
||||
manifest = json.loads(serializers.serialize("json", documents))
|
||||
# 2. Create manifest, containing all correspondents, types, tags and
|
||||
# documents
|
||||
with transaction.atomic():
|
||||
manifest = json.loads(
|
||||
serializers.serialize("json", Correspondent.objects.all()))
|
||||
|
||||
for index, document_dict in enumerate(manifest):
|
||||
manifest += json.loads(serializers.serialize(
|
||||
"json", Tag.objects.all()))
|
||||
|
||||
# Force output to unencrypted as that will be the current state.
|
||||
# The importer will make the decision to encrypt or not.
|
||||
manifest[index]["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
|
||||
manifest += json.loads(serializers.serialize(
|
||||
"json", DocumentType.objects.all()))
|
||||
|
||||
documents = Document.objects.order_by("id")
|
||||
document_map = {d.pk: d for d in documents}
|
||||
document_manifest = json.loads(
|
||||
serializers.serialize("json", documents))
|
||||
manifest += document_manifest
|
||||
|
||||
# 3. Export files from each document
|
||||
for index, document_dict in tqdm.tqdm(enumerate(document_manifest),
|
||||
total=len(document_manifest)):
|
||||
# 3.1. store files unencrypted
|
||||
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED # NOQA: E501
|
||||
|
||||
document = document_map[document_dict["pk"]]
|
||||
|
||||
print(f"Exporting: {document}")
|
||||
|
||||
# 3.2. generate a unique filename
|
||||
filename_counter = 0
|
||||
while True:
|
||||
original_name = document.get_public_filename(
|
||||
counter=filename_counter)
|
||||
original_target = os.path.join(self.target, original_name)
|
||||
if self.use_filename_format:
|
||||
base_name = generate_filename(
|
||||
document, counter=filename_counter)
|
||||
else:
|
||||
base_name = document.get_public_filename(
|
||||
counter=filename_counter)
|
||||
|
||||
if not os.path.exists(original_target):
|
||||
if base_name not in self.exported_files:
|
||||
self.exported_files.append(base_name)
|
||||
break
|
||||
else:
|
||||
filename_counter += 1
|
||||
|
||||
thumbnail_name = original_name + "-thumbnail.png"
|
||||
thumbnail_target = os.path.join(self.target, thumbnail_name)
|
||||
|
||||
# 3.3. write filenames into manifest
|
||||
original_name = base_name
|
||||
original_target = os.path.join(self.target, original_name)
|
||||
document_dict[EXPORTER_FILE_NAME] = original_name
|
||||
|
||||
thumbnail_name = base_name + "-thumbnail.png"
|
||||
thumbnail_target = os.path.join(self.target, thumbnail_name)
|
||||
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
|
||||
|
||||
if os.path.exists(document.archive_path):
|
||||
archive_name = document.get_public_filename(
|
||||
archive=True, counter=filename_counter, suffix="_archive")
|
||||
archive_name = base_name + "-archive.pdf"
|
||||
archive_target = os.path.join(self.target, archive_name)
|
||||
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
|
||||
else:
|
||||
archive_target = None
|
||||
|
||||
# 3.4. write files to target folder
|
||||
t = int(time.mktime(document.created.timetuple()))
|
||||
if document.storage_type == Document.STORAGE_TYPE_GPG:
|
||||
|
||||
@ -100,21 +152,57 @@ class Command(Renderable, BaseCommand):
|
||||
f.write(GnuPG.decrypted(document.archive_path))
|
||||
os.utime(archive_target, times=(t, t))
|
||||
else:
|
||||
self.check_and_copy(document.source_path,
|
||||
document.checksum,
|
||||
original_target)
|
||||
|
||||
shutil.copy(document.source_path, original_target)
|
||||
shutil.copy(document.thumbnail_path, thumbnail_target)
|
||||
self.check_and_copy(document.thumbnail_path,
|
||||
None,
|
||||
thumbnail_target)
|
||||
|
||||
if archive_target:
|
||||
shutil.copy(document.archive_path, archive_target)
|
||||
self.check_and_copy(document.archive_path,
|
||||
document.archive_checksum,
|
||||
archive_target)
|
||||
|
||||
manifest += json.loads(
|
||||
serializers.serialize("json", Correspondent.objects.all()))
|
||||
# 4. write manifest to target forlder
|
||||
manifest_path = os.path.abspath(
|
||||
os.path.join(self.target, "manifest.json"))
|
||||
|
||||
manifest += json.loads(serializers.serialize(
|
||||
"json", Tag.objects.all()))
|
||||
|
||||
manifest += json.loads(serializers.serialize(
|
||||
"json", DocumentType.objects.all()))
|
||||
|
||||
with open(os.path.join(self.target, "manifest.json"), "w") as f:
|
||||
with open(manifest_path, "w") as f:
|
||||
json.dump(manifest, f, indent=2)
|
||||
|
||||
if manifest_path in self.files_in_export_dir:
|
||||
self.files_in_export_dir.remove(manifest_path)
|
||||
|
||||
# 5. Remove files which we did not explicitly export in this run
|
||||
for f in self.files_in_export_dir:
|
||||
os.remove(f)
|
||||
|
||||
delete_empty_directories(os.path.abspath(os.path.dirname(f)),
|
||||
os.path.abspath(self.target))
|
||||
|
||||
def check_and_copy(self, source, source_checksum, target):
|
||||
if os.path.abspath(target) in self.files_in_export_dir:
|
||||
self.files_in_export_dir.remove(os.path.abspath(target))
|
||||
|
||||
perform_copy = False
|
||||
|
||||
if os.path.exists(target):
|
||||
source_stat = os.stat(source)
|
||||
target_stat = os.stat(target)
|
||||
if self.compare_checksums and source_checksum:
|
||||
with open(target, "rb") as f:
|
||||
target_checksum = hashlib.md5(f.read()).hexdigest()
|
||||
perform_copy = target_checksum != source_checksum
|
||||
elif source_stat.st_mtime != target_stat.st_mtime:
|
||||
perform_copy = True
|
||||
elif source_stat.st_size != target_stat.st_size:
|
||||
perform_copy = True
|
||||
else:
|
||||
# Copy if it does not exist
|
||||
perform_copy = True
|
||||
|
||||
if perform_copy:
|
||||
os.makedirs(os.path.dirname(target), exist_ok=True)
|
||||
shutil.copy2(source, target)
|
||||
|
@ -148,10 +148,10 @@ class Command(Renderable, BaseCommand):
|
||||
|
||||
create_source_path_directory(document.source_path)
|
||||
|
||||
shutil.copy(document_path, document.source_path)
|
||||
shutil.copy(thumbnail_path, document.thumbnail_path)
|
||||
shutil.copy2(document_path, document.source_path)
|
||||
shutil.copy2(thumbnail_path, document.thumbnail_path)
|
||||
if archive_path:
|
||||
create_source_path_directory(document.archive_path)
|
||||
shutil.copy(archive_path, document.archive_path)
|
||||
shutil.copy2(archive_path, document.archive_path)
|
||||
|
||||
document.save()
|
||||
|
BIN
src/documents/tests/samples/documents/originals/0000002.pdf
Normal file
BIN
src/documents/tests/samples/documents/originals/0000002.pdf
Normal file
Binary file not shown.
BIN
src/documents/tests/samples/documents/originals/0000003.pdf
Normal file
BIN
src/documents/tests/samples/documents/originals/0000003.pdf
Normal file
Binary file not shown.
BIN
src/documents/tests/samples/documents/thumbnails/0000002.png
Normal file
BIN
src/documents/tests/samples/documents/thumbnails/0000002.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 7.7 KiB |
BIN
src/documents/tests/samples/documents/thumbnails/0000003.png
Normal file
BIN
src/documents/tests/samples/documents/thumbnails/0000003.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 7.7 KiB |
@ -3,6 +3,8 @@ import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.core.management import call_command
|
||||
from django.test import TestCase, override_settings
|
||||
@ -15,49 +17,60 @@ from documents.tests.utils import DirectoriesMixin, paperless_environment
|
||||
|
||||
class TestExportImport(DirectoriesMixin, TestCase):
|
||||
|
||||
@override_settings(
|
||||
PASSPHRASE="test"
|
||||
)
|
||||
def test_exporter(self):
|
||||
def setUp(self) -> None:
|
||||
self.target = tempfile.mkdtemp()
|
||||
self.addCleanup(shutil.rmtree, self.target)
|
||||
|
||||
self.d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow1", filename="0000001.pdf", mime_type="application/pdf")
|
||||
self.d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow2", filename="0000002.pdf", mime_type="application/pdf")
|
||||
self.d3 = Document.objects.create(content="Content", checksum="d38d7ed02e988e072caf924e0f3fcb76", title="wow2", filename="0000003.pdf", mime_type="application/pdf")
|
||||
self.t1 = Tag.objects.create(name="t")
|
||||
self.dt1 = DocumentType.objects.create(name="dt")
|
||||
self.c1 = Correspondent.objects.create(name="c")
|
||||
|
||||
self.d1.tags.add(self.t1)
|
||||
self.d1.correspondent = self.c1
|
||||
self.d1.document_type = self.dt1
|
||||
self.d1.save()
|
||||
super(TestExportImport, self).setUp()
|
||||
|
||||
def _do_export(self, use_filename_format=False, compare_checksums=False):
|
||||
args = ['document_exporter', self.target]
|
||||
if use_filename_format:
|
||||
args += ["--use-filename-format"]
|
||||
if compare_checksums:
|
||||
args += ["--compare-checksums"]
|
||||
|
||||
call_command(*args)
|
||||
|
||||
with open(os.path.join(self.target, "manifest.json")) as f:
|
||||
manifest = json.load(f)
|
||||
|
||||
return manifest
|
||||
|
||||
def test_exporter(self, use_filename_format=False):
|
||||
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
|
||||
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
|
||||
|
||||
file = os.path.join(self.dirs.originals_dir, "0000001.pdf")
|
||||
manifest = self._do_export(use_filename_format=use_filename_format)
|
||||
|
||||
d1 = Document.objects.create(content="Content", checksum="42995833e01aea9b3edee44bbfdd7ce1", archive_checksum="62acb0bcbfbcaa62ca6ad3668e4e404b", title="wow", filename="0000001.pdf", mime_type="application/pdf")
|
||||
d2 = Document.objects.create(content="Content", checksum="9c9691e51741c1f4f41a20896af31770", title="wow", filename="0000002.pdf.gpg", mime_type="application/pdf", storage_type=Document.STORAGE_TYPE_GPG)
|
||||
t1 = Tag.objects.create(name="t")
|
||||
dt1 = DocumentType.objects.create(name="dt")
|
||||
c1 = Correspondent.objects.create(name="c")
|
||||
self.assertEqual(len(manifest), 6)
|
||||
self.assertEqual(len(list(filter(lambda e: e['model'] == 'documents.document', manifest))), 3)
|
||||
|
||||
d1.tags.add(t1)
|
||||
d1.correspondents = c1
|
||||
d1.document_type = dt1
|
||||
d1.save()
|
||||
d2.save()
|
||||
|
||||
target = tempfile.mkdtemp()
|
||||
self.addCleanup(shutil.rmtree, target)
|
||||
|
||||
call_command('document_exporter', target)
|
||||
|
||||
with open(os.path.join(target, "manifest.json")) as f:
|
||||
manifest = json.load(f)
|
||||
|
||||
self.assertEqual(len(manifest), 5)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
for element in manifest:
|
||||
if element['model'] == 'documents.document':
|
||||
fname = os.path.join(target, element[document_exporter.EXPORTER_FILE_NAME])
|
||||
fname = os.path.join(self.target, element[document_exporter.EXPORTER_FILE_NAME])
|
||||
self.assertTrue(os.path.exists(fname))
|
||||
self.assertTrue(os.path.exists(os.path.join(target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, element[document_exporter.EXPORTER_THUMBNAIL_NAME])))
|
||||
|
||||
with open(fname, "rb") as f:
|
||||
checksum = hashlib.md5(f.read()).hexdigest()
|
||||
self.assertEqual(checksum, element['fields']['checksum'])
|
||||
|
||||
if document_exporter.EXPORTER_ARCHIVE_NAME in element:
|
||||
fname = os.path.join(target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
|
||||
fname = os.path.join(self.target, element[document_exporter.EXPORTER_ARCHIVE_NAME])
|
||||
self.assertTrue(os.path.exists(fname))
|
||||
|
||||
with open(fname, "rb") as f:
|
||||
@ -65,24 +78,93 @@ class TestExportImport(DirectoriesMixin, TestCase):
|
||||
self.assertEqual(checksum, element['fields']['archive_checksum'])
|
||||
|
||||
with paperless_environment() as dirs:
|
||||
self.assertEqual(Document.objects.count(), 2)
|
||||
self.assertEqual(Document.objects.count(), 3)
|
||||
Document.objects.all().delete()
|
||||
Correspondent.objects.all().delete()
|
||||
DocumentType.objects.all().delete()
|
||||
Tag.objects.all().delete()
|
||||
self.assertEqual(Document.objects.count(), 0)
|
||||
|
||||
call_command('document_importer', target)
|
||||
self.assertEqual(Document.objects.count(), 2)
|
||||
call_command('document_importer', self.target)
|
||||
self.assertEqual(Document.objects.count(), 3)
|
||||
self.assertEqual(Tag.objects.count(), 1)
|
||||
self.assertEqual(Correspondent.objects.count(), 1)
|
||||
self.assertEqual(DocumentType.objects.count(), 1)
|
||||
self.assertEqual(Document.objects.get(id=self.d1.id).title, "wow1")
|
||||
self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2")
|
||||
self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2")
|
||||
messages = check_sanity()
|
||||
# everything is alright after the test
|
||||
self.assertEqual(len(messages), 0, str([str(m) for m in messages]))
|
||||
|
||||
@override_settings(
|
||||
PAPERLESS_FILENAME_FORMAT="{title}"
|
||||
)
|
||||
def test_exporter_with_filename_format(self):
|
||||
self.test_exporter()
|
||||
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
|
||||
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
|
||||
|
||||
with override_settings(PAPERLESS_FILENAME_FORMAT="{created_year}/{correspondent}/{title}"):
|
||||
self.test_exporter(use_filename_format=True)
|
||||
|
||||
def test_update_export_changed_time(self):
|
||||
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
|
||||
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
|
||||
|
||||
self._do_export()
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
|
||||
self._do_export()
|
||||
m.assert_not_called()
|
||||
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
Path(self.d1.source_path).touch()
|
||||
|
||||
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
|
||||
self._do_export()
|
||||
self.assertEqual(m.call_count, 1)
|
||||
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
def test_update_export_changed_checksum(self):
|
||||
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
|
||||
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
|
||||
|
||||
self._do_export()
|
||||
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
|
||||
self._do_export()
|
||||
m.assert_not_called()
|
||||
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
self.d2.checksum = "asdfasdgf3"
|
||||
self.d2.save()
|
||||
|
||||
with mock.patch("documents.management.commands.document_exporter.shutil.copy2") as m:
|
||||
self._do_export(compare_checksums=True)
|
||||
self.assertEqual(m.call_count, 1)
|
||||
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
@override_settings(PAPERLESS_FILENAME_FORMAT="{title}/{correspondent}")
|
||||
def test_update_export_changed_location(self):
|
||||
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
|
||||
shutil.copytree(os.path.join(os.path.dirname(__file__), "samples", "documents"), os.path.join(self.dirs.media_dir, "documents"))
|
||||
|
||||
m = self._do_export(use_filename_format=True)
|
||||
self.assertTrue(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf")))
|
||||
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
self.d1.title = "new_title"
|
||||
self.d1.save()
|
||||
self._do_export(use_filename_format=True)
|
||||
self.assertFalse(os.path.isfile(os.path.join(self.target, "wow1", "c.pdf")))
|
||||
self.assertFalse(os.path.isdir(os.path.join(self.target, "wow1")))
|
||||
self.assertTrue(os.path.isfile(os.path.join(self.target, "new_title", "c.pdf")))
|
||||
self.assertTrue(os.path.exists(os.path.join(self.target, "manifest.json")))
|
||||
|
||||
def test_export_missing_files(self):
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user