271 lines
9.6 KiB
Python

import json
import logging
import os
import shutil
from contextlib import contextmanager
from pathlib import Path
import tqdm
from django.conf import settings
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from documents.models import Document
from documents.parsers import run_convert
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from filelock import FileLock
from paperless import version
from documents.file_handling import create_source_path_directory
from documents.signals.handlers import update_filename_and_move_files
@contextmanager
def disable_signal(sig, receiver, sender):
try:
sig.disconnect(receiver=receiver, sender=sender)
yield
finally:
sig.connect(receiver=receiver, sender=sender)
class Command(BaseCommand):
help = """
Using a manifest.json file, load the data from there, and import the
documents it refers to.
""".replace(
" ",
"",
)
def add_arguments(self, parser):
parser.add_argument("source")
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
self.source = None
self.manifest = None
self.version = None
def handle(self, *args, **options):
logging.getLogger().handlers[0].level = logging.ERROR
self.source = Path(options["source"]).resolve()
if not self.source.exists():
raise CommandError("That path doesn't exist")
if not os.access(self.source, os.R_OK):
raise CommandError("That path doesn't appear to be readable")
manifest_paths = []
main_manifest_path = self.source / "manifest.json"
self._check_manifest_exists(main_manifest_path)
with main_manifest_path.open() as infile:
self.manifest = json.load(infile)
manifest_paths.append(main_manifest_path)
for file in Path(self.source).glob("**/*-manifest.json"):
with file.open() as infile:
self.manifest += json.load(infile)
manifest_paths.append(file)
version_path = self.source / "version.json"
if version_path.exists():
with version_path.open() as infile:
self.version = json.load(infile)["version"]
# Provide an initial warning if needed to the user
if self.version != version.__full_version_str__:
self.stdout.write(
self.style.WARNING(
"Version mismatch: "
f"Currently {version.__full_version_str__},"
f" importing {self.version}."
" Continuing, but import may fail.",
),
)
else:
self.stdout.write(self.style.NOTICE("No version.json file located"))
self._check_manifest_valid()
with disable_signal(
post_save,
receiver=update_filename_and_move_files,
sender=Document,
), disable_signal(
m2m_changed,
receiver=update_filename_and_move_files,
sender=Document.tags.through,
):
# Fill up the database with whatever is in the manifest
try:
for manifest_path in manifest_paths:
call_command("loaddata", manifest_path)
except (FieldDoesNotExist, DeserializationError) as e:
self.stdout.write(self.style.ERROR("Database import failed"))
if (
self.version is not None
and self.version != version.__full_version_str__
):
self.stdout.write(
self.style.ERROR(
"Version mismatch: "
f"Currently {version.__full_version_str__},"
f" importing {self.version}",
),
)
raise e
else:
self.stdout.write(
self.style.ERROR("No version information present"),
)
raise e
self._import_files_from_manifest(options["no_progress_bar"])
self.stdout.write("Updating search index...")
call_command(
"document_index",
"reindex",
no_progress_bar=options["no_progress_bar"],
)
@staticmethod
def _check_manifest_exists(path: Path):
if not path.exists():
raise CommandError(
"That directory doesn't appear to contain a manifest.json file.",
)
def _check_manifest_valid(self):
"""
Attempts to verify the manifest is valid. Namely checking the files
referred to exist and the files can be read from
"""
self.stdout.write("Checking the manifest")
for record in self.manifest:
if record["model"] != "documents.document":
continue
if EXPORTER_FILE_NAME not in record:
raise CommandError(
"The manifest file contains a record which does not "
"refer to an actual document file.",
)
doc_file = record[EXPORTER_FILE_NAME]
doc_path = self.source / doc_file
if not doc_path.exists():
raise CommandError(
'The manifest file refers to "{}" which does not '
"appear to be in the source directory.".format(doc_file),
)
try:
with doc_path.open(mode="rb") as infile:
infile.read(1)
except Exception as e:
raise CommandError(
f"Failed to read from original file {doc_path}",
) from e
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
doc_archive_path = self.source / archive_file
if not doc_archive_path.exists():
raise CommandError(
f"The manifest file refers to {archive_file} which "
f"does not appear to be in the source directory.",
)
try:
with doc_archive_path.open(mode="rb") as infile:
infile.read(1)
except Exception as e:
raise CommandError(
f"Failed to read from archive file {doc_archive_path}",
) from e
def _import_files_from_manifest(self, progress_bar_disable):
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)
self.stdout.write("Copy files into paperless...")
manifest_documents = list(
filter(lambda r: r["model"] == "documents.document", self.manifest),
)
for record in tqdm.tqdm(manifest_documents, disable=progress_bar_disable):
document = Document.objects.get(pk=record["pk"])
doc_file = record[EXPORTER_FILE_NAME]
document_path = os.path.join(self.source, doc_file)
if EXPORTER_THUMBNAIL_NAME in record:
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
thumbnail_path = Path(os.path.join(self.source, thumb_file)).resolve()
else:
thumbnail_path = None
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
archive_path = os.path.join(self.source, archive_file)
else:
archive_path = None
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
with FileLock(settings.MEDIA_LOCK):
if os.path.isfile(document.source_path):
raise FileExistsError(document.source_path)
create_source_path_directory(document.source_path)
shutil.copy2(document_path, document.source_path)
if thumbnail_path:
if thumbnail_path.suffix in {".png", ".PNG"}:
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{thumbnail_path}[0]",
output_file=str(document.thumbnail_path),
)
else:
shutil.copy2(thumbnail_path, document.thumbnail_path)
if archive_path:
create_source_path_directory(document.archive_path)
# TODO: this assumes that the export is valid and
# archive_filename is present on all documents with
# archived files
shutil.copy2(archive_path, document.archive_path)
document.save()