paperless-ngx/src/documents/file_handling.py

205 lines
6.2 KiB
Python

import datetime
import logging
import os
from collections import defaultdict
import pathvalidate
from django.conf import settings
from django.template.defaultfilters import slugify
from django.utils import timezone
logger = logging.getLogger("paperless.filehandling")
class defaultdictNoStr(defaultdict):
def __str__(self):
raise ValueError("Don't use {tags} directly.")
def create_source_path_directory(source_path):
os.makedirs(os.path.dirname(source_path), exist_ok=True)
def delete_empty_directories(directory, root):
if not os.path.isdir(directory):
return
# Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory)
root = os.path.normpath(root)
if not directory.startswith(root + os.path.sep):
# don't do anything outside our originals folder.
# append os.path.set so that we avoid these cases:
# directory = /home/originals2/test
# root = /home/originals ("/" gets appended and startswith fails)
return
while directory != root:
if not os.listdir(directory):
# it's empty
try:
os.rmdir(directory)
except OSError:
# whatever. empty directories aren't that bad anyway.
return
else:
# it's not empty.
return
# go one level up
directory = os.path.normpath(os.path.dirname(directory))
def many_to_dictionary(field):
# Converts ManyToManyField to dictionary by assuming, that field
# entries contain an _ or - which will be used as a delimiter
mydictionary = dict()
for index, t in enumerate(field.all()):
# Populate tag names by index
mydictionary[index] = slugify(t.name)
# Find delimiter
delimiter = t.name.find("_")
if delimiter == -1:
delimiter = t.name.find("-")
if delimiter == -1:
continue
key = t.name[:delimiter]
value = t.name[delimiter + 1 :]
mydictionary[slugify(key)] = slugify(value)
return mydictionary
def generate_unique_filename(doc, archive_filename=False):
"""
Generates a unique filename for doc in settings.ORIGINALS_DIR.
The returned filename is guaranteed to be either the current filename
of the document if unchanged, or a new filename that does not correspondent
to any existing files. The function will append _01, _02, etc to the
filename before the extension to avoid conflicts.
If archive_filename is True, return a unique archive filename instead.
"""
if archive_filename:
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = doc.filename
root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to
# the original filename first.
if archive_filename and doc.filename:
new_filename = os.path.splitext(doc.filename)[0] + ".pdf"
if new_filename == old_filename or not os.path.exists(
os.path.join(root, new_filename),
):
return new_filename
counter = 0
while True:
new_filename = generate_filename(
doc,
counter,
archive_filename=archive_filename,
)
if new_filename == old_filename:
# still the same as before.
return new_filename
if os.path.exists(os.path.join(root, new_filename)):
counter += 1
else:
return new_filename
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
path = ""
try:
if settings.PAPERLESS_FILENAME_FORMAT is not None:
tags = defaultdictNoStr(lambda: slugify(None), many_to_dictionary(doc.tags))
tag_list = pathvalidate.sanitize_filename(
",".join(sorted(tag.name for tag in doc.tags.all())),
replacement_text="-",
)
if doc.correspondent:
correspondent = pathvalidate.sanitize_filename(
doc.correspondent.name,
replacement_text="-",
)
else:
correspondent = "none"
if doc.document_type:
document_type = pathvalidate.sanitize_filename(
doc.document_type.name,
replacement_text="-",
)
else:
document_type = "none"
if doc.archive_serial_number:
asn = str(doc.archive_serial_number)
else:
asn = "none"
# Convert UTC database date to localized date
local_added = timezone.localdate(doc.added)
local_created = timezone.localdate(doc.created)
path = settings.PAPERLESS_FILENAME_FORMAT.format(
title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(local_created),
created_year=local_created.year,
created_month=f"{local_created.month:02}",
created_day=f"{local_created.day:02}",
added=datetime.date.isoformat(local_added),
added_year=local_added.year,
added_month=f"{local_added.month:02}",
added_day=f"{local_added.day:02}",
asn=asn,
tags=tags,
tag_list=tag_list,
).strip()
path = path.strip(os.sep)
except (ValueError, KeyError, IndexError):
logger.warning(
f"Invalid PAPERLESS_FILENAME_FORMAT: "
f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default",
)
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if len(path) > 0:
filename = f"{path}{counter_str}{filetype_str}"
else:
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Append .gpg for encrypted files
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
filename += ".gpg"
return filename