Normalize filenames and titles to NFC

This commit is contained in:
shamoon
2026-01-05 11:17:16 -08:00
parent d40f7b7e91
commit 99294d93f9
4 changed files with 61 additions and 6 deletions

View File

@@ -46,6 +46,7 @@ from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import normalize_nfc
from documents.utils import run_subprocess
from paperless_mail.parsers import MailDocumentParser
@@ -111,7 +112,12 @@ class ConsumerPluginMixin:
self.renew_logging_group()
self.filename = self.metadata.filename or self.input_doc.original_file.name
self.metadata.filename = normalize_nfc(self.metadata.filename)
self.metadata.title = normalize_nfc(self.metadata.title)
self.filename = normalize_nfc(
self.metadata.filename or self.input_doc.original_file.name,
)
def _send_progress(
self,
@@ -652,6 +658,8 @@ class ConsumerPlugin(
f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
)
title = normalize_nfc(title)
file_for_checksum = (
self.unmodified_original
if self.unmodified_original is not None

View File

@@ -6,6 +6,7 @@ from django.conf import settings
from documents.models import Document
from documents.templating.filepath import validate_filepath_template_and_render
from documents.templating.utils import convert_format_str_to_template_format
from documents.utils import normalize_nfc
def create_source_path_directory(source_path: Path) -> None:
@@ -55,11 +56,11 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
"""
if archive_filename:
old_filename: Path | None = (
Path(doc.archive_filename) if doc.archive_filename else None
Path(normalize_nfc(doc.archive_filename)) if doc.archive_filename else None
)
root = settings.ARCHIVE_DIR
else:
old_filename = Path(doc.filename) if doc.filename else None
old_filename = Path(normalize_nfc(doc.filename)) if doc.filename else None
root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to
@@ -91,7 +92,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
)
if new_filename == old_filename:
# still the same as before.
return new_filename
return Path(normalize_nfc(str(new_filename)))
if (root / new_filename).exists():
counter += 1
@@ -119,7 +120,7 @@ def format_filename(document: Document, template_str: str) -> str | None:
"none",
) # backward compatibility
return rendered_filename
return normalize_nfc(rendered_filename)
def generate_filename(
@@ -174,4 +175,4 @@ def generate_filename(
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
full_path = full_path.with_suffix(full_path.suffix + ".gpg")
return full_path
return Path(normalize_nfc(str(full_path)))

View File

@@ -290,6 +290,23 @@ class TestConsumer(
self._assert_first_last_send_progress()
# Consume a test file with an override filename written in decomposed Unicode
# form and check the filename and title stored on the resulting Document.
def test_override_filename_normalized(self):
filename = self.get_test_file()
# Decomposed spelling: a plain "u" followed by U+0308 (combining diaeresis).
override_filename = "Inhaltsu\u0308bersicht.pdf"
with self.get_consumer(
filename,
DocumentMetadataOverrides(filename=override_filename),
) as consumer:
consumer.run()
document = Document.objects.first()
self.assertIsNotNone(document)
# NOTE(review): the expected literals are presumably the precomposed (NFC)
# spelling of "ü" — confirm the bytes in the real test file.
self.assertEqual(document.original_filename, "Inhaltsübersicht.pdf")
# No title override is passed here; presumably the title is derived from the
# override filename — verify against the consumer.
self.assertEqual(document.title, "Inhaltsübersicht")
self._assert_first_last_send_progress()
def testOverrideTitle(self):
with self.get_consumer(
self.get_test_file(),
@@ -304,6 +321,25 @@ class TestConsumer(
self.assertEqual(document.title, "Override Title")
self._assert_first_last_send_progress()
# With FILENAME_FORMAT set to "{{ title }}", consume a test file whose title
# override is written in decomposed Unicode form and check the stored title,
# the stored filename, and that the source path exists as a file.
@override_settings(FILENAME_FORMAT="{{ title }}")
def test_filename_format_normalized(self):
filename = self.get_test_file()
# Decomposed spelling: a plain "u" followed by U+0308 (combining diaeresis).
title = "Inhaltsu\u0308bersicht Faszination"
with self.get_consumer(
filename,
DocumentMetadataOverrides(title=title),
) as consumer:
consumer.run()
document = Document.objects.first()
self.assertIsNotNone(document)
# NOTE(review): the expected literals are presumably the precomposed (NFC)
# spelling of "ü" — confirm the bytes in the real test file.
self.assertEqual(document.title, "Inhaltsübersicht Faszination")
self.assertEqual(document.filename, "Inhaltsübersicht Faszination.pdf")
# The file must also exist on disk at the document's source path.
self.assertIsFile(document.source_path)
self._assert_first_last_send_progress()
def testOverrideCorrespondent(self):
c = Correspondent.objects.create(name="test")

View File

@@ -1,5 +1,7 @@
import logging
import shutil
import unicodedata
from os import PathLike
from os import utime
from pathlib import Path
from subprocess import CompletedProcess
@@ -16,6 +18,14 @@ def _coerce_to_path(
return Path(source).resolve(), Path(dest).resolve()
def normalize_nfc(value: str | PathLike[str] | None) -> str | None:
"""Return NFC-normalized string for filesystem-safe comparisons."""
if value is None:
return None
return unicodedata.normalize("NFC", str(value))
def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None:
"""
Copies only the m_time and a_time attributes from source to destination.