Normalize filenames and titles to NFC

This commit is contained in:
shamoon
2026-01-05 11:17:16 -08:00
parent d40f7b7e91
commit 99294d93f9
4 changed files with 61 additions and 6 deletions

View File

@@ -46,6 +46,7 @@ from documents.signals.handlers import run_workflows
from documents.templating.workflows import parse_w_workflow_placeholders from documents.templating.workflows import parse_w_workflow_placeholders
from documents.utils import copy_basic_file_stats from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats from documents.utils import copy_file_with_basic_stats
from documents.utils import normalize_nfc
from documents.utils import run_subprocess from documents.utils import run_subprocess
from paperless_mail.parsers import MailDocumentParser from paperless_mail.parsers import MailDocumentParser
@@ -111,7 +112,12 @@ class ConsumerPluginMixin:
self.renew_logging_group() self.renew_logging_group()
self.filename = self.metadata.filename or self.input_doc.original_file.name self.metadata.filename = normalize_nfc(self.metadata.filename)
self.metadata.title = normalize_nfc(self.metadata.title)
self.filename = normalize_nfc(
self.metadata.filename or self.input_doc.original_file.name,
)
def _send_progress( def _send_progress(
self, self,
@@ -652,6 +658,8 @@ class ConsumerPlugin(
f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}", f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
) )
title = normalize_nfc(title)
file_for_checksum = ( file_for_checksum = (
self.unmodified_original self.unmodified_original
if self.unmodified_original is not None if self.unmodified_original is not None

View File

@@ -6,6 +6,7 @@ from django.conf import settings
from documents.models import Document from documents.models import Document
from documents.templating.filepath import validate_filepath_template_and_render from documents.templating.filepath import validate_filepath_template_and_render
from documents.templating.utils import convert_format_str_to_template_format from documents.templating.utils import convert_format_str_to_template_format
from documents.utils import normalize_nfc
def create_source_path_directory(source_path: Path) -> None: def create_source_path_directory(source_path: Path) -> None:
@@ -55,11 +56,11 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
""" """
if archive_filename: if archive_filename:
old_filename: Path | None = ( old_filename: Path | None = (
Path(doc.archive_filename) if doc.archive_filename else None Path(normalize_nfc(doc.archive_filename)) if doc.archive_filename else None
) )
root = settings.ARCHIVE_DIR root = settings.ARCHIVE_DIR
else: else:
old_filename = Path(doc.filename) if doc.filename else None old_filename = Path(normalize_nfc(doc.filename)) if doc.filename else None
root = settings.ORIGINALS_DIR root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to # If generating archive filenames, try to make a name that is similar to
@@ -91,7 +92,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
) )
if new_filename == old_filename: if new_filename == old_filename:
# still the same as before. # still the same as before.
return new_filename return Path(normalize_nfc(str(new_filename)))
if (root / new_filename).exists(): if (root / new_filename).exists():
counter += 1 counter += 1
@@ -119,7 +120,7 @@ def format_filename(document: Document, template_str: str) -> str | None:
"none", "none",
) # backward compatibility ) # backward compatibility
return rendered_filename return normalize_nfc(rendered_filename)
def generate_filename( def generate_filename(
@@ -174,4 +175,4 @@ def generate_filename(
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG: if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
full_path = full_path.with_suffix(full_path.suffix + ".gpg") full_path = full_path.with_suffix(full_path.suffix + ".gpg")
return full_path return Path(normalize_nfc(str(full_path)))

View File

@@ -290,6 +290,23 @@ class TestConsumer(
self._assert_first_last_send_progress() self._assert_first_last_send_progress()
def test_override_filename_normalized(self):
    """
    An override filename supplied in decomposed form is stored composed.

    The override uses "u" followed by U+0308 (combining diaeresis); both the
    stored original filename and the title must come back with the single
    precomposed character instead.
    """
    source_file = self.get_test_file()
    decomposed_name = "Inhaltsu\u0308bersicht.pdf"
    overrides = DocumentMetadataOverrides(filename=decomposed_name)

    with self.get_consumer(source_file, overrides) as consumer:
        consumer.run()

    stored = Document.objects.first()

    self.assertIsNotNone(stored)
    self.assertEqual(stored.original_filename, "Inhaltsübersicht.pdf")
    self.assertEqual(stored.title, "Inhaltsübersicht")
    self._assert_first_last_send_progress()
def testOverrideTitle(self): def testOverrideTitle(self):
with self.get_consumer( with self.get_consumer(
self.get_test_file(), self.get_test_file(),
@@ -304,6 +321,25 @@ class TestConsumer(
self.assertEqual(document.title, "Override Title") self.assertEqual(document.title, "Override Title")
self._assert_first_last_send_progress() self._assert_first_last_send_progress()
@override_settings(FILENAME_FORMAT="{{ title }}")
def test_filename_format_normalized(self):
    """
    A decomposed title override yields a composed title and filename.

    With the filename format set to the title, the override title (written
    with "u" + U+0308) must be stored in precomposed form, the generated
    filename must match it, and the source file must exist on disk.
    """
    source_file = self.get_test_file()
    decomposed_title = "Inhaltsu\u0308bersicht Faszination"
    overrides = DocumentMetadataOverrides(title=decomposed_title)

    with self.get_consumer(source_file, overrides) as consumer:
        consumer.run()

    stored = Document.objects.first()

    self.assertIsNotNone(stored)
    self.assertEqual(stored.title, "Inhaltsübersicht Faszination")
    self.assertEqual(stored.filename, "Inhaltsübersicht Faszination.pdf")
    self.assertIsFile(stored.source_path)
    self._assert_first_last_send_progress()
def testOverrideCorrespondent(self): def testOverrideCorrespondent(self):
c = Correspondent.objects.create(name="test") c = Correspondent.objects.create(name="test")

View File

@@ -1,5 +1,7 @@
import logging import logging
import shutil import shutil
import unicodedata
from os import PathLike
from os import utime from os import utime
from pathlib import Path from pathlib import Path
from subprocess import CompletedProcess from subprocess import CompletedProcess
@@ -16,6 +18,14 @@ def _coerce_to_path(
return Path(source).resolve(), Path(dest).resolve() return Path(source).resolve(), Path(dest).resolve()
def normalize_nfc(value: str | PathLike[str] | None) -> str | None:
"""Return NFC-normalized string for filesystem-safe comparisons."""
if value is None:
return None
return unicodedata.normalize("NFC", str(value))
def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None: def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None:
""" """
Copies only the m_time and a_time attributes from source to destination. Copies only the m_time and a_time attributes from source to destination.