Compare commits

...

5 Commits

Author    SHA1         Message                                              Date
shamoon   e0eb6ea576   Normalize and casefold input in TitleContentFilter  2026-01-07 13:02:43 -08:00
shamoon   9d489200d9   Consolidate                                         2026-01-05 11:19:35 -08:00
shamoon   99294d93f9   Normalize filenames and titles to NFC               2026-01-05 11:17:16 -08:00
shamoon   d40f7b7e91   Normalize text to NFC for search and indexing       2026-01-05 11:10:21 -08:00
shamoon   8a14548434   Normalize Unicode in workflow filename matching     2026-01-05 11:03:50 -08:00
9 changed files with 231 additions and 28 deletions

View File

@@ -46,6 +46,7 @@ from documents.signals.handlers import run_workflows
 from documents.templating.workflows import parse_w_workflow_placeholders
 from documents.utils import copy_basic_file_stats
 from documents.utils import copy_file_with_basic_stats
+from documents.utils import normalize_nfc
 from documents.utils import run_subprocess
 from paperless_mail.parsers import MailDocumentParser
@@ -111,7 +112,12 @@ class ConsumerPluginMixin:
         self.renew_logging_group()
 
-        self.filename = self.metadata.filename or self.input_doc.original_file.name
+        self.metadata.filename = normalize_nfc(self.metadata.filename)
+        self.metadata.title = normalize_nfc(self.metadata.title)
+        self.filename = normalize_nfc(
+            self.metadata.filename or self.input_doc.original_file.name,
+        )
 
     def _send_progress(
         self,
@@ -652,6 +658,8 @@ class ConsumerPlugin(
                     f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
                 )
 
+        title = normalize_nfc(title)
+
         file_for_checksum = (
             self.unmodified_original
            if self.unmodified_original is not None
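
Note: the hunks above funnel the metadata filename and title through normalize_nfc before anything else consumes them. A minimal sketch of the mismatch this guards against, assuming decomposed (NFD) input such as filenames produced by macOS or some mail clients:

import unicodedata

decomposed = "Inhaltsu\u0308bersicht.pdf"            # "u" + COMBINING DIAERESIS (NFD)
composed = unicodedata.normalize("NFC", decomposed)  # precomposed "ü" (NFC)

assert composed == "Inhaltsübersicht.pdf"
assert decomposed != composed                  # plain string comparison fails
assert len(composed) == len(decomposed) - 1    # NFC merges the two code points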

View File

@@ -6,6 +6,7 @@ from django.conf import settings
 from documents.models import Document
 from documents.templating.filepath import validate_filepath_template_and_render
 from documents.templating.utils import convert_format_str_to_template_format
+from documents.utils import normalize_nfc
 
 
 def create_source_path_directory(source_path: Path) -> None:
@@ -55,11 +56,11 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
     """
     if archive_filename:
         old_filename: Path | None = (
-            Path(doc.archive_filename) if doc.archive_filename else None
+            Path(normalize_nfc(doc.archive_filename)) if doc.archive_filename else None
         )
         root = settings.ARCHIVE_DIR
     else:
-        old_filename = Path(doc.filename) if doc.filename else None
+        old_filename = Path(normalize_nfc(doc.filename)) if doc.filename else None
         root = settings.ORIGINALS_DIR
 
     # If generating archive filenames, try to make a name that is similar to
@@ -91,7 +92,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
         )
         if new_filename == old_filename:
             # still the same as before.
-            return new_filename
+            return Path(normalize_nfc(str(new_filename)))
 
         if (root / new_filename).exists():
             counter += 1
@@ -119,7 +120,7 @@ def format_filename(document: Document, template_str: str) -> str | None:
             "none",
         )  # backward compatibility
 
-    return rendered_filename
+    return normalize_nfc(rendered_filename)
 
 
 def generate_filename(
@@ -174,4 +175,4 @@ def generate_filename(
     if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
         full_path = full_path.with_suffix(full_path.suffix + ".gpg")
 
-    return full_path
+    return Path(normalize_nfc(str(full_path)))
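
Note: normalize_nfc returns a str (see the utils hunk at the end of this compare), which is why the call sites above re-wrap the result in Path. A sketch of the spurious-rename case the normalization avoids, using a simplified stand-in for the helper:

import unicodedata
from pathlib import Path

def normalize_nfc(value):  # simplified stand-in for documents.utils.normalize_nfc
    return unicodedata.normalize("NFC", str(value)) if value is not None else None

stored = Path("certida\u0303o.pdf")              # NFD, e.g. read back from disk
generated = Path(normalize_nfc("certidão.pdf"))  # NFC from the filename template

assert stored != generated                        # unequal without normalization
assert Path(normalize_nfc(stored)) == generated   # equal once both sides are NFC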

View File

@@ -41,6 +41,7 @@ from documents.models import PaperlessTask
 from documents.models import ShareLink
 from documents.models import StoragePath
 from documents.models import Tag
+from documents.utils import normalize_nfc
 
 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -162,7 +163,11 @@ class TitleContentFilter(Filter):
     def filter(self, qs, value):
         value = value.strip() if isinstance(value, str) else value
         if value:
-            return qs.filter(Q(title__icontains=value) | Q(content__icontains=value))
+            normalized = normalize_nfc(value) or ""
+            folded = normalized.casefold()
+            return qs.filter(
+                Q(title__icontains=folded) | Q(content__icontains=folded),
+            )
         else:
             return qs
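
Note: casefold() is the stricter, caseless-matching counterpart to lower(); combined with NFC normalization it makes the title/content filter insensitive to both case and Unicode composition. A quick illustration (the fold helper exists only for this sketch):

import unicodedata

def fold(value: str) -> str:  # mirrors the normalize-then-casefold step above
    return unicodedata.normalize("NFC", value).casefold()

assert "Straße".lower() == "straße"                # lower() keeps the sharp s
assert "Straße".casefold() == "strasse"            # casefold() expands it
assert fold("CERTIDA\u0303O") == fold("certidão")  # NFD capitals match NFC lowercase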

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
 import logging
 import math
 import re
+import unicodedata
 from collections import Counter
 from contextlib import contextmanager
 from datetime import datetime
@@ -58,6 +59,14 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.index")
 
 
+def _normalize_for_index(value: str | None) -> str | None:
+    """Normalize text to NFC for consistent search/index matching."""
+    if value is None:
+        return None
+    return unicodedata.normalize("NFC", value)
+
+
 def get_schema() -> Schema:
     return Schema(
         id=NUMERIC(stored=True, unique=True),
@@ -163,37 +172,41 @@ def update_document(writer: AsyncWriter, doc: Document) -> None:
     viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
     writer.update_document(
         id=doc.pk,
-        title=doc.title,
-        content=doc.content,
-        correspondent=doc.correspondent.name if doc.correspondent else None,
+        title=_normalize_for_index(doc.title),
+        content=_normalize_for_index(doc.content),
+        correspondent=_normalize_for_index(
+            doc.correspondent.name if doc.correspondent else None,
+        ),
         correspondent_id=doc.correspondent.id if doc.correspondent else None,
         has_correspondent=doc.correspondent is not None,
-        tag=tags if tags else None,
+        tag=_normalize_for_index(tags) if tags else None,
         tag_id=tags_ids if tags_ids else None,
         has_tag=len(tags) > 0,
-        type=doc.document_type.name if doc.document_type else None,
+        type=_normalize_for_index(
+            doc.document_type.name if doc.document_type else None,
+        ),
         type_id=doc.document_type.id if doc.document_type else None,
         has_type=doc.document_type is not None,
         created=datetime.combine(doc.created, time.min),
         added=doc.added,
         asn=asn,
         modified=doc.modified,
-        path=doc.storage_path.name if doc.storage_path else None,
+        path=_normalize_for_index(doc.storage_path.name if doc.storage_path else None),
         path_id=doc.storage_path.id if doc.storage_path else None,
         has_path=doc.storage_path is not None,
-        notes=notes,
+        notes=_normalize_for_index(notes),
         num_notes=len(notes),
-        custom_fields=custom_fields,
+        custom_fields=_normalize_for_index(custom_fields),
         custom_field_count=len(doc.custom_fields.all()),
         has_custom_fields=len(custom_fields) > 0,
         custom_fields_id=custom_fields_ids if custom_fields_ids else None,
-        owner=doc.owner.username if doc.owner else None,
+        owner=_normalize_for_index(doc.owner.username if doc.owner else None),
         owner_id=doc.owner.id if doc.owner else None,
         has_owner=doc.owner is not None,
         viewer_id=viewer_ids if viewer_ids else None,
         checksum=doc.checksum,
         page_count=doc.page_count,
-        original_filename=doc.original_filename,
+        original_filename=_normalize_for_index(doc.original_filename),
         is_shared=len(viewer_ids) > 0,
     )
     logger.debug(f"Index updated for document {doc.pk}.")
@@ -421,7 +434,7 @@ class LocalDateParser(English):
 class DelayedFullTextQuery(DelayedQuery):
     def _get_query(self) -> tuple:
-        q_str = self.query_params["query"]
+        q_str = _normalize_for_index(self.query_params["query"]) or ""
         q_str = rewrite_natural_date_keywords(q_str)
         qp = MultifieldParser(
             [
@@ -460,7 +473,12 @@ class DelayedFullTextQuery(DelayedQuery):
 class DelayedMoreLikeThisQuery(DelayedQuery):
     def _get_query(self) -> tuple:
         more_like_doc_id = int(self.query_params["more_like_id"])
-        content = Document.objects.get(id=more_like_doc_id).content
+        content = (
+            _normalize_for_index(
+                Document.objects.get(id=more_like_doc_id).content,
+            )
+            or ""
+        )
 
         docnum = self.searcher.document_number(id=more_like_doc_id)
         kts = self.searcher.key_terms_from_text(
@@ -488,6 +506,7 @@ def autocomplete(
     Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
     and without scoring
     """
+    term = _normalize_for_index(term) or ""
     terms = []
 
     with ix.searcher(weighting=TF_IDF()) as s:
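
Note: normalizing in both update_document and the query paths (DelayedFullTextQuery, DelayedMoreLikeThisQuery, autocomplete) is what makes the fix symmetric: indexed text and query text collapse to the same code points before Whoosh tokenizes them. Roughly:

import unicodedata

def _normalize_for_index(value):  # as defined in the hunk above
    return unicodedata.normalize("NFC", value) if value is not None else None

indexed = _normalize_for_index("certida\u0303o de nascimento")  # NFD content
query = _normalize_for_index("certidão")                        # NFC user query

assert query in indexed                      # same code points, so terms line up
assert _normalize_for_index(None) is None    # optional fields pass through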

View File

@@ -2,10 +2,12 @@ from __future__ import annotations
 
 import logging
 import re
+import unicodedata
 from fnmatch import fnmatch
 from fnmatch import translate as fnmatch_translate
 from typing import TYPE_CHECKING
 
+from django.db.models import Q
 from rest_framework import serializers
 
 from documents.data_models import ConsumableDocument
@@ -21,6 +23,7 @@ from documents.models import Workflow
 from documents.models import WorkflowTrigger
 from documents.permissions import get_objects_for_user_owner_aware
 from documents.regex import safe_regex_search
+from documents.utils import normalize_nfc
 
 if TYPE_CHECKING:
     from django.db.models import QuerySet
@@ -30,6 +33,34 @@ if TYPE_CHECKING:
 logger = logging.getLogger("paperless.matching")
 
 
+def _normalize_glob_value(value: str) -> str:
+    """Normalize strings for glob-style matching (case-insensitive)."""
+    return (normalize_nfc(value) or "").casefold()
+
+
+def _normalized_fnmatch(name: str, pattern: str) -> bool:
+    """Canonicalize Unicode and compare using fnmatch semantics."""
+    return fnmatch(_normalize_glob_value(name), _normalize_glob_value(pattern))
+
+
+def _glob_regex_variants(pattern: str) -> list[str]:
+    """
+    Build regex patterns that match both NFC and NFD forms of a glob pattern.
+
+    Using both forms lets DB prefilters remain Unicode-normalization agnostic.
+    """
+    regexes = set()
+    for normalized in {
+        normalize_nfc(pattern) or "",
+        unicodedata.normalize("NFD", pattern),
+    }:
+        regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
+        regexes.add(regex)
+    return list(regexes)
+
+
 def log_reason(
     matching_model: MatchingModel | WorkflowTrigger,
     document: Document,
@@ -305,9 +336,9 @@ def consumable_document_matches_workflow(
     if (
         trigger.filter_filename is not None
         and len(trigger.filter_filename) > 0
-        and not fnmatch(
-            document.original_file.name.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_file.name,
+            trigger.filter_filename,
         )
     ):
         reason = (
@@ -328,7 +359,7 @@ def consumable_document_matches_workflow(
     if (
         trigger.filter_path is not None
         and len(trigger.filter_path) > 0
-        and not fnmatch(
+        and not _normalized_fnmatch(
             match_against,
             trigger.filter_path,
         )
@@ -492,9 +523,9 @@ def existing_document_matches_workflow(
         trigger.filter_filename is not None
        and len(trigger.filter_filename) > 0
         and document.original_filename is not None
-        and not fnmatch(
-            document.original_filename.lower(),
-            trigger.filter_filename.lower(),
+        and not _normalized_fnmatch(
+            document.original_filename,
+            trigger.filter_filename,
         )
     ):
         return (
@@ -573,8 +604,11 @@ def prefilter_documents_by_workflowtrigger(
         documents = documents.annotate(**annotations).filter(custom_field_q)
 
     if trigger.filter_filename:
-        regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
-        documents = documents.filter(original_filename__iregex=regex)
+        regexes = _glob_regex_variants(trigger.filter_filename)
+        filename_q = Q()
+        for regex in regexes:
+            filename_q |= Q(original_filename__iregex=regex)
+        documents = documents.filter(filename_q)
 
     return documents
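
Note: two layers change here. In-Python matching (_normalized_fnmatch) canonicalizes both sides, while the database prefilter cannot rewrite stored rows and so ORs one iregex per normalization form. A sketch of both behaviors, using fnmatch.translate's raw anchored output rather than the anchor-stripped form the iregex lookup needs:

import re
import unicodedata
from fnmatch import fnmatch
from fnmatch import translate as fnmatch_translate

def norm(value: str) -> str:  # mirrors _normalize_glob_value
    return unicodedata.normalize("NFC", value).casefold()

# In-Python path: a decomposed, differently-cased filename still matches
assert fnmatch(norm("RAC\u030cUN-2024.pdf"), norm("*račun*"))

# Prefilter path: one regex per form catches rows however they were stored
variants = {unicodedata.normalize(form, "*račun*") for form in ("NFC", "NFD")}
regexes = [fnmatch_translate(v) for v in variants]
assert any(re.search(rx, "rac\u030cun.pdf") for rx in regexes)
assert any(re.search(rx, "račun.pdf") for rx in regexes)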

View File

@@ -89,6 +89,23 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
         self.assertEqual(len(results), 0)
         self.assertCountEqual(response.data["all"], [])
 
+    def test_search_handles_diacritics_normalization(self):
+        doc = Document.objects.create(
+            title="certida\u0303o de nascimento",
+            content="birth record without keyword",
+            checksum="D",
+            pk=10,
+        )
+        with AsyncWriter(index.open_index()) as writer:
+            index.update_document(writer, doc)
+
+        response = self.client.get("/api/documents/?query=certidão")
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        results = response.data["results"]
+        self.assertEqual(response.data["count"], 1)
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]["id"], doc.id)
+
     def test_search_custom_field_ordering(self):
         custom_field = CustomField.objects.create(
             name="Sortable field",

View File

@@ -290,6 +290,23 @@ class TestConsumer(
         self._assert_first_last_send_progress()
 
+    def test_override_filename_normalized(self):
+        filename = self.get_test_file()
+        override_filename = "Inhaltsu\u0308bersicht.pdf"
+
+        with self.get_consumer(
+            filename,
+            DocumentMetadataOverrides(filename=override_filename),
+        ) as consumer:
+            consumer.run()
+
+            document = Document.objects.first()
+
+            self.assertIsNotNone(document)
+            self.assertEqual(document.original_filename, "Inhaltsübersicht.pdf")
+            self.assertEqual(document.title, "Inhaltsübersicht")
+
+        self._assert_first_last_send_progress()
+
     def testOverrideTitle(self):
         with self.get_consumer(
             self.get_test_file(),
@@ -304,6 +321,25 @@ class TestConsumer(
         self.assertEqual(document.title, "Override Title")
         self._assert_first_last_send_progress()
 
+    @override_settings(FILENAME_FORMAT="{{ title }}")
+    def test_filename_format_normalized(self):
+        filename = self.get_test_file()
+        title = "Inhaltsu\u0308bersicht Faszination"
+
+        with self.get_consumer(
+            filename,
+            DocumentMetadataOverrides(title=title),
+        ) as consumer:
+            consumer.run()
+
+            document = Document.objects.first()
+
+            self.assertIsNotNone(document)
+            self.assertEqual(document.title, "Inhaltsübersicht Faszination")
+            self.assertEqual(document.filename, "Inhaltsübersicht Faszination.pdf")
+            self.assertIsFile(document.source_path)
+
+        self._assert_first_last_send_progress()
+
     def testOverrideCorrespondent(self):
         c = Correspondent.objects.create(name="test")

View File

@@ -557,6 +557,50 @@ class TestWorkflows(
             expected_str = f"Document filename {test_file.name} does not match"
             self.assertIn(expected_str, cm.output[1])
 
+    def test_workflow_match_filename_diacritics_normalized(self):
+        """
+        GIVEN:
+            - Consumption workflow filtering on filename with diacritics
+        WHEN:
+            - File with decomposed Unicode filename is consumed
+        THEN:
+            - Workflow still matches and applies overrides
+        """
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
+            sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
+            filter_filename="*račun*",
+        )
+        action = WorkflowAction.objects.create(
+            assign_title="Diacritics matched",
+        )
+        action.save()
+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()
+
+        decomposed_name = "rac\u030cun.pdf"
+        test_file = shutil.copy(
+            self.SAMPLE_DIR / "simple.pdf",
+            self.dirs.scratch_dir / decomposed_name,
+        )
+
+        with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
+            tasks.consume_file(
+                ConsumableDocument(
+                    source=DocumentSource.ConsumeFolder,
+                    original_file=test_file,
+                ),
+                None,
+            )
+
+        document = Document.objects.first()
+        self.assertEqual(document.title, "Diacritics matched")
+
     def test_workflow_no_match_path(self):
         """
         GIVEN:
@@ -946,6 +990,35 @@ class TestWorkflows(
         self.assertEqual(doc.correspondent, self.c2)
         self.assertEqual(doc.title, f"Doc created in {created.year}")
 
+    def test_document_added_filename_diacritics_normalized(self):
+        trigger = WorkflowTrigger.objects.create(
+            type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
+            filter_filename="*račun*",
+        )
+        action = WorkflowAction.objects.create(
+            assign_title="Matched diacritics",
+        )
+        w = Workflow.objects.create(
+            name="Workflow 1",
+            order=0,
+        )
+        w.triggers.add(trigger)
+        w.actions.add(action)
+        w.save()
+
+        doc = Document.objects.create(
+            title="sample test",
+            correspondent=self.c,
+            original_filename="rac\u030cun.pdf",
+        )
+
+        document_consumption_finished.send(
+            sender=self.__class__,
+            document=doc,
+        )
+
+        self.assertEqual(doc.title, "Matched diacritics")
+
     def test_document_added_no_match_filename(self):
         trigger = WorkflowTrigger.objects.create(
             type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,

View File

@@ -1,5 +1,7 @@
 import logging
 import shutil
+import unicodedata
+from os import PathLike
 from os import utime
 from pathlib import Path
 from subprocess import CompletedProcess
@@ -16,6 +18,14 @@ def _coerce_to_path(
     return Path(source).resolve(), Path(dest).resolve()
 
 
+def normalize_nfc(value: str | PathLike[str] | None) -> str | None:
+    """Return NFC-normalized string for filesystem-safe comparisons."""
+    if value is None:
+        return None
+    return unicodedata.normalize("NFC", str(value))
+
+
 def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None:
     """
     Copies only the m_time and a_time attributes from source to destination.
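
Note: the new helper accepts str, PathLike, or None, always coerces through str(), and returns str (or None), which is why path-valued callers re-wrap the result, as the file-handling hunks above do with Path(normalize_nfc(...)). A usage sketch with the definition restated from the hunk:

import unicodedata
from os import PathLike
from pathlib import Path

def normalize_nfc(value: str | PathLike[str] | None) -> str | None:
    if value is None:  # None passes through untouched
        return None
    return unicodedata.normalize("NFC", str(value))

assert normalize_nfc(None) is None
assert normalize_nfc(Path("a\u0308.txt")) == "ä.txt"   # PathLike coerced via str()
assert normalize_nfc("ä.txt") == normalize_nfc("a\u0308.txt")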