mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-08 21:24:26 -06:00
Compare commits
6 Commits
feature-be
...
feature-un
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e0eb6ea576 | ||
|
|
9d489200d9 | ||
|
|
99294d93f9 | ||
|
|
d40f7b7e91 | ||
|
|
8a14548434 | ||
|
|
b145878d50 |
20
.github/workflows/ci.yml
vendored
20
.github/workflows/ci.yml
vendored
@@ -115,7 +115,7 @@ jobs:
|
||||
--frozen \
|
||||
mkdocs gh-deploy --force --no-history
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v5
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: documentation
|
||||
path: site/
|
||||
@@ -215,7 +215,7 @@ jobs:
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
id: cache-frontend-deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -248,7 +248,7 @@ jobs:
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
id: cache-frontend-deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -301,7 +301,7 @@ jobs:
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
id: cache-frontend-deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -333,7 +333,7 @@ jobs:
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
id: cache-frontend-deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
@@ -476,7 +476,7 @@ jobs:
|
||||
docker cp frontend-extract:/usr/src/paperless/src/documents/static/frontend src/documents/static/frontend/
|
||||
- name: Upload frontend artifact
|
||||
if: steps.build-vars.outputs.can-push == 'true'
|
||||
uses: actions/upload-artifact@v5
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: frontend-compiled
|
||||
path: src/documents/static/frontend/
|
||||
@@ -510,12 +510,12 @@ jobs:
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -qq --no-install-recommends gettext liblept5
|
||||
- name: Download frontend artifact
|
||||
uses: actions/download-artifact@v6
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: frontend-compiled
|
||||
path: src/documents/static/frontend/
|
||||
- name: Download documentation artifact
|
||||
uses: actions/download-artifact@v6
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: documentation
|
||||
path: docs/_build/html/
|
||||
@@ -578,7 +578,7 @@ jobs:
|
||||
sudo chown -R 1000:1000 paperless-ngx/
|
||||
tar -cJf paperless-ngx.tar.xz paperless-ngx/
|
||||
- name: Upload release artifact
|
||||
uses: actions/upload-artifact@v5
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: release
|
||||
path: dist/paperless-ngx.tar.xz
|
||||
@@ -595,7 +595,7 @@ jobs:
|
||||
if: github.ref_type == 'tag' && (startsWith(github.ref_name, 'v') || contains(github.ref_name, '-beta.rc'))
|
||||
steps:
|
||||
- name: Download release artifact
|
||||
uses: actions/download-artifact@v6
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: release
|
||||
path: ./
|
||||
|
||||
2
.github/workflows/repo-maintenance.yml
vendored
2
.github/workflows/repo-maintenance.yml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
if: github.repository_owner == 'paperless-ngx'
|
||||
runs-on: ubuntu-24.04
|
||||
steps:
|
||||
- uses: dessant/lock-threads@v5
|
||||
- uses: dessant/lock-threads@v6
|
||||
with:
|
||||
issue-inactive-days: '30'
|
||||
pr-inactive-days: '30'
|
||||
|
||||
2
.github/workflows/translate-strings.yml
vendored
2
.github/workflows/translate-strings.yml
vendored
@@ -47,7 +47,7 @@ jobs:
|
||||
cache-dependency-path: 'src-ui/pnpm-lock.yaml'
|
||||
- name: Cache frontend dependencies
|
||||
id: cache-frontend-deps
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v5
|
||||
with:
|
||||
path: |
|
||||
~/.pnpm-store
|
||||
|
||||
@@ -16,7 +16,6 @@ from pikepdf import Pdf
|
||||
from documents.converters import convert_from_tiff_to_pdf
|
||||
from documents.data_models import ConsumableDocument
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.models import Document
|
||||
from documents.models import Tag
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import StopConsumeTaskError
|
||||
@@ -116,24 +115,6 @@ class BarcodePlugin(ConsumeTaskPlugin):
|
||||
self._tiff_conversion_done = False
|
||||
self.barcodes: list[Barcode] = []
|
||||
|
||||
def _apply_detected_asn(self, detected_asn: int) -> None:
|
||||
"""
|
||||
Apply a detected ASN to metadata if allowed.
|
||||
"""
|
||||
if (
|
||||
self.metadata.skip_asn_if_exists
|
||||
and Document.global_objects.filter(
|
||||
archive_serial_number=detected_asn,
|
||||
).exists()
|
||||
):
|
||||
logger.info(
|
||||
f"Found ASN in barcode {detected_asn} but skipping because it already exists.",
|
||||
)
|
||||
return
|
||||
|
||||
logger.info(f"Found ASN in barcode: {detected_asn}")
|
||||
self.metadata.asn = detected_asn
|
||||
|
||||
def run(self) -> None:
|
||||
# Some operations may use PIL, override pixel setting if needed
|
||||
maybe_override_pixel_limit()
|
||||
@@ -205,8 +186,13 @@ class BarcodePlugin(ConsumeTaskPlugin):
|
||||
|
||||
# Update/overwrite an ASN if possible
|
||||
# After splitting, as otherwise each split document gets the same ASN
|
||||
if self.settings.barcode_enable_asn and (located_asn := self.asn) is not None:
|
||||
self._apply_detected_asn(located_asn)
|
||||
if (
|
||||
self.settings.barcode_enable_asn
|
||||
and not self.metadata.skip_asn
|
||||
and (located_asn := self.asn) is not None
|
||||
):
|
||||
logger.info(f"Found ASN in barcode: {located_asn}")
|
||||
self.metadata.asn = located_asn
|
||||
|
||||
def cleanup(self) -> None:
|
||||
self.temp_dir.cleanup()
|
||||
|
||||
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Literal
|
||||
|
||||
from celery import chain
|
||||
from celery import chord
|
||||
from celery import group
|
||||
from celery import shared_task
|
||||
@@ -37,42 +38,6 @@ if TYPE_CHECKING:
|
||||
logger: logging.Logger = logging.getLogger("paperless.bulk_edit")
|
||||
|
||||
|
||||
@shared_task(bind=True)
|
||||
def restore_archive_serial_numbers_task(
|
||||
self,
|
||||
backup: dict[int, int],
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
restore_archive_serial_numbers(backup)
|
||||
|
||||
|
||||
def release_archive_serial_numbers(doc_ids: list[int]) -> dict[int, int]:
|
||||
"""
|
||||
Clears ASNs on documents that are about to be replaced so new documents
|
||||
can be assigned ASNs without uniqueness collisions. Returns a backup map
|
||||
of doc_id -> previous ASN for potential restoration.
|
||||
"""
|
||||
qs = Document.objects.filter(
|
||||
id__in=doc_ids,
|
||||
archive_serial_number__isnull=False,
|
||||
).only("pk", "archive_serial_number")
|
||||
backup = dict(qs.values_list("pk", "archive_serial_number"))
|
||||
qs.update(archive_serial_number=None)
|
||||
logger.info(f"Released archive serial numbers for documents {list(backup.keys())}")
|
||||
return backup
|
||||
|
||||
|
||||
def restore_archive_serial_numbers(backup: dict[int, int]) -> None:
|
||||
"""
|
||||
Restores ASNs using the provided backup map, intended for
|
||||
rollback when replacement consumption fails.
|
||||
"""
|
||||
for doc_id, asn in backup.items():
|
||||
Document.objects.filter(pk=doc_id).update(archive_serial_number=asn)
|
||||
logger.info(f"Restored archive serial numbers for documents {list(backup.keys())}")
|
||||
|
||||
|
||||
def set_correspondent(
|
||||
doc_ids: list[int],
|
||||
correspondent: Correspondent,
|
||||
@@ -421,7 +386,6 @@ def merge(
|
||||
|
||||
merged_pdf = pikepdf.new()
|
||||
version: str = merged_pdf.pdf_version
|
||||
handoff_asn: int | None = None
|
||||
# use doc_ids to preserve order
|
||||
for doc_id in doc_ids:
|
||||
doc = qs.get(id=doc_id)
|
||||
@@ -437,8 +401,6 @@ def merge(
|
||||
version = max(version, pdf.pdf_version)
|
||||
merged_pdf.pages.extend(pdf.pages)
|
||||
affected_docs.append(doc.id)
|
||||
if handoff_asn is None and doc.archive_serial_number is not None:
|
||||
handoff_asn = doc.archive_serial_number
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
f"Error merging document {doc.id}, it will not be included in the merge: {e}",
|
||||
@@ -464,8 +426,6 @@ def merge(
|
||||
DocumentMetadataOverrides.from_document(metadata_document)
|
||||
)
|
||||
overrides.title = metadata_document.title + " (merged)"
|
||||
if metadata_document.archive_serial_number is not None:
|
||||
handoff_asn = metadata_document.archive_serial_number
|
||||
else:
|
||||
overrides = DocumentMetadataOverrides()
|
||||
else:
|
||||
@@ -473,11 +433,8 @@ def merge(
|
||||
|
||||
if user is not None:
|
||||
overrides.owner_id = user.id
|
||||
if not delete_originals:
|
||||
overrides.skip_asn_if_exists = True
|
||||
|
||||
if delete_originals and handoff_asn is not None:
|
||||
overrides.asn = handoff_asn
|
||||
# Avoid copying or detecting ASN from merged PDFs to prevent collision
|
||||
overrides.skip_asn = True
|
||||
|
||||
logger.info("Adding merged document to the task queue.")
|
||||
|
||||
@@ -490,20 +447,12 @@ def merge(
|
||||
)
|
||||
|
||||
if delete_originals:
|
||||
backup = release_archive_serial_numbers(affected_docs)
|
||||
logger.info(
|
||||
"Queueing removal of original documents after consumption of merged document",
|
||||
)
|
||||
try:
|
||||
consume_task.apply_async(
|
||||
link=[delete.si(affected_docs)],
|
||||
link_error=[restore_archive_serial_numbers_task.s(backup)],
|
||||
)
|
||||
except Exception:
|
||||
restore_archive_serial_numbers(backup)
|
||||
raise
|
||||
else:
|
||||
consume_task.delay()
|
||||
chain(consume_task, delete.si(affected_docs)).delay()
|
||||
else:
|
||||
consume_task.delay()
|
||||
|
||||
return "OK"
|
||||
|
||||
@@ -545,8 +494,6 @@ def split(
|
||||
overrides.title = f"{doc.title} (split {idx + 1})"
|
||||
if user is not None:
|
||||
overrides.owner_id = user.id
|
||||
if not delete_originals:
|
||||
overrides.skip_asn_if_exists = True
|
||||
logger.info(
|
||||
f"Adding split document with pages {split_doc} to the task queue.",
|
||||
)
|
||||
@@ -561,20 +508,10 @@ def split(
|
||||
)
|
||||
|
||||
if delete_originals:
|
||||
backup = release_archive_serial_numbers([doc.id])
|
||||
logger.info(
|
||||
"Queueing removal of original document after consumption of the split documents",
|
||||
)
|
||||
try:
|
||||
chord(
|
||||
header=consume_tasks,
|
||||
body=delete.si([doc.id]),
|
||||
).apply_async(
|
||||
link_error=[restore_archive_serial_numbers_task.s(backup)],
|
||||
)
|
||||
except Exception:
|
||||
restore_archive_serial_numbers(backup)
|
||||
raise
|
||||
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
|
||||
else:
|
||||
group(consume_tasks).delay()
|
||||
|
||||
@@ -677,10 +614,7 @@ def edit_pdf(
|
||||
)
|
||||
if user is not None:
|
||||
overrides.owner_id = user.id
|
||||
if not delete_original:
|
||||
overrides.skip_asn_if_exists = True
|
||||
if delete_original and len(pdf_docs) == 1:
|
||||
overrides.asn = doc.archive_serial_number
|
||||
|
||||
for idx, pdf in enumerate(pdf_docs, start=1):
|
||||
filepath: Path = (
|
||||
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
|
||||
@@ -699,17 +633,7 @@ def edit_pdf(
|
||||
)
|
||||
|
||||
if delete_original:
|
||||
backup = release_archive_serial_numbers([doc.id])
|
||||
try:
|
||||
chord(
|
||||
header=consume_tasks,
|
||||
body=delete.si([doc.id]),
|
||||
).apply_async(
|
||||
link_error=[restore_archive_serial_numbers_task.s(backup)],
|
||||
)
|
||||
except Exception:
|
||||
restore_archive_serial_numbers(backup)
|
||||
raise
|
||||
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
|
||||
else:
|
||||
group(consume_tasks).delay()
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ from documents.signals.handlers import run_workflows
|
||||
from documents.templating.workflows import parse_w_workflow_placeholders
|
||||
from documents.utils import copy_basic_file_stats
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import normalize_nfc
|
||||
from documents.utils import run_subprocess
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
||||
@@ -111,7 +112,12 @@ class ConsumerPluginMixin:
|
||||
|
||||
self.renew_logging_group()
|
||||
|
||||
self.filename = self.metadata.filename or self.input_doc.original_file.name
|
||||
self.metadata.filename = normalize_nfc(self.metadata.filename)
|
||||
self.metadata.title = normalize_nfc(self.metadata.title)
|
||||
|
||||
self.filename = normalize_nfc(
|
||||
self.metadata.filename or self.input_doc.original_file.name,
|
||||
)
|
||||
|
||||
def _send_progress(
|
||||
self,
|
||||
@@ -652,6 +658,8 @@ class ConsumerPlugin(
|
||||
f"Error occurred parsing title override '{self.metadata.title}', falling back to original. Exception: {e}",
|
||||
)
|
||||
|
||||
title = normalize_nfc(title)
|
||||
|
||||
file_for_checksum = (
|
||||
self.unmodified_original
|
||||
if self.unmodified_original is not None
|
||||
@@ -696,7 +704,7 @@ class ConsumerPlugin(
|
||||
pk=self.metadata.storage_path_id,
|
||||
)
|
||||
|
||||
if self.metadata.asn is not None:
|
||||
if self.metadata.asn is not None and not self.metadata.skip_asn:
|
||||
document.archive_serial_number = self.metadata.asn
|
||||
|
||||
if self.metadata.owner_id:
|
||||
@@ -812,8 +820,8 @@ class ConsumerPreflightPlugin(
|
||||
"""
|
||||
Check that if override_asn is given, it is unique and within a valid range
|
||||
"""
|
||||
if self.metadata.asn is None:
|
||||
# if ASN is None
|
||||
if self.metadata.skip_asn or self.metadata.asn is None:
|
||||
# if skip is set or ASN is None
|
||||
return
|
||||
# Validate the range is above zero and less than uint32_t max
|
||||
# otherwise, Whoosh can't handle it in the index
|
||||
|
||||
@@ -30,7 +30,7 @@ class DocumentMetadataOverrides:
|
||||
change_users: list[int] | None = None
|
||||
change_groups: list[int] | None = None
|
||||
custom_fields: dict | None = None
|
||||
skip_asn_if_exists: bool = False
|
||||
skip_asn: bool = False
|
||||
|
||||
def update(self, other: "DocumentMetadataOverrides") -> "DocumentMetadataOverrides":
|
||||
"""
|
||||
@@ -50,8 +50,8 @@ class DocumentMetadataOverrides:
|
||||
self.storage_path_id = other.storage_path_id
|
||||
if other.owner_id is not None:
|
||||
self.owner_id = other.owner_id
|
||||
if other.skip_asn_if_exists:
|
||||
self.skip_asn_if_exists = True
|
||||
if other.skip_asn:
|
||||
self.skip_asn = True
|
||||
|
||||
# merge
|
||||
if self.tag_ids is None:
|
||||
|
||||
@@ -6,6 +6,7 @@ from django.conf import settings
|
||||
from documents.models import Document
|
||||
from documents.templating.filepath import validate_filepath_template_and_render
|
||||
from documents.templating.utils import convert_format_str_to_template_format
|
||||
from documents.utils import normalize_nfc
|
||||
|
||||
|
||||
def create_source_path_directory(source_path: Path) -> None:
|
||||
@@ -55,11 +56,11 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
|
||||
"""
|
||||
if archive_filename:
|
||||
old_filename: Path | None = (
|
||||
Path(doc.archive_filename) if doc.archive_filename else None
|
||||
Path(normalize_nfc(doc.archive_filename)) if doc.archive_filename else None
|
||||
)
|
||||
root = settings.ARCHIVE_DIR
|
||||
else:
|
||||
old_filename = Path(doc.filename) if doc.filename else None
|
||||
old_filename = Path(normalize_nfc(doc.filename)) if doc.filename else None
|
||||
root = settings.ORIGINALS_DIR
|
||||
|
||||
# If generating archive filenames, try to make a name that is similar to
|
||||
@@ -91,7 +92,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
|
||||
)
|
||||
if new_filename == old_filename:
|
||||
# still the same as before.
|
||||
return new_filename
|
||||
return Path(normalize_nfc(str(new_filename)))
|
||||
|
||||
if (root / new_filename).exists():
|
||||
counter += 1
|
||||
@@ -119,7 +120,7 @@ def format_filename(document: Document, template_str: str) -> str | None:
|
||||
"none",
|
||||
) # backward compatibility
|
||||
|
||||
return rendered_filename
|
||||
return normalize_nfc(rendered_filename)
|
||||
|
||||
|
||||
def generate_filename(
|
||||
@@ -174,4 +175,4 @@ def generate_filename(
|
||||
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
|
||||
full_path = full_path.with_suffix(full_path.suffix + ".gpg")
|
||||
|
||||
return full_path
|
||||
return Path(normalize_nfc(str(full_path)))
|
||||
|
||||
@@ -41,6 +41,7 @@ from documents.models import PaperlessTask
|
||||
from documents.models import ShareLink
|
||||
from documents.models import StoragePath
|
||||
from documents.models import Tag
|
||||
from documents.utils import normalize_nfc
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Callable
|
||||
@@ -162,7 +163,11 @@ class TitleContentFilter(Filter):
|
||||
def filter(self, qs, value):
|
||||
value = value.strip() if isinstance(value, str) else value
|
||||
if value:
|
||||
return qs.filter(Q(title__icontains=value) | Q(content__icontains=value))
|
||||
normalized = normalize_nfc(value) or ""
|
||||
folded = normalized.casefold()
|
||||
return qs.filter(
|
||||
Q(title__icontains=folded) | Q(content__icontains=folded),
|
||||
)
|
||||
else:
|
||||
return qs
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
import unicodedata
|
||||
from collections import Counter
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
@@ -58,6 +59,14 @@ if TYPE_CHECKING:
|
||||
logger = logging.getLogger("paperless.index")
|
||||
|
||||
|
||||
def _normalize_for_index(value: str | None) -> str | None:
|
||||
"""Normalize text to NFC for consistent search/index matching."""
|
||||
|
||||
if value is None:
|
||||
return None
|
||||
return unicodedata.normalize("NFC", value)
|
||||
|
||||
|
||||
def get_schema() -> Schema:
|
||||
return Schema(
|
||||
id=NUMERIC(stored=True, unique=True),
|
||||
@@ -163,37 +172,41 @@ def update_document(writer: AsyncWriter, doc: Document) -> None:
|
||||
viewer_ids: str = ",".join([str(u.id) for u in users_with_perms])
|
||||
writer.update_document(
|
||||
id=doc.pk,
|
||||
title=doc.title,
|
||||
content=doc.content,
|
||||
correspondent=doc.correspondent.name if doc.correspondent else None,
|
||||
title=_normalize_for_index(doc.title),
|
||||
content=_normalize_for_index(doc.content),
|
||||
correspondent=_normalize_for_index(
|
||||
doc.correspondent.name if doc.correspondent else None,
|
||||
),
|
||||
correspondent_id=doc.correspondent.id if doc.correspondent else None,
|
||||
has_correspondent=doc.correspondent is not None,
|
||||
tag=tags if tags else None,
|
||||
tag=_normalize_for_index(tags) if tags else None,
|
||||
tag_id=tags_ids if tags_ids else None,
|
||||
has_tag=len(tags) > 0,
|
||||
type=doc.document_type.name if doc.document_type else None,
|
||||
type=_normalize_for_index(
|
||||
doc.document_type.name if doc.document_type else None,
|
||||
),
|
||||
type_id=doc.document_type.id if doc.document_type else None,
|
||||
has_type=doc.document_type is not None,
|
||||
created=datetime.combine(doc.created, time.min),
|
||||
added=doc.added,
|
||||
asn=asn,
|
||||
modified=doc.modified,
|
||||
path=doc.storage_path.name if doc.storage_path else None,
|
||||
path=_normalize_for_index(doc.storage_path.name if doc.storage_path else None),
|
||||
path_id=doc.storage_path.id if doc.storage_path else None,
|
||||
has_path=doc.storage_path is not None,
|
||||
notes=notes,
|
||||
notes=_normalize_for_index(notes),
|
||||
num_notes=len(notes),
|
||||
custom_fields=custom_fields,
|
||||
custom_fields=_normalize_for_index(custom_fields),
|
||||
custom_field_count=len(doc.custom_fields.all()),
|
||||
has_custom_fields=len(custom_fields) > 0,
|
||||
custom_fields_id=custom_fields_ids if custom_fields_ids else None,
|
||||
owner=doc.owner.username if doc.owner else None,
|
||||
owner=_normalize_for_index(doc.owner.username if doc.owner else None),
|
||||
owner_id=doc.owner.id if doc.owner else None,
|
||||
has_owner=doc.owner is not None,
|
||||
viewer_id=viewer_ids if viewer_ids else None,
|
||||
checksum=doc.checksum,
|
||||
page_count=doc.page_count,
|
||||
original_filename=doc.original_filename,
|
||||
original_filename=_normalize_for_index(doc.original_filename),
|
||||
is_shared=len(viewer_ids) > 0,
|
||||
)
|
||||
logger.debug(f"Index updated for document {doc.pk}.")
|
||||
@@ -421,7 +434,7 @@ class LocalDateParser(English):
|
||||
|
||||
class DelayedFullTextQuery(DelayedQuery):
|
||||
def _get_query(self) -> tuple:
|
||||
q_str = self.query_params["query"]
|
||||
q_str = _normalize_for_index(self.query_params["query"]) or ""
|
||||
q_str = rewrite_natural_date_keywords(q_str)
|
||||
qp = MultifieldParser(
|
||||
[
|
||||
@@ -460,7 +473,12 @@ class DelayedFullTextQuery(DelayedQuery):
|
||||
class DelayedMoreLikeThisQuery(DelayedQuery):
|
||||
def _get_query(self) -> tuple:
|
||||
more_like_doc_id = int(self.query_params["more_like_id"])
|
||||
content = Document.objects.get(id=more_like_doc_id).content
|
||||
content = (
|
||||
_normalize_for_index(
|
||||
Document.objects.get(id=more_like_doc_id).content,
|
||||
)
|
||||
or ""
|
||||
)
|
||||
|
||||
docnum = self.searcher.document_number(id=more_like_doc_id)
|
||||
kts = self.searcher.key_terms_from_text(
|
||||
@@ -488,6 +506,7 @@ def autocomplete(
|
||||
Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
|
||||
and without scoring
|
||||
"""
|
||||
term = _normalize_for_index(term) or ""
|
||||
terms = []
|
||||
|
||||
with ix.searcher(weighting=TF_IDF()) as s:
|
||||
|
||||
@@ -2,10 +2,12 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
from fnmatch import fnmatch
|
||||
from fnmatch import translate as fnmatch_translate
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.db.models import Q
|
||||
from rest_framework import serializers
|
||||
|
||||
from documents.data_models import ConsumableDocument
|
||||
@@ -21,6 +23,7 @@ from documents.models import Workflow
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.permissions import get_objects_for_user_owner_aware
|
||||
from documents.regex import safe_regex_search
|
||||
from documents.utils import normalize_nfc
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.db.models import QuerySet
|
||||
@@ -30,6 +33,34 @@ if TYPE_CHECKING:
|
||||
logger = logging.getLogger("paperless.matching")
|
||||
|
||||
|
||||
def _normalize_glob_value(value: str) -> str:
|
||||
"""Normalize strings for glob-style matching (case-insensitive)."""
|
||||
|
||||
return (normalize_nfc(value) or "").casefold()
|
||||
|
||||
|
||||
def _normalized_fnmatch(name: str, pattern: str) -> bool:
|
||||
"""Canonicalize Unicode and compare using fnmatch semantics."""
|
||||
|
||||
return fnmatch(_normalize_glob_value(name), _normalize_glob_value(pattern))
|
||||
|
||||
|
||||
def _glob_regex_variants(pattern: str) -> list[str]:
|
||||
"""
|
||||
Build regex patterns that match both NFC and NFD forms of a glob pattern.
|
||||
Using both forms lets DB prefilters remain Unicode-normalization agnostic.
|
||||
"""
|
||||
|
||||
regexes = set()
|
||||
for normalized in {
|
||||
normalize_nfc(pattern) or "",
|
||||
unicodedata.normalize("NFD", pattern),
|
||||
}:
|
||||
regex = fnmatch_translate(normalized).lstrip("^").rstrip("$")
|
||||
regexes.add(regex)
|
||||
return list(regexes)
|
||||
|
||||
|
||||
def log_reason(
|
||||
matching_model: MatchingModel | WorkflowTrigger,
|
||||
document: Document,
|
||||
@@ -305,9 +336,9 @@ def consumable_document_matches_workflow(
|
||||
if (
|
||||
trigger.filter_filename is not None
|
||||
and len(trigger.filter_filename) > 0
|
||||
and not fnmatch(
|
||||
document.original_file.name.lower(),
|
||||
trigger.filter_filename.lower(),
|
||||
and not _normalized_fnmatch(
|
||||
document.original_file.name,
|
||||
trigger.filter_filename,
|
||||
)
|
||||
):
|
||||
reason = (
|
||||
@@ -328,7 +359,7 @@ def consumable_document_matches_workflow(
|
||||
if (
|
||||
trigger.filter_path is not None
|
||||
and len(trigger.filter_path) > 0
|
||||
and not fnmatch(
|
||||
and not _normalized_fnmatch(
|
||||
match_against,
|
||||
trigger.filter_path,
|
||||
)
|
||||
@@ -492,9 +523,9 @@ def existing_document_matches_workflow(
|
||||
trigger.filter_filename is not None
|
||||
and len(trigger.filter_filename) > 0
|
||||
and document.original_filename is not None
|
||||
and not fnmatch(
|
||||
document.original_filename.lower(),
|
||||
trigger.filter_filename.lower(),
|
||||
and not _normalized_fnmatch(
|
||||
document.original_filename,
|
||||
trigger.filter_filename,
|
||||
)
|
||||
):
|
||||
return (
|
||||
@@ -573,8 +604,11 @@ def prefilter_documents_by_workflowtrigger(
|
||||
documents = documents.annotate(**annotations).filter(custom_field_q)
|
||||
|
||||
if trigger.filter_filename:
|
||||
regex = fnmatch_translate(trigger.filter_filename).lstrip("^").rstrip("$")
|
||||
documents = documents.filter(original_filename__iregex=regex)
|
||||
regexes = _glob_regex_variants(trigger.filter_filename)
|
||||
filename_q = Q()
|
||||
for regex in regexes:
|
||||
filename_q |= Q(original_filename__iregex=regex)
|
||||
documents = documents.filter(filename_q)
|
||||
|
||||
return documents
|
||||
|
||||
|
||||
@@ -89,6 +89,23 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(len(results), 0)
|
||||
self.assertCountEqual(response.data["all"], [])
|
||||
|
||||
def test_search_handles_diacritics_normalization(self):
|
||||
doc = Document.objects.create(
|
||||
title="certida\u0303o de nascimento",
|
||||
content="birth record without keyword",
|
||||
checksum="D",
|
||||
pk=10,
|
||||
)
|
||||
with AsyncWriter(index.open_index()) as writer:
|
||||
index.update_document(writer, doc)
|
||||
|
||||
response = self.client.get("/api/documents/?query=certidão")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
results = response.data["results"]
|
||||
self.assertEqual(response.data["count"], 1)
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0]["id"], doc.id)
|
||||
|
||||
def test_search_custom_field_ordering(self):
|
||||
custom_field = CustomField.objects.create(
|
||||
name="Sortable field",
|
||||
|
||||
@@ -602,21 +602,23 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
expected_filename,
|
||||
)
|
||||
self.assertEqual(consume_file_args[1].title, None)
|
||||
# No metadata_document_id, delete_originals False, so ASN should be None
|
||||
self.assertIsNone(consume_file_args[1].asn)
|
||||
self.assertTrue(consume_file_args[1].skip_asn)
|
||||
|
||||
# With metadata_document_id overrides
|
||||
result = bulk_edit.merge(doc_ids, metadata_document_id=metadata_document_id)
|
||||
consume_file_args, _ = mock_consume_file.call_args
|
||||
self.assertEqual(consume_file_args[1].title, "B (merged)")
|
||||
self.assertEqual(consume_file_args[1].created, self.doc2.created)
|
||||
self.assertTrue(consume_file_args[1].skip_asn)
|
||||
|
||||
self.assertEqual(result, "OK")
|
||||
|
||||
@mock.patch("documents.bulk_edit.delete.si")
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
@mock.patch("documents.bulk_edit.chain")
|
||||
def test_merge_and_delete_originals(
|
||||
self,
|
||||
mock_chain,
|
||||
mock_consume_file,
|
||||
mock_delete_documents,
|
||||
):
|
||||
@@ -630,12 +632,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
- Document deletion task should be called
|
||||
"""
|
||||
doc_ids = [self.doc1.id, self.doc2.id, self.doc3.id]
|
||||
self.doc1.archive_serial_number = 101
|
||||
self.doc2.archive_serial_number = 102
|
||||
self.doc3.archive_serial_number = 103
|
||||
self.doc1.save()
|
||||
self.doc2.save()
|
||||
self.doc3.save()
|
||||
|
||||
result = bulk_edit.merge(doc_ids, delete_originals=True)
|
||||
self.assertEqual(result, "OK")
|
||||
@@ -646,8 +642,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
|
||||
mock_consume_file.assert_called()
|
||||
mock_delete_documents.assert_called()
|
||||
consume_sig = mock_consume_file.return_value
|
||||
consume_sig.apply_async.assert_called_once()
|
||||
mock_chain.assert_called_once()
|
||||
|
||||
consume_file_args, _ = mock_consume_file.call_args
|
||||
self.assertEqual(
|
||||
@@ -655,7 +650,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
expected_filename,
|
||||
)
|
||||
self.assertEqual(consume_file_args[1].title, None)
|
||||
self.assertEqual(consume_file_args[1].asn, 101)
|
||||
self.assertTrue(consume_file_args[1].skip_asn)
|
||||
|
||||
delete_documents_args, _ = mock_delete_documents.call_args
|
||||
self.assertEqual(
|
||||
@@ -663,92 +658,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
doc_ids,
|
||||
)
|
||||
|
||||
self.doc1.refresh_from_db()
|
||||
self.doc2.refresh_from_db()
|
||||
self.doc3.refresh_from_db()
|
||||
self.assertIsNone(self.doc1.archive_serial_number)
|
||||
self.assertIsNone(self.doc2.archive_serial_number)
|
||||
self.assertIsNone(self.doc3.archive_serial_number)
|
||||
|
||||
@mock.patch("documents.bulk_edit.delete.si")
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
def test_merge_and_delete_originals_restore_on_failure(
|
||||
self,
|
||||
mock_consume_file,
|
||||
mock_delete_documents,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Existing documents
|
||||
WHEN:
|
||||
- Merge action with deleting documents is called with 1 document
|
||||
- Error occurs when queuing consume file task
|
||||
THEN:
|
||||
- Archive serial numbers are restored
|
||||
"""
|
||||
doc_ids = [self.doc1.id]
|
||||
self.doc1.archive_serial_number = 111
|
||||
self.doc1.save()
|
||||
sig = mock.Mock()
|
||||
sig.apply_async.side_effect = Exception("boom")
|
||||
mock_consume_file.return_value = sig
|
||||
|
||||
with self.assertRaises(Exception):
|
||||
bulk_edit.merge(doc_ids, delete_originals=True)
|
||||
|
||||
self.doc1.refresh_from_db()
|
||||
self.assertEqual(self.doc1.archive_serial_number, 111)
|
||||
|
||||
@mock.patch("documents.bulk_edit.delete.si")
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
def test_merge_and_delete_originals_metadata_handoff(
|
||||
self,
|
||||
mock_consume_file,
|
||||
mock_delete_documents,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Existing documents with ASNs
|
||||
WHEN:
|
||||
- Merge with delete_originals=True and metadata_document_id set
|
||||
THEN:
|
||||
- Handoff ASN uses metadata document ASN
|
||||
"""
|
||||
doc_ids = [self.doc1.id, self.doc2.id]
|
||||
self.doc1.archive_serial_number = 101
|
||||
self.doc2.archive_serial_number = 202
|
||||
self.doc1.save()
|
||||
self.doc2.save()
|
||||
|
||||
result = bulk_edit.merge(
|
||||
doc_ids,
|
||||
metadata_document_id=self.doc2.id,
|
||||
delete_originals=True,
|
||||
)
|
||||
self.assertEqual(result, "OK")
|
||||
|
||||
consume_file_args, _ = mock_consume_file.call_args
|
||||
self.assertEqual(consume_file_args[1].asn, 202)
|
||||
|
||||
def test_restore_archive_serial_numbers_task(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Existing document with no archive serial number
|
||||
WHEN:
|
||||
- Restore archive serial number task is called with backup data
|
||||
THEN:
|
||||
- Document archive serial number is restored
|
||||
"""
|
||||
self.doc1.archive_serial_number = 444
|
||||
self.doc1.save()
|
||||
Document.objects.filter(pk=self.doc1.id).update(archive_serial_number=None)
|
||||
|
||||
backup = {self.doc1.id: 444}
|
||||
bulk_edit.restore_archive_serial_numbers_task(backup)
|
||||
|
||||
self.doc1.refresh_from_db()
|
||||
self.assertEqual(self.doc1.archive_serial_number, 444)
|
||||
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
def test_merge_with_archive_fallback(self, mock_consume_file):
|
||||
"""
|
||||
@@ -817,7 +726,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
self.assertEqual(mock_consume_file.call_count, 2)
|
||||
consume_file_args, _ = mock_consume_file.call_args
|
||||
self.assertEqual(consume_file_args[1].title, "B (split 2)")
|
||||
self.assertIsNone(consume_file_args[1].asn)
|
||||
|
||||
self.assertEqual(result, "OK")
|
||||
|
||||
@@ -842,8 +750,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
"""
|
||||
doc_ids = [self.doc2.id]
|
||||
pages = [[1, 2], [3]]
|
||||
self.doc2.archive_serial_number = 200
|
||||
self.doc2.save()
|
||||
|
||||
result = bulk_edit.split(doc_ids, pages, delete_originals=True)
|
||||
self.assertEqual(result, "OK")
|
||||
@@ -861,42 +767,6 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
doc_ids,
|
||||
)
|
||||
|
||||
self.doc2.refresh_from_db()
|
||||
self.assertIsNone(self.doc2.archive_serial_number)
|
||||
|
||||
@mock.patch("documents.bulk_edit.delete.si")
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
@mock.patch("documents.bulk_edit.chord")
|
||||
def test_split_restore_on_failure(
|
||||
self,
|
||||
mock_chord,
|
||||
mock_consume_file,
|
||||
mock_delete_documents,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Existing documents
|
||||
WHEN:
|
||||
- Split action with deleting documents is called with 1 document and 2 page groups
|
||||
- Error occurs when queuing chord task
|
||||
THEN:
|
||||
- Archive serial numbers are restored
|
||||
"""
|
||||
doc_ids = [self.doc2.id]
|
||||
pages = [[1, 2]]
|
||||
self.doc2.archive_serial_number = 222
|
||||
self.doc2.save()
|
||||
|
||||
sig = mock.Mock()
|
||||
sig.apply_async.side_effect = Exception("boom")
|
||||
mock_chord.return_value = sig
|
||||
|
||||
result = bulk_edit.split(doc_ids, pages, delete_originals=True)
|
||||
self.assertEqual(result, "OK")
|
||||
|
||||
self.doc2.refresh_from_db()
|
||||
self.assertEqual(self.doc2.archive_serial_number, 222)
|
||||
|
||||
@mock.patch("documents.tasks.consume_file.delay")
|
||||
@mock.patch("pikepdf.Pdf.save")
|
||||
def test_split_with_errors(self, mock_save_pdf, mock_consume_file):
|
||||
@@ -1097,49 +967,10 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
mock_chord.return_value.delay.return_value = None
|
||||
doc_ids = [self.doc2.id]
|
||||
operations = [{"page": 1}, {"page": 2}]
|
||||
self.doc2.archive_serial_number = 250
|
||||
self.doc2.save()
|
||||
|
||||
result = bulk_edit.edit_pdf(doc_ids, operations, delete_original=True)
|
||||
self.assertEqual(result, "OK")
|
||||
mock_chord.assert_called_once()
|
||||
consume_file_args, _ = mock_consume_file.call_args
|
||||
self.assertEqual(consume_file_args[1].asn, 250)
|
||||
self.doc2.refresh_from_db()
|
||||
self.assertIsNone(self.doc2.archive_serial_number)
|
||||
|
||||
@mock.patch("documents.bulk_edit.delete.si")
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
@mock.patch("documents.bulk_edit.chord")
|
||||
def test_edit_pdf_restore_on_failure(
|
||||
self,
|
||||
mock_chord,
|
||||
mock_consume_file,
|
||||
mock_delete_documents,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Existing document
|
||||
WHEN:
|
||||
- edit_pdf is called with delete_original=True
|
||||
- Error occurs when queuing chord task
|
||||
THEN:
|
||||
- Archive serial numbers are restored
|
||||
"""
|
||||
doc_ids = [self.doc2.id]
|
||||
operations = [{"page": 1}]
|
||||
self.doc2.archive_serial_number = 333
|
||||
self.doc2.save()
|
||||
|
||||
sig = mock.Mock()
|
||||
sig.apply_async.side_effect = Exception("boom")
|
||||
mock_chord.return_value = sig
|
||||
|
||||
with self.assertRaises(Exception):
|
||||
bulk_edit.edit_pdf(doc_ids, operations, delete_original=True)
|
||||
|
||||
self.doc2.refresh_from_db()
|
||||
self.assertEqual(self.doc2.archive_serial_number, 333)
|
||||
|
||||
@mock.patch("documents.tasks.update_document_content_maybe_archive_file.delay")
|
||||
def test_edit_pdf_with_update_document(self, mock_update_document):
|
||||
|
||||
@@ -14,7 +14,6 @@ from django.test import override_settings
|
||||
from django.utils import timezone
|
||||
from guardian.core import ObjectPermissionChecker
|
||||
|
||||
from documents.barcodes import BarcodePlugin
|
||||
from documents.consumer import ConsumerError
|
||||
from documents.data_models import DocumentMetadataOverrides
|
||||
from documents.data_models import DocumentSource
|
||||
@@ -291,6 +290,23 @@ class TestConsumer(
|
||||
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
def test_override_filename_normalized(self):
|
||||
filename = self.get_test_file()
|
||||
override_filename = "Inhaltsu\u0308bersicht.pdf"
|
||||
|
||||
with self.get_consumer(
|
||||
filename,
|
||||
DocumentMetadataOverrides(filename=override_filename),
|
||||
) as consumer:
|
||||
consumer.run()
|
||||
|
||||
document = Document.objects.first()
|
||||
|
||||
self.assertIsNotNone(document)
|
||||
self.assertEqual(document.original_filename, "Inhaltsübersicht.pdf")
|
||||
self.assertEqual(document.title, "Inhaltsübersicht")
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
def testOverrideTitle(self):
|
||||
with self.get_consumer(
|
||||
self.get_test_file(),
|
||||
@@ -305,6 +321,25 @@ class TestConsumer(
|
||||
self.assertEqual(document.title, "Override Title")
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
@override_settings(FILENAME_FORMAT="{{ title }}")
|
||||
def test_filename_format_normalized(self):
|
||||
filename = self.get_test_file()
|
||||
title = "Inhaltsu\u0308bersicht Faszination"
|
||||
|
||||
with self.get_consumer(
|
||||
filename,
|
||||
DocumentMetadataOverrides(title=title),
|
||||
) as consumer:
|
||||
consumer.run()
|
||||
|
||||
document = Document.objects.first()
|
||||
|
||||
self.assertIsNotNone(document)
|
||||
self.assertEqual(document.title, "Inhaltsübersicht Faszination")
|
||||
self.assertEqual(document.filename, "Inhaltsübersicht Faszination.pdf")
|
||||
self.assertIsFile(document.source_path)
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
def testOverrideCorrespondent(self):
|
||||
c = Correspondent.objects.create(name="test")
|
||||
|
||||
@@ -413,6 +448,14 @@ class TestConsumer(
|
||||
self.assertEqual(document.archive_serial_number, 123)
|
||||
self._assert_first_last_send_progress()
|
||||
|
||||
def testMetadataOverridesSkipAsnPropagation(self):
|
||||
overrides = DocumentMetadataOverrides()
|
||||
incoming = DocumentMetadataOverrides(skip_asn=True)
|
||||
|
||||
overrides.update(incoming)
|
||||
|
||||
self.assertTrue(overrides.skip_asn)
|
||||
|
||||
def testOverrideTitlePlaceholders(self):
|
||||
c = Correspondent.objects.create(name="Correspondent Name")
|
||||
dt = DocumentType.objects.create(name="DocType Name")
|
||||
@@ -1233,46 +1276,3 @@ class PostConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
|
||||
r"sample\.pdf: Error while executing post-consume script: Command '\[.*\]' returned non-zero exit status \d+\.",
|
||||
):
|
||||
consumer.run_post_consume_script(doc)
|
||||
|
||||
|
||||
class TestMetadataOverrides(TestCase):
|
||||
def test_update_skip_asn_if_exists(self):
|
||||
base = DocumentMetadataOverrides()
|
||||
incoming = DocumentMetadataOverrides(skip_asn_if_exists=True)
|
||||
base.update(incoming)
|
||||
self.assertTrue(base.skip_asn_if_exists)
|
||||
|
||||
|
||||
class TestBarcodeApplyDetectedASN(TestCase):
|
||||
"""
|
||||
GIVEN:
|
||||
- Existing Documents with ASN 123
|
||||
WHEN:
|
||||
- A BarcodePlugin which detected an ASN
|
||||
THEN:
|
||||
- If skip_asn_if_exists is set, and ASN exists, do not set ASN
|
||||
- If skip_asn_if_exists is set, and ASN does not exist, set ASN
|
||||
"""
|
||||
|
||||
def test_apply_detected_asn_skips_existing_when_flag_set(self):
|
||||
doc = Document.objects.create(
|
||||
checksum="X1",
|
||||
title="D1",
|
||||
archive_serial_number=123,
|
||||
)
|
||||
metadata = DocumentMetadataOverrides(skip_asn_if_exists=True)
|
||||
plugin = BarcodePlugin(
|
||||
input_doc=mock.Mock(),
|
||||
metadata=metadata,
|
||||
status_mgr=mock.Mock(),
|
||||
base_tmp_dir=tempfile.gettempdir(),
|
||||
task_id="test-task",
|
||||
)
|
||||
|
||||
plugin._apply_detected_asn(123)
|
||||
self.assertIsNone(plugin.metadata.asn)
|
||||
|
||||
doc.hard_delete()
|
||||
|
||||
plugin._apply_detected_asn(123)
|
||||
self.assertEqual(plugin.metadata.asn, 123)
|
||||
|
||||
@@ -557,6 +557,50 @@ class TestWorkflows(
|
||||
expected_str = f"Document filename {test_file.name} does not match"
|
||||
self.assertIn(expected_str, cm.output[1])
|
||||
|
||||
def test_workflow_match_filename_diacritics_normalized(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- Consumption workflow filtering on filename with diacritics
|
||||
WHEN:
|
||||
- File with decomposed Unicode filename is consumed
|
||||
THEN:
|
||||
- Workflow still matches and applies overrides
|
||||
"""
|
||||
trigger = WorkflowTrigger.objects.create(
|
||||
type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
|
||||
sources=f"{DocumentSource.ApiUpload},{DocumentSource.ConsumeFolder},{DocumentSource.MailFetch}",
|
||||
filter_filename="*račun*",
|
||||
)
|
||||
action = WorkflowAction.objects.create(
|
||||
assign_title="Diacritics matched",
|
||||
)
|
||||
action.save()
|
||||
|
||||
w = Workflow.objects.create(
|
||||
name="Workflow 1",
|
||||
order=0,
|
||||
)
|
||||
w.triggers.add(trigger)
|
||||
w.actions.add(action)
|
||||
w.save()
|
||||
|
||||
decomposed_name = "rac\u030cun.pdf"
|
||||
test_file = shutil.copy(
|
||||
self.SAMPLE_DIR / "simple.pdf",
|
||||
self.dirs.scratch_dir / decomposed_name,
|
||||
)
|
||||
|
||||
with mock.patch("documents.tasks.ProgressManager", DummyProgressManager):
|
||||
tasks.consume_file(
|
||||
ConsumableDocument(
|
||||
source=DocumentSource.ConsumeFolder,
|
||||
original_file=test_file,
|
||||
),
|
||||
None,
|
||||
)
|
||||
document = Document.objects.first()
|
||||
self.assertEqual(document.title, "Diacritics matched")
|
||||
|
||||
def test_workflow_no_match_path(self):
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -946,6 +990,35 @@ class TestWorkflows(
|
||||
self.assertEqual(doc.correspondent, self.c2)
|
||||
self.assertEqual(doc.title, f"Doc created in {created.year}")
|
||||
|
||||
def test_document_added_filename_diacritics_normalized(self):
|
||||
trigger = WorkflowTrigger.objects.create(
|
||||
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
||||
filter_filename="*račun*",
|
||||
)
|
||||
action = WorkflowAction.objects.create(
|
||||
assign_title="Matched diacritics",
|
||||
)
|
||||
w = Workflow.objects.create(
|
||||
name="Workflow 1",
|
||||
order=0,
|
||||
)
|
||||
w.triggers.add(trigger)
|
||||
w.actions.add(action)
|
||||
w.save()
|
||||
|
||||
doc = Document.objects.create(
|
||||
title="sample test",
|
||||
correspondent=self.c,
|
||||
original_filename="rac\u030cun.pdf",
|
||||
)
|
||||
|
||||
document_consumption_finished.send(
|
||||
sender=self.__class__,
|
||||
document=doc,
|
||||
)
|
||||
|
||||
self.assertEqual(doc.title, "Matched diacritics")
|
||||
|
||||
def test_document_added_no_match_filename(self):
|
||||
trigger = WorkflowTrigger.objects.create(
|
||||
type=WorkflowTrigger.WorkflowTriggerType.DOCUMENT_ADDED,
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import logging
|
||||
import shutil
|
||||
import unicodedata
|
||||
from os import PathLike
|
||||
from os import utime
|
||||
from pathlib import Path
|
||||
from subprocess import CompletedProcess
|
||||
@@ -16,6 +18,14 @@ def _coerce_to_path(
|
||||
return Path(source).resolve(), Path(dest).resolve()
|
||||
|
||||
|
||||
def normalize_nfc(value: str | PathLike[str] | None) -> str | None:
|
||||
"""Return NFC-normalized string for filesystem-safe comparisons."""
|
||||
|
||||
if value is None:
|
||||
return None
|
||||
return unicodedata.normalize("NFC", str(value))
|
||||
|
||||
|
||||
def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None:
|
||||
"""
|
||||
Copies only the m_time and a_time attributes from source to destination.
|
||||
|
||||
Reference in New Issue
Block a user