mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-24 01:02:45 -05:00
![dependabot[bot]](/assets/img/avatar_default.png)
* Chore(deps): Bump the django group across 1 directory with 9 updates Bumps the django group with 9 updates in the / directory: | Package | From | To | | --- | --- | --- | | [django](https://github.com/django/django) | `5.1.8` | `5.2.5` | | [django-auditlog](https://github.com/jazzband/django-auditlog) | `3.1.2` | `3.2.1` | | [django-guardian](https://github.com/django-guardian/django-guardian) | `2.4.0` | `3.0.3` | | [django-multiselectfield](https://github.com/goinnn/django-multiselectfield) | `0.1.13` | `1.0.1` | | [django-soft-delete](https://github.com/san4ezy/django_softdelete) | `1.0.18` | `1.0.19` | | [djangorestframework](https://github.com/encode/django-rest-framework) | `3.16.0` | `3.16.1` | | [djangorestframework-guardian](https://github.com/rpkilby/django-rest-framework-guardian) | `0.3.0` | `0.4.0` | | [drf-spectacular-sidecar](https://github.com/tfranzel/drf-spectacular-sidecar) | `2025.4.1` | `2025.8.1` | | [pytest-django](https://github.com/pytest-dev/pytest-django) | `4.10.0` | `4.11.1` | Updates `django` from 5.1.8 to 5.2.5 - [Commits](https://github.com/django/django/compare/5.1.8...5.2.5) Updates `django-auditlog` from 3.1.2 to 3.2.1 - [Release notes](https://github.com/jazzband/django-auditlog/releases) - [Changelog](https://github.com/jazzband/django-auditlog/blob/master/CHANGELOG.md) - [Commits](https://github.com/jazzband/django-auditlog/compare/v3.1.2...v3.2.1) Updates `django-guardian` from 2.4.0 to 3.0.3 - [Release notes](https://github.com/django-guardian/django-guardian/releases) - [Commits](https://github.com/django-guardian/django-guardian/compare/v2.4.0...3.0.3) Updates `django-multiselectfield` from 0.1.13 to 1.0.1 - [Release notes](https://github.com/goinnn/django-multiselectfield/releases) - [Changelog](https://github.com/goinnn/django-multiselectfield/blob/master/CHANGES.rst) - [Commits](https://github.com/goinnn/django-multiselectfield/compare/v0.1.13...v1.0.1) Updates `django-soft-delete` from 1.0.18 to 1.0.19 - 
[Changelog](https://github.com/san4ezy/django_softdelete/blob/master/CHANGELOG.md) - [Commits](https://github.com/san4ezy/django_softdelete/commits) Updates `djangorestframework` from 3.16.0 to 3.16.1 - [Release notes](https://github.com/encode/django-rest-framework/releases) - [Commits](https://github.com/encode/django-rest-framework/compare/3.16.0...3.16.1) Updates `djangorestframework-guardian` from 0.3.0 to 0.4.0 - [Changelog](https://github.com/rpkilby/django-rest-framework-guardian/blob/master/CHANGELOG) - [Commits](https://github.com/rpkilby/django-rest-framework-guardian/compare/0.3.0...0.4.0) Updates `drf-spectacular-sidecar` from 2025.4.1 to 2025.8.1 - [Commits](https://github.com/tfranzel/drf-spectacular-sidecar/compare/2025.4.1...2025.8.1) Updates `pytest-django` from 4.10.0 to 4.11.1 - [Release notes](https://github.com/pytest-dev/pytest-django/releases) - [Changelog](https://github.com/pytest-dev/pytest-django/blob/main/docs/changelog.rst) - [Commits](https://github.com/pytest-dev/pytest-django/compare/v4.10.0...v4.11.1) --- updated-dependencies: - dependency-name: django dependency-version: 5.2.5 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: django - dependency-name: django-auditlog dependency-version: 3.2.1 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: django - dependency-name: django-guardian dependency-version: 3.0.3 dependency-type: direct:production update-type: version-update:semver-major dependency-group: django - dependency-name: django-multiselectfield dependency-version: 1.0.1 dependency-type: direct:production update-type: version-update:semver-major dependency-group: django - dependency-name: django-soft-delete dependency-version: 1.0.19 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: django - dependency-name: djangorestframework dependency-version: 3.16.1 dependency-type: direct:production 
update-type: version-update:semver-patch dependency-group: django - dependency-name: djangorestframework-guardian dependency-version: 0.4.0 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: django - dependency-name: drf-spectacular-sidecar dependency-version: 2025.8.1 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: django - dependency-name: pytest-django dependency-version: 4.11.1 dependency-type: direct:production update-type: version-update:semver-minor dependency-group: django ... Signed-off-by: dependabot[bot] <support@github.com> * Fix log matches related to newlines, add newlines to stdout.writelines * Fix disable api remote auth test, Django 5.2 no longer uses process_request * Remove postgres version check * Update administration.md * Handle django-multiselectfield v1.0 changes * Update administration.md --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
147 lines
5.0 KiB
Python
147 lines
5.0 KiB
Python
import dataclasses
import itertools
import multiprocessing
from typing import Final

import rapidfuzz
import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError

from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
class _WorkPackage:
    """
    One unit of comparison work: a pair of documents whose content
    is to be matched against each other.
    """

    # Document reported as the first side of the pair
    first_doc: Document
    # Document reported as the second side of the pair
    second_doc: Document
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
|
|
class _WorkResult:
|
|
doc_one_pk: int
|
|
doc_two_pk: int
|
|
ratio: float
|
|
|
|
def __lt__(self, other: "_WorkResult") -> bool:
|
|
return self.doc_one_pk < other.doc_one_pk
|
|
|
|
|
|
def _process_and_match(work: _WorkPackage) -> _WorkResult:
    """
    Score how closely the content of the two documents in a work
    package matches and wrap the score, together with both primary
    keys, in a result object
    """
    # Normalize both contents the same way (lower case, whitespace, etc)
    # before they are compared
    normalized_first, normalized_second = (
        rapidfuzz.utils.default_process(doc.content)
        for doc in (work.first_doc, work.second_doc)
    )

    return _WorkResult(
        doc_one_pk=work.first_doc.pk,
        doc_two_pk=work.second_doc.pk,
        # Basic matching ratio of the two normalized strings
        ratio=rapidfuzz.fuzz.ratio(normalized_first, normalized_second),
    )
|
|
|
|
|
|
class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
    """
    Compares the content of every pair of documents and reports the pairs
    whose match ratio reaches the configured threshold. With --delete, the
    second document of every reported pair is deleted.
    """

    help = "Searches for documents where the content almost matches"

    def add_arguments(self, parser):
        """
        Registers --ratio and --delete, then lets the progress bar and
        process count mixins add their own arguments
        """
        parser.add_argument(
            "--ratio",
            default=85.0,
            type=float,
            help="Ratio to consider documents a match",
        )
        parser.add_argument(
            "--delete",
            default=False,
            action="store_true",
            help="If set, one document of matches above the ratio WILL BE DELETED",
        )
        self.add_argument_progress_bar_mixin(parser)
        self.add_argument_processes_mixin(parser)

    def handle(self, *args, **options):
        """
        Runs the comparison over all documents, writes one line per
        matching pair (or a "no matches" line) to stdout and, if
        requested, deletes the second document of each matching pair.

        Raises CommandError if the ratio option is not within 0 to 100.
        """
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

        # NOTE(review): presumably these populate self.process_count and
        # self.no_progress_bar, which are read below - confirm in the mixins
        self.handle_processes_mixin(**options)
        self.handle_progress_bar_mixin(**options)

        if options["delete"]:
            self.stdout.write(
                self.style.WARNING(
                    "The command is configured to delete documents. Use with caution",
                ),
            )

        opt_ratio = options["ratio"]

        # Ratio is a float from 0.0 to 100.0. The chained comparison is
        # also False for NaN, so a NaN ratio is rejected instead of
        # silently matching nothing
        if not (RATIO_MIN <= opt_ratio <= RATIO_MAX):
            raise CommandError("The ratio must be between 0 and 100")

        all_docs = Document.objects.all().order_by("id")

        # Build work packages for processing. combinations() yields each
        # unordered pair of distinct documents exactly once, in the order
        # of the queryset, so doc to itself and doc 2 to doc 1 (the same
        # as doc 1 to doc 2) never need to be filtered out by hand
        work_pkgs: list[_WorkPackage] = [
            _WorkPackage(first_doc, second_doc)
            for first_doc, second_doc in itertools.combinations(all_docs, 2)
        ]

        # Don't spin up a pool of 1 process
        if self.process_count == 1:
            results = [
                _process_and_match(work)
                for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar)
            ]
        else:  # pragma: no cover
            with multiprocessing.Pool(processes=self.process_count) as pool:
                # Results arrive in completion order here, they are
                # sorted before being reported
                results = list(
                    tqdm.tqdm(
                        pool.imap_unordered(_process_and_match, work_pkgs),
                        total=len(work_pkgs),
                        disable=self.no_progress_bar,
                    ),
                )

        # Check results
        messages = []
        # A set, as the same document can be the second half of several
        # matching pairs but can only be deleted (and counted) once
        maybe_delete_ids: set[int] = set()
        for result in sorted(results):
            if result.ratio >= opt_ratio:
                messages.append(
                    self.style.NOTICE(
                        f"Document {result.doc_one_pk} fuzzy match"
                        f" to {result.doc_two_pk} (confidence {result.ratio:.3f})\n",
                    ),
                )
                maybe_delete_ids.add(result.doc_two_pk)

        if not messages:
            messages.append(
                self.style.SUCCESS("No matches found\n"),
            )
        self.stdout.writelines(
            messages,
        )
        if options["delete"]:
            self.stdout.write(
                self.style.NOTICE(
                    f"Deleting {len(maybe_delete_ids)} documents based on ratio matches",
                ),
            )
            Document.objects.filter(pk__in=maybe_delete_ids).delete()
|