Add to handler, matching, retagger

This commit is contained in:
shamoon 2024-12-13 13:54:20 -08:00
parent a632b6b711
commit 6dc6c6c7bb
No known key found for this signature in database
5 changed files with 110 additions and 4 deletions

View File

@ -372,17 +372,19 @@ currently-imported docs. This problem is common enough that there are
tools for it.
```
document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f]
document_retagger [-h] [-c] [-T] [-t] [-s] [-cf] [-i] [--id-range] [--use-first] [-f] [--suggest]
optional arguments:
-c, --correspondent
-T, --tags
-t, --document_type
-s, --storage_path
-cf, --custom_fields
-i, --inbox-only
--id-range
--use-first
-f, --overwrite
--suggest
```
Run this after changing or adding matching rules. It'll loop over all
@ -408,6 +410,8 @@ to override this behavior and just use the first correspondent or type
it finds. This option does not apply to tags, since any amount of tags
can be applied to a document.
If you want to suggest changes but not apply them, specify `--suggest`.
Finally, `-f` specifies that you wish to overwrite already assigned
correspondents, types and/or tags. The default behavior is to not assign
correspondents and types to documents that have this data already

View File

@ -15,6 +15,7 @@ class DocumentsConfig(AppConfig):
from documents.signals.handlers import run_workflows_added
from documents.signals.handlers import run_workflows_updated
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_custom_fields
from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags
@ -24,6 +25,7 @@ class DocumentsConfig(AppConfig):
document_consumption_finished.connect(set_document_type)
document_consumption_finished.connect(set_tags)
document_consumption_finished.connect(set_storage_path)
document_consumption_finished.connect(set_custom_fields)
document_consumption_finished.connect(add_to_index)
document_consumption_finished.connect(run_workflows_added)
document_updated.connect(run_workflows_updated)

View File

@ -7,6 +7,7 @@ from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document
from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_custom_fields
from documents.signals.handlers import set_document_type
from documents.signals.handlers import set_storage_path
from documents.signals.handlers import set_tags
@ -17,9 +18,9 @@ logger = logging.getLogger("paperless.management.retagger")
class Command(ProgressBarMixin, BaseCommand):
# User-facing description printed by `manage.py document_retagger --help`.
# NOTE: adjacent string literals are concatenated at compile time, so each
# fragment must carry its own separating whitespace/punctuation — the
# previous version concatenated to "tags document types" (missing comma)
# and "effectivelyallowing" (missing space).
help = (
    "Using the current classification model, assigns correspondents, tags, "
    "document types, storage paths and custom fields to all documents, "
    "effectively allowing you to back-tag all previously indexed documents "
    "with metadata created (or modified) after their initial import."
)
def add_arguments(self, parser):
@ -27,6 +28,12 @@ class Command(ProgressBarMixin, BaseCommand):
parser.add_argument("-T", "--tags", default=False, action="store_true")
parser.add_argument("-t", "--document_type", default=False, action="store_true")
parser.add_argument("-s", "--storage_path", default=False, action="store_true")
parser.add_argument(
"-cf",
"--custom_fields",
default=False,
action="store_true",
)
parser.add_argument("-i", "--inbox-only", default=False, action="store_true")
parser.add_argument(
"--use-first",
@ -134,3 +141,16 @@ class Command(ProgressBarMixin, BaseCommand):
stdout=self.stdout,
style_func=self.style,
)
if options["custom_fields"]:
set_custom_fields(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
stdout=self.stdout,
style_func=self.style,
)

View File

@ -132,6 +132,25 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
)
def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None):
    """
    Return the custom fields (from the instances already attached to
    *document*) whose matching rules apply to the document.

    A field is kept when its pattern matches the document, or when it uses
    AUTO matching and the classifier predicted its pk for this content.
    """
    if classifier:
        predicted_ids = classifier.predict_custom_fields(document.content)
    else:
        predicted_ids = []

    candidate_fields = [instance.field for instance in document.custom_fields.all()]

    return [
        field
        for field in candidate_fields
        if matches(field, document)
        or (
            field.matching_algorithm == MatchingModel.MATCH_AUTO
            and field.pk in predicted_ids
        )
    ]
def matches(matching_model: MatchingModel, document: Document):
search_kwargs = {}

View File

@ -318,6 +318,67 @@ def set_storage_path(
document.save(update_fields=("storage_path",))
def set_custom_fields(
    document: Document,
    logging_group=None,
    classifier: DocumentClassifier | None = None,
    replace=False,
    suggest=False,
    base_url=None,
    stdout=None,
    style_func=None,
    **kwargs,
):
    """
    Signal handler: assign matching custom fields to *document*.

    Mirrors set_tags/set_correspondent: computes the fields whose matching
    rules (including AUTO classification) apply to the document and creates
    a CustomFieldInstance for each matched field not already present.

    Arguments:
        document: the document to (re)tag.
        logging_group: opaque value attached to log records for grouping.
        classifier: optional trained classifier used for AUTO matching.
        replace: when True, delete existing field instances first, except
            manually-assigned ones (empty match pattern with a non-AUTO
            algorithm — those can never be re-matched automatically).
        suggest: when True, only print what would change instead of
            applying it (used by `document_retagger --suggest`).
        base_url: if given, print a link to the document with suggestions.
        stdout, style_func: management-command output helpers; only used
            when *suggest* is set.
    """
    if replace:
        # Preserve manually-assigned fields: no match pattern + non-AUTO
        # algorithm means deleting the instance would lose user data.
        CustomFieldInstance.objects.filter(document=document).exclude(
            Q(field__match="") & ~Q(field__matching_algorithm=CustomField.MATCH_AUTO),
        ).delete()

    current_fields = {instance.field for instance in document.custom_fields.all()}

    matched_fields = matching.match_custom_fields(document, classifier)

    # Fields that matched but are not yet on the document.
    relevant_fields = set(matched_fields) - current_fields

    if suggest:
        # AUTO-matched fields currently on the document that no longer match.
        extra_fields = current_fields - set(matched_fields)
        extra_fields = [
            f for f in extra_fields if f.matching_algorithm == MatchingModel.MATCH_AUTO
        ]
        if not relevant_fields and not extra_fields:
            return
        doc_str = style_func.SUCCESS(str(document))
        if base_url:
            stdout.write(doc_str)
            stdout.write(f"{base_url}/documents/{document.pk}")
        else:
            stdout.write(doc_str + style_func.SUCCESS(f" [{document.pk}]"))
        if relevant_fields:
            stdout.write(
                "Suggest custom fields: "
                + ", ".join([f.name for f in relevant_fields]),
            )
        if extra_fields:
            stdout.write(
                "Extra custom fields: " + ", ".join([f.name for f in extra_fields]),
            )
    else:
        if not relevant_fields:
            return

        message = 'Assigning custom fields "{}" to "{}"'
        logger.info(
            # BUGFIX: arguments were swapped — the template reads
            # 'Assigning custom fields "<names>" to "<document>"',
            # but the document was passed into the first slot.
            message.format(", ".join([f.name for f in relevant_fields]), document),
            extra={"group": logging_group},
        )

        for field in relevant_fields:
            CustomFieldInstance.objects.create(
                field=field,
                document=document,
            )
# see empty_trash in documents/tasks.py for signal handling
def cleanup_document_deletion(sender, instance, **kwargs):
with FileLock(settings.MEDIA_LOCK):