mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-28 18:24:38 -05:00
Implemented the classifier model, including automatic tagging of new documents
This commit is contained in:
@@ -8,6 +8,7 @@ from django.contrib.auth.models import User
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.classifier import DocumentClassifier
|
||||
from ..models import Correspondent, Document, Tag, DocumentType
|
||||
|
||||
|
||||
@@ -15,79 +16,16 @@ def logger(message, group):
|
||||
logging.getLogger(__name__).debug(message, extra={"group": group})
|
||||
|
||||
|
||||
def set_correspondent(sender, document=None, logging_group=None, **kwargs):
|
||||
|
||||
# No sense in assigning a correspondent when one is already set.
|
||||
if document.correspondent:
|
||||
return
|
||||
|
||||
# No matching correspondents, so no need to continue
|
||||
potential_correspondents = list(Correspondent.match_all(document.content))
|
||||
if not potential_correspondents:
|
||||
return
|
||||
|
||||
potential_count = len(potential_correspondents)
|
||||
selected = potential_correspondents[0]
|
||||
if potential_count > 1:
|
||||
message = "Detected {} potential correspondents, so we've opted for {}"
|
||||
logger(
|
||||
message.format(potential_count, selected),
|
||||
logging_group
|
||||
)
|
||||
|
||||
logger(
|
||||
'Assigning correspondent "{}" to "{}" '.format(selected, document),
|
||||
logging_group
|
||||
)
|
||||
|
||||
document.correspondent = selected
|
||||
document.save(update_fields=("correspondent",))
|
||||
classifier = None
|
||||
|
||||
|
||||
def set_document_type(sender, document=None, logging_group=None, **kwargs):
|
||||
def classify_document(sender, document=None, logging_group=None, **kwargs):
|
||||
global classifier
|
||||
if classifier is None:
|
||||
classifier = DocumentClassifier.load_classifier()
|
||||
|
||||
# No sense in assigning a correspondent when one is already set.
|
||||
if document.document_type:
|
||||
return
|
||||
classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_type=True)
|
||||
|
||||
# No matching document types, so no need to continue
|
||||
potential_document_types = list(DocumentType.match_all(document.content))
|
||||
if not potential_document_types:
|
||||
return
|
||||
|
||||
potential_count = len(potential_document_types)
|
||||
selected = potential_document_types[0]
|
||||
if potential_count > 1:
|
||||
message = "Detected {} potential document types, so we've opted for {}"
|
||||
logger(
|
||||
message.format(potential_count, selected),
|
||||
logging_group
|
||||
)
|
||||
|
||||
logger(
|
||||
'Assigning document type "{}" to "{}" '.format(selected, document),
|
||||
logging_group
|
||||
)
|
||||
|
||||
document.document_type = selected
|
||||
document.save(update_fields=("document_type",))
|
||||
|
||||
|
||||
def set_tags(sender, document=None, logging_group=None, **kwargs):
|
||||
|
||||
current_tags = set(document.tags.all())
|
||||
relevant_tags = (set(Tag.match_all(document.content)) | set(Tag.objects.filter(is_inbox_tag=True))) - current_tags
|
||||
|
||||
if not relevant_tags:
|
||||
return
|
||||
|
||||
message = 'Tagging "{}" with "{}"'
|
||||
logger(
|
||||
message.format(document, ", ".join([t.slug for t in relevant_tags])),
|
||||
logging_group
|
||||
)
|
||||
|
||||
document.tags.add(*relevant_tags)
|
||||
|
||||
|
||||
def run_pre_consume_script(sender, filename, **kwargs):
|
||||
|
Reference in New Issue
Block a user