From 60618381f814083913cfddf81da8cca9415d1599 Mon Sep 17 00:00:00 2001 From: Jonas Winkler Date: Tue, 25 Sep 2018 16:09:33 +0200 Subject: [PATCH] Code style adjustments --- src/documents/actions.py | 63 +++++++++++++------ src/documents/admin.py | 12 ++-- src/documents/classifier.py | 2 +- src/documents/filters.py | 0 .../commands/document_create_classifier.py | 8 +-- .../commands/document_create_dataset.py | 40 ------------ .../management/commands/document_retagger.py | 9 ++- src/documents/models.py | 19 ++++-- src/documents/signals/handlers.py | 11 +++- src/documents/views.py | 3 +- src/paperless/settings.py | 5 +- src/paperless/urls.py | 3 +- 12 files changed, 94 insertions(+), 81 deletions(-) mode change 100644 => 100755 src/documents/classifier.py mode change 100644 => 100755 src/documents/filters.py mode change 100644 => 100755 src/documents/management/commands/document_create_classifier.py delete mode 100644 src/documents/management/commands/document_create_dataset.py mode change 100644 => 100755 src/documents/management/commands/document_retagger.py mode change 100644 => 100755 src/documents/models.py mode change 100644 => 100755 src/documents/signals/handlers.py mode change 100644 => 100755 src/documents/views.py mode change 100644 => 100755 src/paperless/urls.py diff --git a/src/documents/actions.py b/src/documents/actions.py index 96ce893aa..6e1cad45c 100755 --- a/src/documents/actions.py +++ b/src/documents/actions.py @@ -18,9 +18,9 @@ def select_action( if not modeladmin.has_change_permission(request): raise PermissionDenied - if request.POST.get('post'): + if request.POST.get("post"): n = queryset.count() - selected_object = modelclass.objects.get(id=request.POST.get('obj_id')) + selected_object = modelclass.objects.get(id=request.POST.get("obj_id")) if n: for document in queryset: if document_action: @@ -139,28 +139,52 @@ def remove_correspondent_from_selected(modeladmin, request, queryset): def set_document_type_on_selected(modeladmin, request, queryset): - return select_action(modeladmin=modeladmin, request=request, queryset=queryset, - title="Set document type on multiple documents", - action="set_document_type_on_selected", - modelclass=DocumentType, - success_message="Successfully set document type %(selected_object)s on %(count)d %(items)s.", - queryset_action=lambda qs, document_type: qs.update(document_type=document_type)) + return select_action( + modeladmin=modeladmin, + request=request, + queryset=queryset, + title="Set document type on multiple documents", + action="set_document_type_on_selected", + modelclass=DocumentType, + success_message="Successfully set document type %(selected_object)s " + "on %(count)d %(items)s.", + queryset_action=lambda qs, document_type: qs.update( + document_type=document_type) + ) def remove_document_type_from_selected(modeladmin, request, queryset): - return simple_action(modeladmin=modeladmin, request=request, queryset=queryset, - success_message="Successfully removed document type from %(count)d %(items)s.", - queryset_action=lambda qs: qs.update(document_type=None)) + return simple_action( + modeladmin=modeladmin, + request=request, + queryset=queryset, + success_message="Successfully removed document type from %(count)d " + "%(items)s.", + queryset_action=lambda qs: qs.update(document_type=None) + ) def run_document_classifier_on_selected(modeladmin, request, queryset): try: clf = DocumentClassifier.load_classifier() - return simple_action(modeladmin=modeladmin, request=request, queryset=queryset, - success_message="Successfully applied document classifier to %(count)d %(items)s.", - document_action=lambda doc: clf.classify_document(doc, classify_correspondent=True, classify_tags=True, classify_document_type=True)) + return simple_action( + modeladmin=modeladmin, + request=request, + queryset=queryset, + success_message="Successfully applied document classifier to " + "%(count)d %(items)s.", + document_action=lambda doc: clf.classify_document( + doc, + classify_correspondent=True, + classify_tags=True, + classify_document_type=True) + ) except FileNotFoundError: - modeladmin.message_user(request, "Classifier model file not found.", messages.ERROR) + modeladmin.message_user( + request, + "Classifier model file not found.", + messages.ERROR + ) return None @@ -171,7 +195,10 @@ set_correspondent_on_selected.short_description = \ "Set correspondent on selected documents" remove_correspondent_from_selected.short_description = \ "Remove correspondent from selected documents" -set_document_type_on_selected.short_description = "Set document type on selected documents" -remove_document_type_from_selected.short_description = "Remove document type from selected documents" -run_document_classifier_on_selected.short_description = "Run document classifier on selected" +set_document_type_on_selected.short_description = \ + "Set document type on selected documents" +remove_document_type_from_selected.short_description = \ + "Remove document type from selected documents" +run_document_classifier_on_selected.short_description = \ + "Run document classifier on selected" diff --git a/src/documents/admin.py b/src/documents/admin.py index d739011d4..73d449755 100755 --- a/src/documents/admin.py +++ b/src/documents/admin.py @@ -124,7 +124,7 @@ class CorrespondentAdmin(CommonAdmin): "document_count", "last_correspondence" ) - list_editable = ("automatic_classification") + list_editable = ("automatic_classification",) def get_queryset(self, request): qs = super(CorrespondentAdmin, self).get_queryset(request) @@ -145,7 +145,11 @@ class CorrespondentAdmin(CommonAdmin): class TagAdmin(CommonAdmin): - list_display = ("name", "colour", "automatic_classification", "document_count") + list_display = ( + "name", + "colour", + "automatic_classification", + "document_count") list_filter = ("colour",) list_editable = ("colour", "automatic_classification") @@ -238,8 +242,8 @@ class DocumentAdmin(CommonAdmin): extra_context = extra_context or {} doc = Document.objects.get(id=object_id) - extra_context['download_url'] = doc.download_url - extra_context['file_type'] = doc.file_type + extra_context["download_url"] = doc.download_url + extra_context["file_type"] = doc.file_type if self.document_queue and object_id: if int(object_id) in self.document_queue: diff --git a/src/documents/classifier.py b/src/documents/classifier.py old mode 100644 new mode 100755 index 8e76e0f02..bcfc1feb0 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -87,7 +87,7 @@ class DocumentClassifier(object): # Step 2: vectorize data logging.getLogger(__name__).info("Vectorizing data...") - self.data_vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 5), min_df=0.1) + self.data_vectorizer = CountVectorizer(analyzer="char", ngram_range=(3, 5), min_df=0.1) data_vectorized = self.data_vectorizer.fit_transform(data) self.tags_binarizer = MultiLabelBinarizer() diff --git a/src/documents/filters.py b/src/documents/filters.py old mode 100644 new mode 100755 diff --git a/src/documents/management/commands/document_create_classifier.py b/src/documents/management/commands/document_create_classifier.py old mode 100644 new mode 100755 index 610fa4898..79f766c55 --- a/src/documents/management/commands/document_create_classifier.py +++ b/src/documents/management/commands/document_create_classifier.py @@ -1,6 +1,4 @@ import logging -import os.path -import pickle from django.core.management.base import BaseCommand from documents.classifier import DocumentClassifier @@ -19,9 +17,7 @@ class Command(Renderable, BaseCommand): def handle(self, *args, **options): clf = DocumentClassifier() - clf.train() - - logging.getLogger(__name__).info("Saving models to " + settings.MODEL_FILE + "...") - + logging.getLogger(__name__).info("Saving models to " + + settings.MODEL_FILE + "...") clf.save_classifier() diff --git a/src/documents/management/commands/document_create_dataset.py b/src/documents/management/commands/document_create_dataset.py deleted file mode 100644 index a24f56680..000000000 --- a/src/documents/management/commands/document_create_dataset.py +++ /dev/null @@ -1,40 +0,0 @@ -from django.core.management.base import BaseCommand - -from documents.classifier import preprocess_content -from documents.models import Document -from ...mixins import Renderable - - -class Command(Renderable, BaseCommand): - - help = """ - There is no help. - """.replace(" ", "") - - def __init__(self, *args, **kwargs): - BaseCommand.__init__(self, *args, **kwargs) - - def handle(self, *args, **options): - with open("dataset_tags.txt", "w") as f: - for doc in Document.objects.exclude(tags__is_inbox_tag=True): - labels = [] - for tag in doc.tags.filter(automatic_classification=True): - labels.append(tag.name) - f.write(",".join(labels)) - f.write(";") - f.write(preprocess_content(doc.content)) - f.write("\n") - - with open("dataset_types.txt", "w") as f: - for doc in Document.objects.exclude(tags__is_inbox_tag=True): - f.write(doc.document_type.name if doc.document_type is not None and doc.document_type.automatic_classification else "-") - f.write(";") - f.write(preprocess_content(doc.content)) - f.write("\n") - - with open("dataset_correspondents.txt", "w") as f: - for doc in Document.objects.exclude(tags__is_inbox_tag=True): - f.write(doc.correspondent.name if doc.correspondent is not None and doc.correspondent.automatic_classification else "-") - f.write(";") - f.write(preprocess_content(doc.content)) - f.write("\n") diff --git a/src/documents/management/commands/document_retagger.py b/src/documents/management/commands/document_retagger.py old mode 100644 new mode 100755 index 149812f83..5bc8614d6 --- a/src/documents/management/commands/document_retagger.py +++ b/src/documents/management/commands/document_retagger.py @@ -11,7 +11,10 @@ from ...mixins import Renderable class Command(Renderable, BaseCommand): help = """ - There is no help. #TODO + Using the current classification model, assigns correspondents, tags + and document types to all documents, effectively allowing you to + back-tag all previously indexed documents with metadata created (or + modified) after their initial import. """.replace(" ", "") def __init__(self, *args, **kwargs): @@ -44,7 +47,7 @@ class Command(Renderable, BaseCommand): self.verbosity = options["verbosity"] - if options['inbox_only']: + if options["inbox_only"]: documents = Document.objects.filter(tags__is_inbox_tag=True).exclude(tags__is_archived_tag=True).distinct() else: documents = Document.objects.all().exclude(tags__is_archived_tag=True).distinct() @@ -58,4 +61,4 @@ class Command(Renderable, BaseCommand): for document in documents: logging.getLogger(__name__).info("Processing document {}".format(document.title)) - clf.classify_document(document, classify_document_type=options['type'], classify_tags=options['tags'], classify_correspondent=options['correspondent'], replace_tags=options['replace_tags']) + clf.classify_document(document, classify_document_type=options["type"], classify_tags=options["tags"], classify_correspondent=options["correspondent"], replace_tags=options["replace_tags"]) diff --git a/src/documents/models.py b/src/documents/models.py old mode 100644 new mode 100755 index ef2e8a862..18fb3462a --- a/src/documents/models.py +++ b/src/documents/models.py @@ -26,7 +26,11 @@ class MatchingModel(models.Model): name = models.CharField(max_length=128, unique=True) slug = models.SlugField(blank=True) - automatic_classification = models.BooleanField(default=False, help_text='Automatically assign to newly added documents based on current usage in your document collection.') + automatic_classification = models.BooleanField( + default=False, + help_text="Automatically assign to newly added documents based on " + "current usage in your document collection." + ) class Meta: abstract = True @@ -75,11 +79,16 @@ class Tag(MatchingModel): is_inbox_tag = models.BooleanField( default=False, - help_text="Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags.") + help_text="Marks this tag as an inbox tag: All newly consumed " + "documents will be tagged with inbox tags." + ) is_archived_tag = models.BooleanField( default=False, - help_text="Marks this tag as an archive tag: All documents tagged with archive tags will never be modified automatically (i.e., modifying tags by matching rules)") + help_text="Marks this tag as an archive tag: All documents tagged " + "with archive tags will never be modified automatically " + "(i.e., modifying tags by matching rules)" + ) class DocumentType(MatchingModel): @@ -170,7 +179,9 @@ class Document(models.Model): null=True, unique=True, db_index=True, - help_text="The position of this document in your physical document archive.") + help_text="The position of this document in your physical document " + "archive." + ) class Meta: ordering = ("correspondent", "title") diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py old mode 100644 new mode 100755 index 15fa9e10d..a64a286b7 --- a/src/documents/signals/handlers.py +++ b/src/documents/signals/handlers.py @@ -23,9 +23,16 @@ def classify_document(sender, document=None, logging_group=None, **kwargs): global classifier try: classifier.reload() - classifier.classify_document(document, classify_correspondent=True, classify_tags=True, classify_document_type=True) + classifier.classify_document( + document, + classify_correspondent=True, + classify_tags=True, + classify_document_type=True + ) except FileNotFoundError: - logging.getLogger(__name__).fatal("Cannot classify document, classifier model file was not found.") + logging.getLogger(__name__).fatal( + "Cannot classify document, classifier model file was not found." + ) def add_inbox_tags(sender, document=None, logging_group=None, **kwargs): diff --git a/src/documents/views.py b/src/documents/views.py old mode 100644 new mode 100755 index 4f2e53ea5..05f8f742c --- a/src/documents/views.py +++ b/src/documents/views.py @@ -28,7 +28,8 @@ from .serialisers import ( DocumentSerializer, LogSerializer, TagSerializer, - DocumentTypeSerializer) + DocumentTypeSerializer +) class IndexView(TemplateView): diff --git a/src/paperless/settings.py b/src/paperless/settings.py index b3725f4b6..d39e1cf5d 100755 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -201,7 +201,10 @@ MEDIA_URL = os.getenv("PAPERLESS_MEDIA_URL", "/media/") # Document classification models location MODEL_FILE = os.getenv( - "PAPERLESS_MODEL_FILE", os.path.join(BASE_DIR, "..", "models", "model.pickle")) + "PAPERLESS_MODEL_FILE", os.path.join( + BASE_DIR, "..", "models", "model.pickle" + ) +) # Paperless-specific stuff diff --git a/src/paperless/urls.py b/src/paperless/urls.py old mode 100644 new mode 100755 index 2ca0faf02..4e33c4e12 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -13,7 +13,8 @@ from documents.views import ( LogViewSet, PushView, TagViewSet, - DocumentTypeViewSet) + DocumentTypeViewSet +) from reminders.views import ReminderViewSet router = DefaultRouter()