code cleanup
@@ -30,10 +30,12 @@ class DocumentClassifier(object):
     FORMAT_VERSION = 5
 
     def __init__(self):
-        # mtime of the model file on disk. used to prevent reloading when nothing has changed.
+        # mtime of the model file on disk. used to prevent reloading when
+        # nothing has changed.
         self.classifier_version = 0
 
-        # hash of the training data. used to prevent re-training when the training data has not changed.
+        # hash of the training data. used to prevent re-training when the
+        # training data has not changed.
         self.data_hash = None
 
         self.data_vectorizer = None
@@ -48,10 +50,12 @@ class DocumentClassifier(object):
                 schema_version = pickle.load(f)
 
                 if schema_version != self.FORMAT_VERSION:
-                    raise IncompatibleClassifierVersionError("Cannor load classifier, incompatible versions.")
+                    raise IncompatibleClassifierVersionError(
+                        "Cannor load classifier, incompatible versions.")
                 else:
                     if self.classifier_version > 0:
-                        logger.info("Classifier updated on disk, reloading classifier models")
+                        logger.info("Classifier updated on disk, "
+                                    "reloading classifier models")
                     self.data_hash = pickle.load(f)
                     self.data_vectorizer = pickle.load(f)
                     self.tags_binarizer = pickle.load(f)
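The wrapped comments above describe two guards: an mtime check that avoids reloading an unchanged model file, and a data hash that avoids retraining on unchanged data. A minimal standalone sketch of the mtime guard, in which the reload helper is hypothetical:

import os

def reload_if_changed(classifier, model_file):
    # Skip expensive unpickling when the file on disk has not changed.
    mtime = os.path.getmtime(model_file)
    if mtime > classifier.classifier_version:
        classifier.reload(model_file)  # hypothetical reload helper
        classifier.classifier_version = mtime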
@@ -82,20 +86,22 @@ class DocumentClassifier(object):
         # Step 1: Extract and preprocess training data from the database.
         logging.getLogger(__name__).debug("Gathering data from database...")
         m = hashlib.sha1()
-        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):
+        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):  # NOQA: E501
             preprocessed_content = preprocess_content(doc.content)
             m.update(preprocessed_content.encode('utf-8'))
             data.append(preprocessed_content)
 
             y = -1
-            if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.document_type.pk
+            dt = doc.document_type
+            if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = dt.pk
             m.update(y.to_bytes(4, 'little', signed=True))
             labels_document_type.append(y)
 
             y = -1
-            if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.correspondent.pk
+            cor = doc.correspondent
+            if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = cor.pk
             m.update(y.to_bytes(4, 'little', signed=True))
             labels_correspondent.append(y)
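The loop above doubles as the trainer's change detector: it folds every document's preprocessed text and its label pks into one SHA-1 digest, so retraining can be skipped when the digest matches the stored data_hash. A standalone sketch of that idea (the example data is made up):

import hashlib

def training_data_hash(examples):
    # examples: (preprocessed_content, label_pk) pairs; -1 means "no label",
    # mirroring the y = -1 default in the loop above.
    m = hashlib.sha1()
    for content, label_pk in examples:
        m.update(content.encode('utf-8'))
        m.update(label_pk.to_bytes(4, 'little', signed=True))
    return m.hexdigest()

print(training_data_hash([("invoice text", 3), ("letter text", -1)]))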
@@ -145,7 +151,7 @@ class DocumentClassifier(object):
         # Step 3: train the classifiers
         if num_tags > 0:
             logging.getLogger(__name__).debug("Training tags classifier...")
-            self.tags_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.tags_classifier = MLPClassifier(tol=0.01)
             self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
         else:
             self.tags_classifier = None
@@ -157,7 +163,7 @@ class DocumentClassifier(object):
             logging.getLogger(__name__).debug(
                 "Training correspondent classifier..."
             )
-            self.correspondent_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.correspondent_classifier = MLPClassifier(tol=0.01)
             self.correspondent_classifier.fit(
                 data_vectorized,
                 labels_correspondent
@@ -173,7 +179,7 @@ class DocumentClassifier(object):
             logging.getLogger(__name__).debug(
                 "Training document type classifier..."
             )
-            self.document_type_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.document_type_classifier = MLPClassifier(tol=0.01)
             self.document_type_classifier.fit(
                 data_vectorized,
                 labels_document_type
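The only behavioral change in these three hunks is dropping verbose=True, which silences scikit-learn's per-iteration loss output; tol=0.01 is unchanged. A self-contained sketch of the train-and-predict flow with toy data (the documents and labels here are made up):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

docs = ["invoice electricity bill", "letter from tax office", "invoice water"]
labels = [1, 2, 1]  # hypothetical document type pks

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(docs)

# tol=0.01 stops training once loss improvement drops below the tolerance.
clf = MLPClassifier(tol=0.01)
clf.fit(data_vectorized, labels)
print(clf.predict(vectorizer.transform(["water invoice"])))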
@@ -65,25 +65,24 @@ def many_to_dictionary(field):
     return mydictionary
 
 
-def generate_filename(document):
-    # Create filename based on configured format
+def generate_filename(doc):
     path = ""
 
     try:
         if settings.PAPERLESS_FILENAME_FORMAT is not None:
             tags = defaultdict(lambda: slugify(None),
-                               many_to_dictionary(document.tags))
+                               many_to_dictionary(doc.tags))
             path = settings.PAPERLESS_FILENAME_FORMAT.format(
-                correspondent=slugify(document.correspondent),
-                title=slugify(document.title),
-                created=slugify(document.created),
-                created_year=document.created.year if document.created else "none",
-                created_month=document.created.month if document.created else "none",
-                created_day=document.created.day if document.created else "none",
-                added=slugify(document.added),
-                added_year=document.added.year if document.added else "none",
-                added_month=document.added.month if document.added else "none",
-                added_day=document.added.day if document.added else "none",
+                correspondent=slugify(doc.correspondent),
+                title=slugify(doc.title),
+                created=slugify(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=doc.created.month if doc.created else "none",
+                created_day=doc.created.day if doc.created else "none",
+                added=slugify(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=doc.added.month if doc.added else "none",
+                added_day=doc.added.day if doc.added else "none",
                 tags=tags,
             )
     except (ValueError, KeyError, IndexError):
@@ -93,12 +92,12 @@ def generate_filename(document):
 
     # Always append the primary key to guarantee uniqueness of filename
     if len(path) > 0:
-        filename = "%s-%07i%s" % (path, document.pk, document.file_type)
+        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
     else:
-        filename = "%07i%s" % (document.pk, document.file_type)
+        filename = "%07i%s" % (doc.pk, doc.file_type)
 
     # Append .gpg for encrypted files
-    if document.storage_type == document.STORAGE_TYPE_GPG:
+    if doc.storage_type == doc.STORAGE_TYPE_GPG:
         filename += ".gpg"
 
     return filename
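For reference, a standalone sketch of how a PAPERLESS_FILENAME_FORMAT template expands; the template and values are hypothetical, and the defaultdict supplies a fallback for unknown tag keys just as generate_filename() does:

from collections import defaultdict

FILENAME_FORMAT = "{correspondent}/{created_year}/{title}-{tags[city]}"
tags = defaultdict(lambda: "none", {"city": "berlin"})

path = FILENAME_FORMAT.format(
    correspondent="acme-corp",
    created_year=2020,
    title="invoice",
    tags=tags,
)
# The pk suffix guarantees uniqueness even for identical titles.
filename = "%s-%07i%s" % (path, 42, ".pdf")
print(filename)  # acme-corp/2020/invoice-berlin-0000042.pdf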
@@ -12,7 +12,10 @@ def match_correspondents(document_content, classifier):
         pred_id = None
 
     correspondents = Correspondent.objects.all()
-    return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        correspondents))
 
 
 def match_document_types(document_content, classifier):
@@ -22,15 +25,23 @@ def match_document_types(document_content, classifier):
         pred_id = None
 
     document_types = DocumentType.objects.all()
-    return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        document_types))
 
 
 def match_tags(document_content, classifier):
-    objects = Tag.objects.all()
-    predicted_tag_ids = classifier.predict_tags(document_content) if classifier else []
+    if classifier:
+        predicted_tag_ids = classifier.predict_tags(document_content)
+    else:
+        predicted_tag_ids = []
 
-    matched_tags = [o for o in objects if matches(o, document_content) or o.pk in predicted_tag_ids]
-    return matched_tags
+    tags = Tag.objects.all()
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        tags))
 
 
 def matches(matching_model, document_content):
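These rewrites swap list comprehensions for list(filter(...)) purely to stay within the line-length limit; the two forms are equivalent. A standalone illustration with stand-in objects (all names here are hypothetical):

from types import SimpleNamespace

candidates = [SimpleNamespace(pk=1, match="invoice"),
              SimpleNamespace(pk=2, match="receipt")]
content = "your invoice is attached"
pred_id = 2

def matches(o, text):
    # Stand-in for the real matches() below: a plain substring test.
    return o.match in text

as_comprehension = [o for o in candidates
                    if matches(o, content) or o.pk == pred_id]
as_filter = list(filter(
    lambda o: matches(o, content) or o.pk == pred_id,
    candidates))
assert as_comprehension == as_filter  # both select pks 1 and 2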
@@ -48,38 +59,44 @@ def matches(matching_model, document_content):
     if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
         for word in _split_match(matching_model):
             search_result = re.search(
-                r"\b{}\b".format(word), document_content, **search_kwargs)
+                rf"\b{word}\b", document_content, **search_kwargs)
             if not search_result:
                 return False
         return True
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
         for word in _split_match(matching_model):
-            if re.search(r"\b{}\b".format(word), document_content, **search_kwargs):
+            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                 return True
         return False
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
         return bool(re.search(
-            r"\b{}\b".format(matching_model.match), document_content, **search_kwargs))
+            rf"\b{matching_model.match}\b",
+            document_content,
+            **search_kwargs
+        ))
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
         return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs), document_content))
+            re.compile(matching_model.match, **search_kwargs),
+            document_content
+        ))
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
         match = re.sub(r'[^\w\s]', '', matching_model.match)
         text = re.sub(r'[^\w\s]', '', document_content)
         if matching_model.is_insensitive:
             match = match.lower()
             text = text.lower()
 
-        return True if fuzz.partial_ratio(match, text) >= 90 else False
+        return fuzz.partial_ratio(match, text) >= 90
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
         # this is done elsewhere.
         return False
 
-    raise NotImplementedError("Unsupported matching algorithm")
+    else:
+        raise NotImplementedError("Unsupported matching algorithm")
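A standalone sketch of the two non-trivial branches above: the word-boundary regex shared by MATCH_ALL, MATCH_ANY, and MATCH_LITERAL, and the fuzzy branch's 90% partial-ratio threshold. It assumes the fuzzywuzzy package; the sample strings are made up:

import re
from fuzzywuzzy import fuzz  # assumption: the library behind fuzz here

content = "Your Electricity Invoice for March"
search_kwargs = {"flags": re.IGNORECASE}  # what is_insensitive toggles

# \b anchors keep "voice" from matching inside "Invoice".
word = "invoice"
print(bool(re.search(rf"\b{word}\b", content, **search_kwargs)))  # True
word = "voice"
print(bool(re.search(rf"\b{word}\b", content, **search_kwargs)))  # False

# MATCH_FUZZY: strip punctuation, lowercase, then threshold at 90.
match = re.sub(r'[^\w\s]', '', "electricity invoice").lower()
text = re.sub(r'[^\w\s]', '', content).lower()
print(fuzz.partial_ratio(match, text) >= 90)  # True: near-exact substring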
@@ -9,7 +9,7 @@ class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
     """
 
     def authenticate(self, request):
-        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):
+        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):  # NOQA: E501
             user = User.objects.filter(is_staff=True).first()
             print("Auto-Login with user {}".format(user))
             return (user, None)
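This override only takes effect in DEBUG for requests coming from the Angular dev server on localhost:4200. A hypothetical sketch of how such a class could be wired into Django REST Framework settings; this wiring is an assumption, not part of the commit:

# Hypothetical settings.py excerpt.
REST_FRAMEWORK = {
    "DEFAULT_AUTHENTICATION_CLASSES": [
        "rest_framework.authentication.SessionAuthentication",
    ],
}

if DEBUG:  # assumption: DEBUG is defined earlier in settings.py
    REST_FRAMEWORK["DEFAULT_AUTHENTICATION_CLASSES"].append(
        "paperless.auth.AngularApiAuthenticationOverride"
    )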
@@ -283,8 +283,8 @@ class MailAccountHandler(LoggingMixin):
                     path=temp_filename,
                     override_filename=att.filename,
                     override_title=title,
-                    override_correspondent_id=correspondent.id if correspondent else None,
-                    override_document_type_id=doc_type.id if doc_type else None,
+                    override_correspondent_id=correspondent.id if correspondent else None,  # NOQA: E501
+                    override_document_type_id=doc_type.id if doc_type else None,  # NOQA: E501
                     override_tag_ids=[tag.id] if tag else None,
                     task_name=f"Mail: {att.filename}"
                 )
@@ -123,7 +123,7 @@ class RasterisedDocumentParser(DocumentParser):
                 ocr_pages = self._complete_ocr_default_language(
                     images, sample_page_index, sample_page_text)
 
-            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
+            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501
                 self.log(
                     "warning",
                     f"Detected language {guessed_language} is not available "
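A standalone sketch of the availability check this branch performs, assuming pyocr with at least one OCR backend installed; the guessed language code is made up:

import pyocr

tools = pyocr.get_available_tools()
if tools:
    # ISO 639-2 codes such as 'eng' or 'deu', depending on installed packs.
    available = tools[0].get_available_languages()
    guessed_language = "deu"  # hypothetical detection result
    if guessed_language not in available:
        print(f"Detected language {guessed_language} is not available for OCR")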
Author: Jonas Winkler