diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 1b70dcd6f..6e0d6f946 100755 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -30,10 +30,12 @@ class DocumentClassifier(object): FORMAT_VERSION = 5 def __init__(self): - # mtime of the model file on disk. used to prevent reloading when nothing has changed. + # mtime of the model file on disk. used to prevent reloading when + # nothing has changed. self.classifier_version = 0 - # hash of the training data. used to prevent re-training when the training data has not changed. + # hash of the training data. used to prevent re-training when the + # training data has not changed. self.data_hash = None self.data_vectorizer = None @@ -48,10 +50,12 @@ class DocumentClassifier(object): schema_version = pickle.load(f) if schema_version != self.FORMAT_VERSION: - raise IncompatibleClassifierVersionError("Cannor load classifier, incompatible versions.") + raise IncompatibleClassifierVersionError( + "Cannot load classifier, incompatible versions.") else: if self.classifier_version > 0: - logger.info("Classifier updated on disk, reloading classifier models") + logger.info("Classifier updated on disk, " + "reloading classifier models") self.data_hash = pickle.load(f) self.data_vectorizer = pickle.load(f) self.tags_binarizer = pickle.load(f) @@ -82,20 +86,22 @@ class DocumentClassifier(object): # Step 1: Extract and preprocess training data from the database. 
logging.getLogger(__name__).debug("Gathering data from database...") m = hashlib.sha1() - for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True): + for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True): # NOQA: E501 preprocessed_content = preprocess_content(doc.content) m.update(preprocessed_content.encode('utf-8')) data.append(preprocessed_content) y = -1 - if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO: - y = doc.document_type.pk + dt = doc.document_type + if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO: + y = dt.pk m.update(y.to_bytes(4, 'little', signed=True)) labels_document_type.append(y) y = -1 - if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO: - y = doc.correspondent.pk + cor = doc.correspondent + if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO: + y = cor.pk m.update(y.to_bytes(4, 'little', signed=True)) labels_correspondent.append(y) @@ -145,7 +151,7 @@ class DocumentClassifier(object): # Step 3: train the classifiers if num_tags > 0: logging.getLogger(__name__).debug("Training tags classifier...") - self.tags_classifier = MLPClassifier(verbose=True, tol=0.01) + self.tags_classifier = MLPClassifier(tol=0.01) self.tags_classifier.fit(data_vectorized, labels_tags_vectorized) else: self.tags_classifier = None @@ -157,7 +163,7 @@ class DocumentClassifier(object): logging.getLogger(__name__).debug( "Training correspondent classifier..." ) - self.correspondent_classifier = MLPClassifier(verbose=True, tol=0.01) + self.correspondent_classifier = MLPClassifier(tol=0.01) self.correspondent_classifier.fit( data_vectorized, labels_correspondent @@ -173,7 +179,7 @@ class DocumentClassifier(object): logging.getLogger(__name__).debug( "Training document type classifier..." 
) - self.document_type_classifier = MLPClassifier(verbose=True, tol=0.01) + self.document_type_classifier = MLPClassifier(tol=0.01) self.document_type_classifier.fit( data_vectorized, labels_document_type diff --git a/src/documents/file_handling.py b/src/documents/file_handling.py index cd47406b6..ee7e9b761 100644 --- a/src/documents/file_handling.py +++ b/src/documents/file_handling.py @@ -65,25 +65,24 @@ def many_to_dictionary(field): return mydictionary -def generate_filename(document): - # Create filename based on configured format +def generate_filename(doc): path = "" try: if settings.PAPERLESS_FILENAME_FORMAT is not None: tags = defaultdict(lambda: slugify(None), - many_to_dictionary(document.tags)) + many_to_dictionary(doc.tags)) path = settings.PAPERLESS_FILENAME_FORMAT.format( - correspondent=slugify(document.correspondent), - title=slugify(document.title), - created=slugify(document.created), - created_year=document.created.year if document.created else "none", - created_month=document.created.month if document.created else "none", - created_day=document.created.day if document.created else "none", - added=slugify(document.added), - added_year=document.added.year if document.added else "none", - added_month=document.added.month if document.added else "none", - added_day=document.added.day if document.added else "none", + correspondent=slugify(doc.correspondent), + title=slugify(doc.title), + created=slugify(doc.created), + created_year=doc.created.year if doc.created else "none", + created_month=doc.created.month if doc.created else "none", + created_day=doc.created.day if doc.created else "none", + added=slugify(doc.added), + added_year=doc.added.year if doc.added else "none", + added_month=doc.added.month if doc.added else "none", + added_day=doc.added.day if doc.added else "none", tags=tags, ) except (ValueError, KeyError, IndexError): @@ -93,12 +92,12 @@ def generate_filename(document): # Always append the primary key to guarantee uniqueness of 
filename if len(path) > 0: - filename = "%s-%07i%s" % (path, document.pk, document.file_type) + filename = "%s-%07i%s" % (path, doc.pk, doc.file_type) else: - filename = "%07i%s" % (document.pk, document.file_type) + filename = "%07i%s" % (doc.pk, doc.file_type) # Append .gpg for encrypted files - if document.storage_type == document.STORAGE_TYPE_GPG: + if doc.storage_type == doc.STORAGE_TYPE_GPG: filename += ".gpg" return filename diff --git a/src/documents/matching.py b/src/documents/matching.py index ae1a9a9cf..212698ad3 100644 --- a/src/documents/matching.py +++ b/src/documents/matching.py @@ -12,7 +12,10 @@ def match_correspondents(document_content, classifier): pred_id = None correspondents = Correspondent.objects.all() - return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id] + + return list(filter( + lambda o: matches(o, document_content) or o.pk == pred_id, + correspondents)) def match_document_types(document_content, classifier): @@ -22,15 +25,23 @@ def match_document_types(document_content, classifier): pred_id = None document_types = DocumentType.objects.all() - return [o for o in document_types if matches(o, document_content) or o.pk == pred_id] + + return list(filter( + lambda o: matches(o, document_content) or o.pk == pred_id, + document_types)) def match_tags(document_content, classifier): - objects = Tag.objects.all() - predicted_tag_ids = classifier.predict_tags(document_content) if classifier else [] + if classifier: + predicted_tag_ids = classifier.predict_tags(document_content) + else: + predicted_tag_ids = [] - matched_tags = [o for o in objects if matches(o, document_content) or o.pk in predicted_tag_ids] - return matched_tags + tags = Tag.objects.all() + + return list(filter( + lambda o: matches(o, document_content) or o.pk in predicted_tag_ids, + tags)) def matches(matching_model, document_content): @@ -48,39 +59,45 @@ def matches(matching_model, document_content): if matching_model.matching_algorithm == 
MatchingModel.MATCH_ALL: for word in _split_match(matching_model): search_result = re.search( - r"\b{}\b".format(word), document_content, **search_kwargs) + rf"\b{word}\b", document_content, **search_kwargs) if not search_result: return False return True - if matching_model.matching_algorithm == MatchingModel.MATCH_ANY: + elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY: for word in _split_match(matching_model): - if re.search(r"\b{}\b".format(word), document_content, **search_kwargs): + if re.search(rf"\b{word}\b", document_content, **search_kwargs): return True return False - if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL: + elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL: return bool(re.search( - r"\b{}\b".format(matching_model.match), document_content, **search_kwargs)) + rf"\b{matching_model.match}\b", + document_content, + **search_kwargs + )) - if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX: + elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX: return bool(re.search( - re.compile(matching_model.match, **search_kwargs), document_content)) + re.compile(matching_model.match, **search_kwargs), + document_content + )) - if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY: + elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY: match = re.sub(r'[^\w\s]', '', matching_model.match) text = re.sub(r'[^\w\s]', '', document_content) if matching_model.is_insensitive: match = match.lower() text = text.lower() - return True if fuzz.partial_ratio(match, text) >= 90 else False + return fuzz.partial_ratio(match, text) >= 90 - if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO: + elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO: # this is done elsewhere. 
return False - raise NotImplementedError("Unsupported matching algorithm") + else: + raise NotImplementedError("Unsupported matching algorithm") def _split_match(matching_model): diff --git a/src/paperless/auth.py b/src/paperless/auth.py index ecd697f0e..83279ef36 100644 --- a/src/paperless/auth.py +++ b/src/paperless/auth.py @@ -9,7 +9,7 @@ class AngularApiAuthenticationOverride(authentication.BaseAuthentication): """ def authenticate(self, request): - if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'): + if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'): # NOQA: E501 user = User.objects.filter(is_staff=True).first() print("Auto-Login with user {}".format(user)) return (user, None) diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py index 03f915769..9d0397f24 100644 --- a/src/paperless_mail/mail.py +++ b/src/paperless_mail/mail.py @@ -283,8 +283,8 @@ class MailAccountHandler(LoggingMixin): path=temp_filename, override_filename=att.filename, override_title=title, - override_correspondent_id=correspondent.id if correspondent else None, - override_document_type_id=doc_type.id if doc_type else None, + override_correspondent_id=correspondent.id if correspondent else None, # NOQA: E501 + override_document_type_id=doc_type.id if doc_type else None, # NOQA: E501 override_tag_ids=[tag.id] if tag else None, task_name=f"Mail: {att.filename}" ) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index d0ce01327..c9e77486e 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -123,7 +123,7 @@ class RasterisedDocumentParser(DocumentParser): ocr_pages = self._complete_ocr_default_language( images, sample_page_index, sample_page_text) - elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): + elif not 
ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages(): # NOQA: E501 self.log( "warning", f"Detected language {guessed_language} is not available "