code cleanup
@@ -30,10 +30,12 @@ class DocumentClassifier(object):
     FORMAT_VERSION = 5
 
     def __init__(self):
-        # mtime of the model file on disk. used to prevent reloading when nothing has changed.
+        # mtime of the model file on disk. used to prevent reloading when
+        # nothing has changed.
         self.classifier_version = 0
 
-        # hash of the training data. used to prevent re-training when the training data has not changed.
+        # hash of the training data. used to prevent re-training when the
+        # training data has not changed.
         self.data_hash = None
 
         self.data_vectorizer = None
@@ -48,10 +50,12 @@ class DocumentClassifier(object):
                 schema_version = pickle.load(f)
 
                 if schema_version != self.FORMAT_VERSION:
-                    raise IncompatibleClassifierVersionError("Cannor load classifier, incompatible versions.")
+                    raise IncompatibleClassifierVersionError(
+                        "Cannor load classifier, incompatible versions.")
                 else:
                     if self.classifier_version > 0:
-                        logger.info("Classifier updated on disk, reloading classifier models")
+                        logger.info("Classifier updated on disk, "
+                                    "reloading classifier models")
                     self.data_hash = pickle.load(f)
                     self.data_vectorizer = pickle.load(f)
                     self.tags_binarizer = pickle.load(f)
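The wrapped comments above describe two guards: an mtime check that avoids reloading an unchanged model file, and a data hash that avoids retraining on unchanged data. A minimal standalone sketch of the mtime guard, in which the reload helper is hypothetical:

import os

def reload_if_changed(classifier, model_file):
    # Skip expensive unpickling when the file on disk has not changed.
    mtime = os.path.getmtime(model_file)
    if mtime > classifier.classifier_version:
        classifier.reload(model_file)  # hypothetical reload helper
        classifier.classifier_version = mtime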
@@ -82,20 +86,22 @@ class DocumentClassifier(object):
         # Step 1: Extract and preprocess training data from the database.
         logging.getLogger(__name__).debug("Gathering data from database...")
         m = hashlib.sha1()
-        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):
+        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):  # NOQA: E501
             preprocessed_content = preprocess_content(doc.content)
             m.update(preprocessed_content.encode('utf-8'))
             data.append(preprocessed_content)
 
             y = -1
-            if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.document_type.pk
+            dt = doc.document_type
+            if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = dt.pk
             m.update(y.to_bytes(4, 'little', signed=True))
             labels_document_type.append(y)
 
             y = -1
-            if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.correspondent.pk
+            cor = doc.correspondent
+            if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = cor.pk
             m.update(y.to_bytes(4, 'little', signed=True))
             labels_correspondent.append(y)
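The loop above doubles as the trainer's change detector: it folds every document's preprocessed text and its label pks into one SHA-1 digest, so retraining can be skipped when the digest matches the stored data_hash. A standalone sketch of that idea (the example data is made up):

import hashlib

def training_data_hash(examples):
    # examples: (preprocessed_content, label_pk) pairs; -1 means "no label",
    # mirroring the y = -1 default in the loop above.
    m = hashlib.sha1()
    for content, label_pk in examples:
        m.update(content.encode('utf-8'))
        m.update(label_pk.to_bytes(4, 'little', signed=True))
    return m.hexdigest()

print(training_data_hash([("invoice text", 3), ("letter text", -1)]))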
@@ -145,7 +151,7 @@ class DocumentClassifier(object):
         # Step 3: train the classifiers
         if num_tags > 0:
             logging.getLogger(__name__).debug("Training tags classifier...")
-            self.tags_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.tags_classifier = MLPClassifier(tol=0.01)
             self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
         else:
             self.tags_classifier = None
@@ -157,7 +163,7 @@ class DocumentClassifier(object):
             logging.getLogger(__name__).debug(
                 "Training correspondent classifier..."
             )
-            self.correspondent_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.correspondent_classifier = MLPClassifier(tol=0.01)
             self.correspondent_classifier.fit(
                 data_vectorized,
                 labels_correspondent
@@ -173,7 +179,7 @@ class DocumentClassifier(object):
             logging.getLogger(__name__).debug(
                 "Training document type classifier..."
             )
-            self.document_type_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.document_type_classifier = MLPClassifier(tol=0.01)
             self.document_type_classifier.fit(
                 data_vectorized,
                 labels_document_type
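The only behavioral change in these three hunks is dropping verbose=True, which silences scikit-learn's per-iteration loss output; tol=0.01 is unchanged. A self-contained sketch of the train-and-predict flow with toy data (the documents and labels here are made up):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

docs = ["invoice electricity bill", "letter from tax office", "invoice water"]
labels = [1, 2, 1]  # hypothetical document type pks

vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(docs)

# tol=0.01 stops training once loss improvement drops below the tolerance.
clf = MLPClassifier(tol=0.01)
clf.fit(data_vectorized, labels)
print(clf.predict(vectorizer.transform(["water invoice"])))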
@@ -65,25 +65,24 @@ def many_to_dictionary(field):
     return mydictionary
 
 
-def generate_filename(document):
-    # Create filename based on configured format
+def generate_filename(doc):
     path = ""
 
     try:
         if settings.PAPERLESS_FILENAME_FORMAT is not None:
             tags = defaultdict(lambda: slugify(None),
-                               many_to_dictionary(document.tags))
+                               many_to_dictionary(doc.tags))
             path = settings.PAPERLESS_FILENAME_FORMAT.format(
-                correspondent=slugify(document.correspondent),
-                title=slugify(document.title),
-                created=slugify(document.created),
-                created_year=document.created.year if document.created else "none",
-                created_month=document.created.month if document.created else "none",
-                created_day=document.created.day if document.created else "none",
-                added=slugify(document.added),
-                added_year=document.added.year if document.added else "none",
-                added_month=document.added.month if document.added else "none",
-                added_day=document.added.day if document.added else "none",
+                correspondent=slugify(doc.correspondent),
+                title=slugify(doc.title),
+                created=slugify(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=doc.created.month if doc.created else "none",
+                created_day=doc.created.day if doc.created else "none",
+                added=slugify(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=doc.added.month if doc.added else "none",
+                added_day=doc.added.day if doc.added else "none",
                 tags=tags,
             )
     except (ValueError, KeyError, IndexError):
@@ -93,12 +92,12 @@ def generate_filename(document):
 
     # Always append the primary key to guarantee uniqueness of filename
     if len(path) > 0:
-        filename = "%s-%07i%s" % (path, document.pk, document.file_type)
+        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
     else:
-        filename = "%07i%s" % (document.pk, document.file_type)
+        filename = "%07i%s" % (doc.pk, doc.file_type)
 
     # Append .gpg for encrypted files
-    if document.storage_type == document.STORAGE_TYPE_GPG:
+    if doc.storage_type == doc.STORAGE_TYPE_GPG:
         filename += ".gpg"
 
     return filename
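For reference, a standalone sketch of how a PAPERLESS_FILENAME_FORMAT template expands; the template and values are hypothetical, and the defaultdict supplies a fallback for unknown tag keys just as generate_filename() does:

from collections import defaultdict

FILENAME_FORMAT = "{correspondent}/{created_year}/{title}-{tags[city]}"
tags = defaultdict(lambda: "none", {"city": "berlin"})

path = FILENAME_FORMAT.format(
    correspondent="acme-corp",
    created_year=2020,
    title="invoice",
    tags=tags,
)
# The pk suffix guarantees uniqueness even for identical titles.
filename = "%s-%07i%s" % (path, 42, ".pdf")
print(filename)  # acme-corp/2020/invoice-berlin-0000042.pdf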
@@ -12,7 +12,10 @@ def match_correspondents(document_content, classifier):
         pred_id = None
 
     correspondents = Correspondent.objects.all()
-    return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        correspondents))
 
 
 def match_document_types(document_content, classifier):
@@ -22,15 +25,23 @@ def match_document_types(document_content, classifier):
         pred_id = None
 
     document_types = DocumentType.objects.all()
-    return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        document_types))
 
 
 def match_tags(document_content, classifier):
-    objects = Tag.objects.all()
-    predicted_tag_ids = classifier.predict_tags(document_content) if classifier else []
+    if classifier:
+        predicted_tag_ids = classifier.predict_tags(document_content)
+    else:
+        predicted_tag_ids = []
 
-    matched_tags = [o for o in objects if matches(o, document_content) or o.pk in predicted_tag_ids]
-    return matched_tags
+    tags = Tag.objects.all()
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        tags))
 
 
 def matches(matching_model, document_content):
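These rewrites swap list comprehensions for list(filter(...)) purely to stay within the line-length limit; the two forms are equivalent. A standalone illustration with stand-in objects (all names here are hypothetical):

from types import SimpleNamespace

candidates = [SimpleNamespace(pk=1, match="invoice"),
              SimpleNamespace(pk=2, match="receipt")]
content = "your invoice is attached"
pred_id = 2

def matches(o, text):
    # Stand-in for the real matches() below: a plain substring test.
    return o.match in text

as_comprehension = [o for o in candidates
                    if matches(o, content) or o.pk == pred_id]
as_filter = list(filter(
    lambda o: matches(o, content) or o.pk == pred_id,
    candidates))
assert as_comprehension == as_filter  # both select pks 1 and 2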
@@ -48,38 +59,44 @@ def matches(matching_model, document_content):
     if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
         for word in _split_match(matching_model):
             search_result = re.search(
-                r"\b{}\b".format(word), document_content, **search_kwargs)
+                rf"\b{word}\b", document_content, **search_kwargs)
             if not search_result:
                 return False
         return True
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
         for word in _split_match(matching_model):
-            if re.search(r"\b{}\b".format(word), document_content, **search_kwargs):
+            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                 return True
         return False
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
         return bool(re.search(
-            r"\b{}\b".format(matching_model.match), document_content, **search_kwargs))
+            rf"\b{matching_model.match}\b",
+            document_content,
+            **search_kwargs
+        ))
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
         return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs), document_content))
+            re.compile(matching_model.match, **search_kwargs),
+            document_content
+        ))
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
         match = re.sub(r'[^\w\s]', '', matching_model.match)
         text = re.sub(r'[^\w\s]', '', document_content)
         if matching_model.is_insensitive:
             match = match.lower()
             text = text.lower()
 
-        return True if fuzz.partial_ratio(match, text) >= 90 else False
+        return fuzz.partial_ratio(match, text) >= 90
 
-    if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
         # this is done elsewhere.
         return False
 
-    raise NotImplementedError("Unsupported matching algorithm")
+    else:
+        raise NotImplementedError("Unsupported matching algorithm")
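A standalone sketch of the two non-trivial branches above: the word-boundary regex shared by MATCH_ALL, MATCH_ANY, and MATCH_LITERAL, and the fuzzy branch's 90% partial-ratio threshold. It assumes the fuzzywuzzy package; the sample strings are made up:

import re
from fuzzywuzzy import fuzz  # assumption: the library behind fuzz here

content = "Your Electricity Invoice for March"
search_kwargs = {"flags": re.IGNORECASE}  # what is_insensitive toggles

# \b anchors keep "voice" from matching inside "Invoice".
word = "invoice"
print(bool(re.search(rf"\b{word}\b", content, **search_kwargs)))  # True
word = "voice"
print(bool(re.search(rf"\b{word}\b", content, **search_kwargs)))  # False

# MATCH_FUZZY: strip punctuation, lowercase, then threshold at 90.
match = re.sub(r'[^\w\s]', '', "electricity invoice").lower()
text = re.sub(r'[^\w\s]', '', content).lower()
print(fuzz.partial_ratio(match, text) >= 90)  # True: near-exact substring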
@@ -9,7 +9,7 @@ class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
     """
 
     def authenticate(self, request):
-        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):
+        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):  # NOQA: E501
             user = User.objects.filter(is_staff=True).first()
             print("Auto-Login with user {}".format(user))
             return (user, None)
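This override only takes effect in DEBUG for requests coming from the Angular dev server on localhost:4200. A hypothetical sketch of how such a class could be wired into Django REST Framework settings; this wiring is an assumption, not part of the commit:

# Hypothetical settings.py excerpt.
REST_FRAMEWORK = {
    "DEFAULT_AUTHENTICATION_CLASSES": [
        "rest_framework.authentication.SessionAuthentication",
    ],
}

if DEBUG:  # assumption: DEBUG is defined earlier in settings.py
    REST_FRAMEWORK["DEFAULT_AUTHENTICATION_CLASSES"].append(
        "paperless.auth.AngularApiAuthenticationOverride"
    )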
@@ -283,8 +283,8 @@ class MailAccountHandler(LoggingMixin):
                     path=temp_filename,
                     override_filename=att.filename,
                     override_title=title,
-                    override_correspondent_id=correspondent.id if correspondent else None,
-                    override_document_type_id=doc_type.id if doc_type else None,
+                    override_correspondent_id=correspondent.id if correspondent else None,  # NOQA: E501
+                    override_document_type_id=doc_type.id if doc_type else None,  # NOQA: E501
                     override_tag_ids=[tag.id] if tag else None,
                     task_name=f"Mail: {att.filename}"
                 )
@@ -123,7 +123,7 @@ class RasterisedDocumentParser(DocumentParser):
                 ocr_pages = self._complete_ocr_default_language(
                     images, sample_page_index, sample_page_text)
 
-            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
+            elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501
                 self.log(
                     "warning",
                     f"Detected language {guessed_language} is not available "
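A standalone sketch of the availability check this branch performs, assuming pyocr with at least one OCR backend installed; the guessed language code is made up:

import pyocr

tools = pyocr.get_available_tools()
if tools:
    # ISO 639-2 codes such as 'eng' or 'deu', depending on installed packs.
    available = tools[0].get_available_languages()
    guessed_language = "deu"  # hypothetical detection result
    if guessed_language not in available:
        print(f"Detected language {guessed_language} is not available for OCR")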
Author: Jonas Winkler