code cleanup

parent b44f8383e4
commit 450fb877f6
@@ -30,10 +30,12 @@ class DocumentClassifier(object):
     FORMAT_VERSION = 5

     def __init__(self):
-        # mtime of the model file on disk. used to prevent reloading when nothing has changed.
+        # mtime of the model file on disk. used to prevent reloading when
+        # nothing has changed.
        self.classifier_version = 0

-        # hash of the training data. used to prevent re-training when the training data has not changed.
+        # hash of the training data. used to prevent re-training when the
+        # training data has not changed.
        self.data_hash = None

        self.data_vectorizer = None
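The hunk above only rewraps the two comments to fit the line limit. For readers unfamiliar with the pattern they describe, here is a minimal, hypothetical sketch of an mtime-based reload guard; the LazyReloader class and its file handling are illustrative assumptions, not code from this repository.

import os
import pickle


class LazyReloader:
    """Illustrative only: reload a pickled object when its file mtime advances."""

    def __init__(self, path):
        self.path = path
        self.loaded_version = 0  # mtime of the copy currently in memory
        self.obj = None

    def reload(self):
        mtime = os.path.getmtime(self.path)
        if mtime > self.loaded_version:
            # File changed on disk since the last load -> read it again.
            with open(self.path, "rb") as f:
                self.obj = pickle.load(f)
            self.loaded_version = mtime
        return self.obj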
@@ -48,10 +50,12 @@ class DocumentClassifier(object):
                 schema_version = pickle.load(f)

                 if schema_version != self.FORMAT_VERSION:
-                    raise IncompatibleClassifierVersionError("Cannor load classifier, incompatible versions.")
+                    raise IncompatibleClassifierVersionError(
+                        "Cannor load classifier, incompatible versions.")
                 else:
                     if self.classifier_version > 0:
-                        logger.info("Classifier updated on disk, reloading classifier models")
+                        logger.info("Classifier updated on disk, "
+                                    "reloading classifier models")
                     self.data_hash = pickle.load(f)
                     self.data_vectorizer = pickle.load(f)
                     self.tags_binarizer = pickle.load(f)
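This hunk only rewraps two long statements; the underlying idea is that a format version is pickled ahead of the model state, so a stale or incompatible file can be rejected before anything else is unpickled. A self-contained sketch of that pattern (MODEL_FILE, save_state and load_state are made-up names, not the project's API):

import pickle


class IncompatibleClassifierVersionError(Exception):
    pass


FORMAT_VERSION = 5
MODEL_FILE = "classifier.pickle"  # illustrative path, not the project's setting


def save_state(data_hash, vectorizer):
    # The format version is written first so load_state() can reject the file
    # before touching anything else in it.
    with open(MODEL_FILE, "wb") as f:
        pickle.dump(FORMAT_VERSION, f)
        pickle.dump(data_hash, f)
        pickle.dump(vectorizer, f)


def load_state():
    with open(MODEL_FILE, "rb") as f:
        schema_version = pickle.load(f)
        if schema_version != FORMAT_VERSION:
            raise IncompatibleClassifierVersionError(
                "Cannot load classifier, incompatible versions.")
        data_hash = pickle.load(f)
        vectorizer = pickle.load(f)
    return data_hash, vectorizer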
@@ -82,20 +86,22 @@ class DocumentClassifier(object):
         # Step 1: Extract and preprocess training data from the database.
         logging.getLogger(__name__).debug("Gathering data from database...")
         m = hashlib.sha1()
-        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):
+        for doc in Document.objects.order_by('pk').exclude(tags__is_inbox_tag=True):  # NOQA: E501
             preprocessed_content = preprocess_content(doc.content)
             m.update(preprocessed_content.encode('utf-8'))
             data.append(preprocessed_content)

             y = -1
-            if doc.document_type and doc.document_type.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.document_type.pk
+            dt = doc.document_type
+            if dt and dt.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = dt.pk
             m.update(y.to_bytes(4, 'little', signed=True))
             labels_document_type.append(y)

             y = -1
-            if doc.correspondent and doc.correspondent.matching_algorithm == MatchingModel.MATCH_AUTO:
-                y = doc.correspondent.pk
+            cor = doc.correspondent
+            if cor and cor.matching_algorithm == MatchingModel.MATCH_AUTO:
+                y = cor.pk
             m.update(y.to_bytes(4, 'little', signed=True))
             labels_correspondent.append(y)

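Besides collecting training rows, the loop above fingerprints the training set: each document's preprocessed content and its label ids are fed into a SHA-1 hash, and retraining can be skipped later when that digest matches the one stored with the model. A rough standalone sketch of the same idea using plain tuples instead of Django querysets (the sample documents are invented):

import hashlib


def training_data_hash(docs):
    # docs: iterable of (content, document_type_id, correspondent_id) tuples;
    # -1 stands for "no auto-matched label", mirroring y = -1 above.
    m = hashlib.sha1()
    for content, doc_type_id, correspondent_id in docs:
        m.update(content.encode('utf-8'))
        m.update(doc_type_id.to_bytes(4, 'little', signed=True))
        m.update(correspondent_id.to_bytes(4, 'little', signed=True))
    return m.hexdigest()


docs = [("invoice 42 from acme", 3, 7), ("untagged scan", -1, -1)]
print(training_data_hash(docs))  # identical input -> identical digest -> skip retraining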
@@ -145,7 +151,7 @@ class DocumentClassifier(object):
         # Step 3: train the classifiers
         if num_tags > 0:
             logging.getLogger(__name__).debug("Training tags classifier...")
-            self.tags_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.tags_classifier = MLPClassifier(tol=0.01)
             self.tags_classifier.fit(data_vectorized, labels_tags_vectorized)
         else:
             self.tags_classifier = None
@@ -157,7 +163,7 @@ class DocumentClassifier(object):
             logging.getLogger(__name__).debug(
                 "Training correspondent classifier..."
             )
-            self.correspondent_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.correspondent_classifier = MLPClassifier(tol=0.01)
             self.correspondent_classifier.fit(
                 data_vectorized,
                 labels_correspondent
@@ -173,7 +179,7 @@ class DocumentClassifier(object):
             logging.getLogger(__name__).debug(
                 "Training document type classifier..."
             )
-            self.document_type_classifier = MLPClassifier(verbose=True, tol=0.01)
+            self.document_type_classifier = MLPClassifier(tol=0.01)
             self.document_type_classifier.fit(
                 data_vectorized,
                 labels_document_type
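Across these three training hunks the only change is dropping verbose=True, which silences scikit-learn's per-iteration loss output; the tol=0.01 stopping tolerance is unchanged. A toy illustration of the quieter call, assuming scikit-learn is available (the data is made up, and max_iter is added here only to avoid convergence warnings on such a tiny sample):

from sklearn.neural_network import MLPClassifier

# Tiny made-up training set: two features, two classes.
X = [[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.8]]
y = [0, 1, 1, 0]

# verbose defaults to False, so fitting prints nothing; tol=0.01 keeps the
# same loss-improvement stopping tolerance as the code above.
clf = MLPClassifier(tol=0.01, max_iter=500)
clf.fit(X, y)
print(clf.predict([[0.2, 0.9]]))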
@@ -65,25 +65,24 @@ def many_to_dictionary(field):
     return mydictionary


-def generate_filename(document):
-    # Create filename based on configured format
+def generate_filename(doc):
     path = ""

     try:
         if settings.PAPERLESS_FILENAME_FORMAT is not None:
             tags = defaultdict(lambda: slugify(None),
-                               many_to_dictionary(document.tags))
+                               many_to_dictionary(doc.tags))
             path = settings.PAPERLESS_FILENAME_FORMAT.format(
-                correspondent=slugify(document.correspondent),
-                title=slugify(document.title),
-                created=slugify(document.created),
-                created_year=document.created.year if document.created else "none",
-                created_month=document.created.month if document.created else "none",
-                created_day=document.created.day if document.created else "none",
-                added=slugify(document.added),
-                added_year=document.added.year if document.added else "none",
-                added_month=document.added.month if document.added else "none",
-                added_day=document.added.day if document.added else "none",
+                correspondent=slugify(doc.correspondent),
+                title=slugify(doc.title),
+                created=slugify(doc.created),
+                created_year=doc.created.year if doc.created else "none",
+                created_month=doc.created.month if doc.created else "none",
+                created_day=doc.created.day if doc.created else "none",
+                added=slugify(doc.added),
+                added_year=doc.added.year if doc.added else "none",
+                added_month=doc.added.month if doc.added else "none",
+                added_day=doc.added.day if doc.added else "none",
                 tags=tags,
             )
     except (ValueError, KeyError, IndexError):
@@ -93,12 +92,12 @@ def generate_filename(document):

     # Always append the primary key to guarantee uniqueness of filename
     if len(path) > 0:
-        filename = "%s-%07i%s" % (path, document.pk, document.file_type)
+        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
     else:
-        filename = "%07i%s" % (document.pk, document.file_type)
+        filename = "%07i%s" % (doc.pk, doc.file_type)

     # Append .gpg for encrypted files
-    if document.storage_type == document.STORAGE_TYPE_GPG:
+    if doc.storage_type == doc.STORAGE_TYPE_GPG:
         filename += ".gpg"

     return filename
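Both generate_filename hunks are a plain parameter rename (document -> doc). The part worth noting is the defaultdict(lambda: slugify(None), ...) wrapper around the tag dictionary: if PAPERLESS_FILENAME_FORMAT references a tag the document does not carry, the lookup falls back to a slug of None instead of raising KeyError. A standalone sketch of that behaviour; the format string, the tag data and the local slugify stand-in (used here instead of Django's) are illustrative assumptions:

import re
from collections import defaultdict


def slugify(value):
    # Local stand-in for django.utils.text.slugify, good enough for the demo.
    value = str(value).lower()
    return re.sub(r'[^a-z0-9]+', '-', value).strip('-')


# Made-up format string in the style of PAPERLESS_FILENAME_FORMAT.
FILENAME_FORMAT = "{correspondent}/{created_year}/{title} [{tags[invoice]}]"

doc_tags = {"receipt": "groceries"}  # note: no "invoice" key
tags = defaultdict(lambda: slugify(None), doc_tags)

path = FILENAME_FORMAT.format(
    correspondent=slugify("ACME Corp"),
    created_year=2020,
    title=slugify("Water bill"),
    tags=tags,
)
print(path)  # acme-corp/2020/water-bill [none] -- the missing tag falls back to "none"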
@@ -12,7 +12,10 @@ def match_correspondents(document_content, classifier):
         pred_id = None

     correspondents = Correspondent.objects.all()
-    return [o for o in correspondents if matches(o, document_content) or o.pk == pred_id]
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        correspondents))


 def match_document_types(document_content, classifier):
@@ -22,15 +25,23 @@ def match_document_types(document_content, classifier):
         pred_id = None

     document_types = DocumentType.objects.all()
-    return [o for o in document_types if matches(o, document_content) or o.pk == pred_id]
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk == pred_id,
+        document_types))


 def match_tags(document_content, classifier):
-    objects = Tag.objects.all()
-    predicted_tag_ids = classifier.predict_tags(document_content) if classifier else []
+    if classifier:
+        predicted_tag_ids = classifier.predict_tags(document_content)
+    else:
+        predicted_tag_ids = []

-    matched_tags = [o for o in objects if matches(o, document_content) or o.pk in predicted_tag_ids]
-    return matched_tags
+    tags = Tag.objects.all()
+
+    return list(filter(
+        lambda o: matches(o, document_content) or o.pk in predicted_tag_ids,
+        tags))


 def matches(matching_model, document_content):
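The rewrite of match_correspondents, match_document_types and match_tags is stylistic: over-long list comprehensions become list(filter(...)) calls and the conditional expression becomes an explicit if/else. The two forms select the same objects, as this toy comparison suggests (Obj, the predicate and the data are made up):

class Obj:
    # Minimal stand-in for a Correspondent/DocumentType/Tag instance.
    def __init__(self, pk):
        self.pk = pk


def matches(obj, content):
    return False  # placeholder predicate; always false in this demo


objects = [Obj(1), Obj(2), Obj(3)]
pred_id = 2
content = "irrelevant"

a = [o for o in objects if matches(o, content) or o.pk == pred_id]
b = list(filter(lambda o: matches(o, content) or o.pk == pred_id, objects))
assert [o.pk for o in a] == [o.pk for o in b] == [2]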
@@ -48,39 +59,45 @@ def matches(matching_model, document_content):
     if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
         for word in _split_match(matching_model):
             search_result = re.search(
-                r"\b{}\b".format(word), document_content, **search_kwargs)
+                rf"\b{word}\b", document_content, **search_kwargs)
             if not search_result:
                 return False
         return True

-    if matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
         for word in _split_match(matching_model):
-            if re.search(r"\b{}\b".format(word), document_content, **search_kwargs):
+            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
                 return True
         return False

-    if matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:
         return bool(re.search(
-            r"\b{}\b".format(matching_model.match), document_content, **search_kwargs))
+            rf"\b{matching_model.match}\b",
+            document_content,
+            **search_kwargs
+        ))

-    if matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
         return bool(re.search(
-            re.compile(matching_model.match, **search_kwargs), document_content))
+            re.compile(matching_model.match, **search_kwargs),
+            document_content
+        ))

-    if matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
         match = re.sub(r'[^\w\s]', '', matching_model.match)
         text = re.sub(r'[^\w\s]', '', document_content)
         if matching_model.is_insensitive:
             match = match.lower()
             text = text.lower()

-        return True if fuzz.partial_ratio(match, text) >= 90 else False
+        return fuzz.partial_ratio(match, text) >= 90

-    if matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
+    elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:
         # this is done elsewhere.
         return False

-    raise NotImplementedError("Unsupported matching algorithm")
+    else:
+        raise NotImplementedError("Unsupported matching algorithm")


 def _split_match(matching_model):
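In matches() the independent if blocks become one if/elif chain with a final else, and the .format() regexes become f-strings; which algorithm matches what is unchanged. A condensed, standalone sketch of the two word-based modes, using simplified stand-ins for the Django MatchingModel and its constants:

import re


MATCH_ANY, MATCH_ALL = 1, 2


class FakeMatchingModel:
    # Simplified stand-in for the Django MatchingModel.
    def __init__(self, match, algorithm, is_insensitive=True):
        self.match = match
        self.matching_algorithm = algorithm
        self.is_insensitive = is_insensitive


def matches(model, content):
    kwargs = {"flags": re.IGNORECASE} if model.is_insensitive else {}
    # The real _split_match() also escapes regex metacharacters; a plain
    # whitespace split is enough for this illustration.
    words = model.match.split(" ")
    if model.matching_algorithm == MATCH_ALL:
        return all(re.search(rf"\b{w}\b", content, **kwargs) for w in words)
    elif model.matching_algorithm == MATCH_ANY:
        return any(re.search(rf"\b{w}\b", content, **kwargs) for w in words)
    else:
        raise NotImplementedError("Unsupported matching algorithm")


content = "Invoice from ACME for March"
print(matches(FakeMatchingModel("acme invoice", MATCH_ALL), content))       # True
print(matches(FakeMatchingModel("water electricity", MATCH_ANY), content))  # False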
@@ -9,7 +9,7 @@ class AngularApiAuthenticationOverride(authentication.BaseAuthentication):
     """

     def authenticate(self, request):
-        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):
+        if settings.DEBUG and 'Referer' in request.headers and request.headers['Referer'].startswith('http://localhost:4200/'):  # NOQA: E501
             user = User.objects.filter(is_staff=True).first()
             print("Auto-Login with user {}".format(user))
             return (user, None)
@@ -283,8 +283,8 @@ class MailAccountHandler(LoggingMixin):
                     path=temp_filename,
                     override_filename=att.filename,
                     override_title=title,
-                    override_correspondent_id=correspondent.id if correspondent else None,
-                    override_document_type_id=doc_type.id if doc_type else None,
+                    override_correspondent_id=correspondent.id if correspondent else None,  # NOQA: E501
+                    override_document_type_id=doc_type.id if doc_type else None,  # NOQA: E501
                     override_tag_ids=[tag.id] if tag else None,
                     task_name=f"Mail: {att.filename}"
                 )
@@ -123,7 +123,7 @@ class RasterisedDocumentParser(DocumentParser):
             ocr_pages = self._complete_ocr_default_language(
                 images, sample_page_index, sample_page_text)

-        elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():
+        elif not ISO639[guessed_language] in pyocr.get_available_tools()[0].get_available_languages():  # NOQA: E501
             self.log(
                 "warning",
                 f"Detected language {guessed_language} is not available "