Added F1 measures with confusion matrix #1623

Open · wants to merge 5 commits into base: dev
4 changes: 3 additions & 1 deletion deeppavlov/core/common/metrics_registry.json
@@ -8,7 +8,9 @@
"elmo_loss2ppl": "deeppavlov.metrics.elmo_metrics:elmo_loss2ppl",
"f1": "deeppavlov.metrics.fmeasure:round_f1",
"f1_macro": "deeppavlov.metrics.fmeasure:round_f1_macro",
"f1_macro_with_confusion_matrix": "deeppavlov.metrics.fmeasure:round_f1_macro_with_confusion_matrix",
"f1_weighted": "deeppavlov.metrics.fmeasure:round_f1_weighted",
"f1_weighted_with_confusion_matrix": "deeppavlov.metrics.fmeasure:round_f1_weighted_with_confusion_matrix",
"google_bleu": "deeppavlov.metrics.bleu:google_bleu",
"kbqa_accuracy": "deeppavlov.metrics.accuracy:kbqa_accuracy",
"log_loss": "deeppavlov.metrics.log_loss:sk_log_loss",
@@ -40,4 +42,4 @@
"squad_v2_f1": "deeppavlov.metrics.squad_metrics:squad_v2_f1",
"record_f1_score": "deeppavlov.metrics.record_metrics:record_f1_score",
"record_em_score": "deeppavlov.metrics.record_metrics:record_em_score"
}
}
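
For context, each registry entry maps a metric name to a "module:function" import path. Below is a minimal sketch of how such an entry can be resolved to its callable; resolve_metric is a hypothetical helper written only for illustration, not the loader DeepPavlov actually uses, and it assumes the package from this branch is importable.

import importlib


def resolve_metric(registry_value: str):
    """Resolve a 'module:function' registry entry to the metric callable."""
    module_name, func_name = registry_value.split(':')
    return getattr(importlib.import_module(module_name), func_name)


# Resolve one of the newly registered metric names by its registry value.
f1_macro_cm = resolve_metric(
    "deeppavlov.metrics.fmeasure:round_f1_macro_with_confusion_matrix")
# The metric prints the confusion matrix [[1, 0], [1, 1]] as a side effect,
# and print() shows the returned macro F1 (about 0.67).
print(f1_macro_cm([0, 1, 1], [0, 1, 0]))
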
74 changes: 60 additions & 14 deletions deeppavlov/metrics/fmeasure.py
@@ -18,7 +18,8 @@
from logging import getLogger

import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import f1_score, confusion_matrix, multilabel_confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer

from deeppavlov.core.common.metrics_registry import register_metric

@@ -203,44 +204,89 @@ def round_f1(y_true, y_predicted):
return f1_score(y_true, predictions)


@register_metric('f1_macro')
def round_f1_macro(y_true, y_predicted):
"""
Calculates F1 macro measure.

def _f1_macro_weighted(y_true, y_predicted, print_matrix=False, average='macro'):
"""
Calculates macro- or weighted-averaged F1, optionally printing the confusion matrix. Shared helper used by the registered F1 metrics.
Args:
y_true: list of true values
y_predicted: list of predicted values

Returns:
F1 score
"""
try:
predictions = [np.round(x) for x in y_predicted]
except TypeError:
predictions = y_predicted
if not len(y_true) and not len(y_predicted):
# y_true and y_predicted are empty lists; this can happen in a multi-task setting
return -1
if all(isinstance(k, list) for k in y_true):
mlb = MultiLabelBinarizer(sparse_output=False)
mlb.fit(y_true + y_predicted)
y_true_binarized = mlb.transform(y_true)
y_predicted_binarized = mlb.transform(y_predicted)
f_score = f1_score(np.array(y_true_binarized), np.array(y_predicted_binarized), average=average)
if print_matrix:
print(multilabel_confusion_matrix(np.array(y_true_binarized), np.array(y_predicted_binarized)).tolist())
else:
f_score = f1_score(np.array(y_true), np.array(y_predicted), average=average)
if print_matrix:
print(confusion_matrix(np.array(y_true), np.array(y_predicted)).tolist())
return f_score

return f1_score(np.array(y_true), np.array(predictions), average="macro")

@register_metric('f1_macro_with_confusion_matrix')
def round_f1_macro_with_confusion_matrix(y_true, y_predicted):
"""
Calculates F1 macro measure and prints confusion matrix.
Args:
y_true: list of true values
y_predicted: list of predicted values
Returns:
F1 score
"""
return _f1_macro_weighted(y_true, y_predicted, print_matrix=True, average='macro')


@register_metric('f1_macro')
def round_f1_macro(y_true, y_predicted):
"""
Calculates F1 macro measure.
Args:
y_true: list of true values
y_predicted: list of predicted values
Returns:
F1 score
"""
return _f1_macro_weighted(y_true, y_predicted, print_matrix=False, average='macro')


@register_metric('f1_weighted_with_confusion_matrix')
def round_f1_weighted_with_confusion_matrix(y_true, y_predicted):
"""
Calculates F1 weighted measure and prints confusion matrix.
Args:
y_true: list of true values
y_predicted: list of predicted values
Returns:
F1 score
"""
return _f1_macro_weighted(y_true, y_predicted, print_matrix=True, average='weighted')


@register_metric('f1_weighted')
def round_f1_weighted(y_true, y_predicted):
"""
Calculates F1 weighted measure.

Args:
y_true: list of true values
y_predicted: list of predicted values

Returns:
F1 score
"""
try:
predictions = [np.round(x) for x in y_predicted]
except TypeError:
predictions = y_predicted
return _f1_macro_weighted(y_true, y_predicted, print_matrix=False, average='weighted')

return f1_score(np.array(y_true), np.array(predictions), average="weighted")


def chunk_finder(current_token, previous_token, tag):
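
As a usage note for review: the new *_with_confusion_matrix metrics accept both single-label and multi-label inputs and print the corresponding confusion matrix as a nested list before returning the F1 value. A minimal sketch, assuming the branch from this PR is installed; the sample label lists below are invented for illustration.

from deeppavlov.metrics.fmeasure import (
    round_f1_macro_with_confusion_matrix,
    round_f1_weighted_with_confusion_matrix,
)

# Single-label case: plain class ids go straight to sklearn's f1_score,
# and confusion_matrix(...).tolist() is printed.
y_true = [0, 1, 2, 2, 1]
y_pred = [0, 2, 2, 2, 1]
macro_f1 = round_f1_macro_with_confusion_matrix(y_true, y_pred)

# Multi-label case: per-sample label lists are binarized with MultiLabelBinarizer,
# and multilabel_confusion_matrix(...).tolist() is printed instead.
y_true_ml = [["sci", "tech"], ["sport"], ["sci"]]
y_pred_ml = [["sci"], ["sport"], ["sci", "tech"]]
weighted_f1 = round_f1_weighted_with_confusion_matrix(y_true_ml, y_pred_ml)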