complementary_metrics.py
import numpy as np
import torch
from scipy.special import entr, rel_entr


def ece(probs, targets, n_bins=10):
    """
    ECE (expected calibration error) is a measure of calibration error. It is defined as the
    expected difference between the predicted probability (confidence) and the accuracy of the
    prediction.

    It is computed by binning the predictions by confidence and weighting each bin's gap,
    |accuracy_in_bin - avg_confidence_in_bin|, by the fraction of samples in that bin:

        ECE = sum_b (N_b / n) * |acc_b - conf_b|

    where n is the total number of samples, N_b is the number of samples in bin b, acc_b is the
    accuracy in bin b, and conf_b is the average confidence in bin b. The bins are equally spaced
    in confidence; for example, with 10 bins the first bin is (0, 0.1], the second (0.1, 0.2],
    and so on.

    References:
    - Naeini, Mahdi Pakdaman, Gregory F. Cooper, and Milos Hauskrecht. "Obtaining Well Calibrated
      Probabilities Using Bayesian Binning." AAAI 2015.
    - Guo, Pleiss, Sun, and Weinberger. "On Calibration of Modern Neural Networks." ICML 2017.

    Args:
        probs (np.array of float or list of list of float): array of probabilities of dim (n, m)
            where n is the number of samples and m is the number of classes
        targets (np.array of int): array of ground truth labels of dim (n,) where n is the number
            of samples
        n_bins (int): number of bins to use for computing ECE

    Returns:
        ece (float): Expected Calibration Error. Minimum possible value is 0, maximum is 1.0.
            A lower score means better calibration.

    Code adapted from Guo et al. (2017).
    """
    probs = np.array(probs)
    targets = np.array(targets)
    assert len(probs) == len(targets)
    assert n_bins > 0
    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = predictions == targets
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]
    ece = 0.0
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Accumulate the bin gap |accuracy - confidence|, weighted by the bin's sample fraction.
        in_bin = np.logical_and(confidences > bin_lower, confidences <= bin_upper)
        prop_in_bin = np.mean(in_bin)
        if prop_in_bin > 0:
            accuracy_in_bin = np.mean(accuracies[in_bin])
            avg_confidence_in_bin = np.mean(confidences[in_bin])
            ece += np.abs(accuracy_in_bin - avg_confidence_in_bin) * prop_in_bin
    return ece
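
# Usage sketch (not part of the original module): a tiny hand-made call to ece(). The
# probabilities and labels below are illustrative assumptions, not data from this repository.
if __name__ == "__main__":
    _probs = [[0.90, 0.10], [0.60, 0.40], [0.20, 0.80]]  # softmax outputs for 3 samples, 2 classes
    _targets = [0, 1, 1]  # the second prediction (argmax = 0) is wrong despite 0.6 confidence
    print("ECE:", ece(_probs, _targets, n_bins=5))  # strictly positive: confidence != accuracy per bin

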
def mce(probs, targets, n_bins=10):
    """
    MCE (maximum calibration error) is a measure of calibration error. It is defined as the
    maximum difference between the predicted probability (confidence) and the accuracy of the
    prediction over the confidence bins:

        MCE = max_b |acc_b - conf_b|

    where acc_b is the accuracy in bin b and conf_b is the average confidence in bin b. The bins
    are equally spaced in confidence, as in ece(), and empty bins are ignored.

    Args:
        probs (np.array of float or list of list of float): array of probabilities of dim (n, m)
            where n is the number of samples and m is the number of classes
        targets (np.array of int): array of ground truth labels of dim (n,) where n is the number
            of samples
        n_bins (int): number of bins to use for computing MCE

    Returns:
        mce (float): Maximum Calibration Error. Minimum possible value is 0, maximum is 1.0.
            A lower score means better calibration.
    """
    probs = np.array(probs)
    targets = np.array(targets)
    assert len(probs) == len(targets)
    assert n_bins > 0
    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = predictions == targets
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]
    bin_gaps = []
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Record the bin gap |accuracy - confidence| for each non-empty bin.
        in_bin = np.logical_and(confidences > bin_lower, confidences <= bin_upper)
        prop_in_bin = np.mean(in_bin)
        if prop_in_bin > 0:
            accuracy_in_bin = np.mean(accuracies[in_bin])
            avg_confidence_in_bin = np.mean(confidences[in_bin])
            bin_gaps.append(np.abs(accuracy_in_bin - avg_confidence_in_bin))
    mce = np.max(bin_gaps)
    return mce
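
# Usage sketch (illustrative values only): MCE on the same kind of toy input as above. Since MCE
# takes the worst bin rather than a sample-weighted average, it is at least as large as the ECE
# computed with the same bins.
if __name__ == "__main__":
    _probs = [[0.90, 0.10], [0.60, 0.40], [0.20, 0.80]]
    _targets = [0, 1, 1]
    print("MCE:", mce(_probs, _targets, n_bins=5))
    print("ECE:", ece(_probs, _targets, n_bins=5))  # expected: ECE <= MCE

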
def sce(probs, targets, n_bins=10):
    """
    SCE (static calibration error) is a measure of calibration error. It is a simple extension of
    the ECE that makes the binning class-conditional in the multi-class setting:

        SCE = 1/K * sum_k^K sum_b^B (n_bk / N) * |acc_bk - conf_bk|

    where K is the number of classes, B is the number of bins, n_bk is the number of samples in
    bin b for class k, N is the total number of samples, acc_bk is the accuracy in bin b for
    class k, and conf_bk is the average confidence in bin b for class k.

    References:
    - Nixon et al. (2019). "Measuring Calibration in Deep Learning." https://arxiv.org/abs/1904.01685

    Args:
        probs (np.array of float or list of list of float): array of probabilities of dim (n, m)
            where n is the number of samples and m is the number of classes
        targets (np.array of int): array of ground truth labels of dim (n,) where n is the number
            of samples
        n_bins (int): number of bins to use for computing SCE

    Returns:
        sce (float): Static Calibration Error. Minimum possible value is 0, maximum is 1.0.
            SCE is zero if and only if the model is calibrated.
    """
    probs = np.array(probs)
    targets = np.array(targets)
    num_classes = probs.shape[1]  # assumption: class labels (targets) are 0, 1, ..., num_classes - 1
    assert len(probs) == len(targets)
    assert n_bins > 0
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]
    sce = 0.0
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        for class_label in range(num_classes):
            # Bin gap |accuracy - confidence| per bin and class, weighted by the sample fraction.
            in_bin = np.logical_and(probs[:, class_label] > bin_lower, probs[:, class_label] <= bin_upper)
            prop_in_bin = np.mean(in_bin)
            if prop_in_bin > 0:
                accuracy_in_bin = np.mean(targets[in_bin] == class_label)
                avg_confidence_in_bin = np.mean(probs[in_bin, class_label])
                sce += np.abs(accuracy_in_bin - avg_confidence_in_bin) * prop_in_bin
    sce /= num_classes
    return sce
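
# Usage sketch (toy, assumed values): SCE bins every class column of `probs`, not just the argmax,
# so it also accounts for the confidence the model assigns to non-predicted classes.
if __name__ == "__main__":
    _probs = [[0.90, 0.10], [0.60, 0.40], [0.20, 0.80]]
    _targets = [0, 1, 1]
    print("SCE:", sce(_probs, _targets, n_bins=5))

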
def nll(inputs, targets, logits=True):
    """
    NLL (negative log likelihood) is a measure of the quality of probabilistic predictions. It is
    defined as the average negative log probability of the true class. It is a proper scoring rule.

    Args:
        inputs (np.array of float or list of list of float): array of logits or probabilities of
            dim (n, m) where n is the number of samples and m is the number of classes
        targets (np.array of int): array of ground truth labels of dim (n,) where n is the number
            of samples
        logits (bool): whether the inputs are logits or probabilities (default: True)

    Returns:
        nll (float): negative log likelihood. Lower is better.
    """
    inputs = torch.tensor(inputs)
    targets = torch.tensor(targets)
    assert len(inputs) == len(targets)
    if logits:
        # CrossEntropyLoss applies log-softmax to the raw logits internally.
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(inputs, targets)
    else:
        # NLLLoss expects log-probabilities, so take the log of the given probabilities.
        criterion = torch.nn.NLLLoss()
        loss = criterion(torch.log(inputs), targets)
    return loss.item()
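
# Usage sketch (made-up logits): nll() accepts either raw logits (default) or probabilities with
# logits=False; the two paths should agree when the probabilities are the softmax of the logits.
if __name__ == "__main__":
    _logits = [[2.0, 0.5], [0.2, 1.5]]
    _targets = [0, 1]
    _probs = torch.softmax(torch.tensor(_logits), dim=1).tolist()
    print("NLL from logits:", nll(_logits, _targets))
    print("NLL from probs: ", nll(_probs, _targets, logits=False))

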
def weighted_model_confidence(probs, targets, weights=[1.0, -1.0]):
    """
    WMC (weighted model confidence) is a measure of the overall model confidence using
    probabilistic predictions. It is defined as the weighted average confidence of the model in
    its predictions: confidence receives a positive weight (reward) when the prediction is
    accurate and a negative weight (penalty) when it is inaccurate.

    Args:
        probs (np.array of float or list of list of float): array of probabilities of dim (n, m)
            where n is the number of samples and m is the number of classes
        targets (np.array of int): array of ground truth labels of dim (n,) where n is the number
            of samples
        weights (list of float): weights applied to the confidence of accurate and inaccurate
            predictions, respectively (default: [1.0, -1.0])

    Returns:
        wmc (float): Weighted model confidence. Higher is better.

    (Original metric)
    """
    probs = np.array(probs)
    targets = np.array(targets)
    assert len(probs) == len(targets)
    confidences = np.max(probs, axis=1)
    predictions = np.argmax(probs, axis=1)
    accuracies = predictions == targets
    weight_array = np.array([weights[0] if correct else weights[1] for correct in accuracies])
    wmc = np.dot(confidences, weight_array) / len(targets)
    return wmc
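
# Usage sketch (illustrative values): with the default weights [1.0, -1.0], confidently correct
# predictions push the score up and confidently wrong predictions pull it down.
if __name__ == "__main__":
    _probs = [[0.90, 0.10], [0.60, 0.40], [0.20, 0.80]]
    _targets = [0, 1, 1]
    print("WMC:", weighted_model_confidence(_probs, _targets))  # (0.9 - 0.6 + 0.8) / 3

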
def average_KL_divergence_with_uniform_distribution(probs, targets):
    """
    Average, over all data samples, of the KL divergence between the uniform distribution and each
    sample's predicted probability distribution, KL(U || P). Higher is better (more confident,
    less uniform predictions). Minimum possible value is 0.

    Args:
        probs (np.array of float or list of list of float): array of probabilities of dim (n, m)
            where n is the number of samples and m is the number of classes
        targets (np.array of int): array of ground truth labels of dim (n,) where n is the number
            of samples (not used by this metric)

    Returns:
        avg_KL_U (float): average KL divergence between the uniform distribution and the predicted
            distribution. Higher is better. Minimum possible value is 0.
    """
    probs = np.array(probs)
    targets = np.array(targets)
    assert len(probs) == len(targets)
    num_classes = probs.shape[1]  # assumption: class labels (targets) are 0, 1, ..., num_classes - 1
    unif = 1.0 / num_classes * np.ones(probs.shape)
    # rel_entr(unif, probs) gives the elementwise terms of KL(U || P); sum over classes per sample.
    KL_U = np.sum(rel_entr(unif, probs), axis=1)
    return np.mean(KL_U)
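
# Usage sketch (assumed inputs): a peaked prediction is far from uniform, so its KL(U || P) term
# is large, while a near-uniform prediction contributes roughly zero.
if __name__ == "__main__":
    _probs = [[0.95, 0.05], [0.50, 0.50]]
    _targets = [0, 1]  # not used by this metric; passed only for the shared signature
    print("Avg KL(U || P):", average_KL_divergence_with_uniform_distribution(_probs, _targets))

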
def average_pred_entropy(probs, targets):
    """
    Average predictive entropy of the predicted probability distributions. Lower is better.

    Args:
        probs (np.array of float or list of list of float): array of probabilities of dim (n, m)
            where n is the number of samples and m is the number of classes
        targets (np.array of int): array of ground truth labels of dim (n,) where n is the number
            of samples (not used by this metric)

    Returns:
        avg_pred_entropy (float): average predictive entropy. Lower is better.
    """
    probs = np.array(probs)
    targets = np.array(targets)
    assert len(probs) == len(targets)
    # Entropy of each sample's predictive distribution: -sum_j p_j * log(p_j).
    # scipy.special.entr computes -p * log(p) elementwise and returns 0 where p == 0,
    # which avoids NaNs for hard (one-hot) probability vectors.
    pred_entropy = np.sum(entr(probs), axis=1)
    return np.mean(pred_entropy)
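
# Usage sketch (assumed inputs): entropy is highest for a uniform prediction (ln 2 with two
# classes) and near zero for a nearly one-hot prediction, so a lower average indicates more
# confident predictions.
if __name__ == "__main__":
    _probs = [[0.99, 0.01], [0.50, 0.50]]
    _targets = [0, 1]  # not used by this metric
    print("Avg predictive entropy:", average_pred_entropy(_probs, _targets))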