jordyvl committed on
Commit 3f722df
1 Parent(s): 1519667

push to have sliders - kwargs - example data and reliability plot

Files changed (3)
  1. app.py +161 -12
  2. ece.py +48 -3
  3. tests.py +5 -32
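
In short: app.py gains slider/dropdown inputs for the metric kwargs plus a reliability diagram, and ece.py gains a detail flag that exposes the per-bin quantities the plot needs. A minimal sketch of the intended call, mirroring the test_ECE helper added to ece.py below (assumes ece.py is importable locally; the detailed keys are those named in the diff, not a stable API):

import numpy as np
from ece import ECE  # local module from this repo

predictions = np.array([[0.6, 0.2, 0.2], [0.0, 0.95, 0.05], [0.7, 0.1, 0.2]], dtype=np.float32)
references = np.array([0, 1, 2], dtype=np.int64)

res = ECE()._compute(predictions, references)  # scalar score, as before
print(res["ECE"])

# detail=True is meant to additionally return "y_bar", "p_bar", "bin_freq",
# "p_bar_cont" and "accuracy", which app.py's reliability_plot consumes.
detailed = ECE()._compute(predictions, references, detail=True)
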
app.py CHANGED
@@ -1,24 +1,173 @@
 import evaluate
+import numpy as np
+import pandas as pd
+import ast
+import json
+import gradio as gr
 from evaluate.utils import launch_gradio_widget
+from ece import ECE
+
+
+sliders = [
+    gr.Slider(0, 100, value=10, label="n_bins"),
+    gr.Slider(0, 100, value=None, label="bin_range", visible=False),  # DEV: need to have a double slider
+    gr.Dropdown(choices=["equal-range", "equal-mass"], value="equal-range", label="scheme"),
+    gr.Dropdown(choices=["upper-edge", "center"], value="upper-edge", label="proxy"),
+    gr.Dropdown(choices=[1, 2, np.inf], value=1, label="p"),
+]
+
+slider_defaults = [slider.value for slider in sliders]
+
+# example data
+df = dict()
+df["predictions"] = [[0.6, 0.2, 0.2], [0, 0.95, 0.05], [0.7, 0.1, 0.2]]
+df["references"] = [0, 1, 2]
+
+component = gr.inputs.Dataframe(
+    headers=["predictions", "references"], col_count=2, datatype="number", type="pandas"
+)
+
+component.value = [
+    [[0.6, 0.2, 0.2], 0],
+    [[0.7, 0.1, 0.2], 2],
+    [[0, 0.95, 0.05], 1],
+]
+sample_data = [[component] + slider_defaults]  ## json.dumps(df)
+
+
+metric = ECE()
+# module = evaluate.load("jordyvl/ece")
+# launch_gradio_widget(module)
+
+"""
+Switch inputs and compute_fn
+"""
+
+def reliability_plot(results):
+    # CE, calibrated_acc, empirical_acc, weights_ece
+    # {"ECE": ECE[0], "y_bar": ECE[1], "p_bar": ECE[2], "bin_freq": ECE[3]}
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    sns.set_style('white')
+    sns.set_context("paper", font_scale=1)  # 2
+    # plt.rcParams['figure.figsize'] = [10, 7]
+    plt.rcParams['figure.dpi'] = 300
+
+
+    fig = plt.figure()
+    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
+    ax2 = plt.subplot2grid((3, 1), (2, 0))
+
+    n_bins = len(results["y_bar"])
+    bin_range = [
+        results["y_bar"][0] - results["y_bar"][0],
+        results["y_bar"][-1],
+    ]  # np.linspace(0, 1, n_bins)
+    # if upper edge then minus binsize; same for center [but half]
+
+    ax1.plot(
+        np.linspace(bin_range[0], bin_range[1], n_bins),
+        np.linspace(bin_range[0], bin_range[1], n_bins),
+        color="darkgreen",
+        ls="dotted",
+        label="Perfect",
+    )
+    # ax1.plot(results["y_bar"], results["y_bar"], color="darkblue", label="Perfect")
+
+    anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
+    bin_freqs = np.zeros(n_bins)
+    bin_freqs[anindices] = results["bin_freq"]
+    ax2.hist(results["y_bar"], results["y_bar"], weights=bin_freqs)
+
+    widths = np.diff(results["y_bar"])
+    for j, bin in enumerate(results["y_bar"]):
+        perfect = results["y_bar"][j]
+        empirical = results["p_bar"][j]
+
+        if np.isnan(empirical):
+            continue
+
+        ax1.bar([perfect], height=[empirical], width=-widths[j], align="edge", color="lightblue")
+
+        if perfect == empirical:
+            continue
+
+    acc_plt = ax2.axvline(
+        x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy"
+    )
+    conf_plt = ax2.axvline(
+        x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
+    )
+    ax2.legend(handles=[acc_plt, conf_plt])
+
+    # Bin differences
+    ax1.set_ylabel("Conditional Expectation")
+    ax1.set_ylim([-0.05, 1.05])  # respective to bin range
+    ax1.legend(loc="lower right")
+    ax1.set_title("Reliability Diagram")
+
+    # Bin frequencies
+    ax2.set_xlabel("Confidence")
+    ax2.set_ylabel("Count")
+    ax2.legend(loc="upper left")  # , ncol=2
+    plt.tight_layout()
+    return fig
+
+def compute_and_plot(data, n_bins, bin_range, scheme, proxy, p):
+    # DEV: check on invalid datatypes with better warnings
+
+    if isinstance(data, pd.DataFrame):
+        data.dropna(inplace=True)
+
+    predictions = [
+        ast.literal_eval(prediction) if not isinstance(prediction, list) else prediction
+        for prediction in data["predictions"]
+    ]
+    references = [reference for reference in data["references"]]
+
+    results = metric._compute(
+        predictions,
+        references,
+        n_bins=n_bins,
+        # bin_range=None,  # not needed
+        scheme=scheme,
+        proxy=proxy,
+        p=p,
+        detail=True,
+    )
+
+    plot = reliability_plot(results)
+    return results["ECE"], plt.gcf()
+
+
+outputs = [gr.outputs.Textbox(label="ECE"), gr.outputs.Plot(label="Reliability diagram")]
+
+iface = gr.Interface(
+    fn=compute_and_plot,
+    inputs=[component] + sliders,
+    outputs=outputs,
+    description=metric.info.description,
+    article=metric.info.citation,
+    # examples=sample_data
+)
+
+# ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
+
+iface.launch()
+
+# dict = {"ECE": ECE[0], "y_bar": ECE[1], "p_bar": ECE[2], "bin_freq": ECE[3]}
+
+# references=[0, 1, 2], predictions=)
+# https://gradio.app/getting_started/#multiple-inputs-and-outputs
+## fix with sliders for all kwargs

-module = evaluate.load("jordyvl/ece")
-launch_gradio_widget(module)

 """
 DEV: #might be nice to also plot reliability diagram
 have sliders for kwargs :)

-import gradio as gr

 metric = ECE()

-iface = gr.Interface(
-    fn=compute,
-    inputs=gr.inputs.Dataframe(headers=["predictions", "references"], col_width=2, datatype="number"),
-    outputs=gr.outputs.Textbox(label="accuracy"),
-    description=metric.info.description,
-    article=metric.info.citation,
-)

-iface.launch()
-"""
+"""
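
For reference, a hedged sketch of the input format compute_and_plot above expects when driven outside the Gradio UI (hypothetical local usage, not part of the commit; predictions may arrive as strings from the Dataframe component, hence the ast.literal_eval):

import pandas as pd

data = pd.DataFrame({
    "predictions": ["[0.6, 0.2, 0.2]", "[0.7, 0.1, 0.2]", "[0, 0.95, 0.05]"],  # stringified lists
    "references": [0, 2, 1],
})
ece_value, figure = compute_and_plot(data, 10, None, "equal-range", "upper-edge", 1)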
 
ece.py CHANGED
@@ -18,6 +18,8 @@
 import evaluate
 import datasets
 import numpy as np
+from typing import Dict, Optional
+


 # TODO: Add BibTeX citation
@@ -161,7 +163,7 @@ def bin_calibrated_accuracy(bins, proxy="upper-edge"):
     return bins[:-1] + np.diff(bins) / 2


-def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge"):
+def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge", detail=False):
     """
     y_correct: binary (N x 1)
     P: normalized (N x 1) either max or per class
@@ -187,6 +189,8 @@ def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge"):
     elif np.isinf(p):  # max-ECE
         CE = np.max(abs(empirical_acc[anindices] - calibrated_acc[anindices]))

+    if detail:
+        return CE, calibrated_acc, empirical_acc, weights_ece
     return CE


@@ -196,7 +200,10 @@ def top_1_CE(Y, P, **kwargs):
     bins = create_bins(
         n_bins=kwargs["n_bins"], bin_range=kwargs["bin_range"], scheme=kwargs["scheme"], P=p_max
     )
-    return CE_estimate(y_correct, p_max, bins=bins, proxy=kwargs["proxy"])
+    CE = CE_estimate(y_correct, p_max, bins=bins, proxy=kwargs["proxy"], detail=kwargs["detail"])
+    if self.detail:
+        return {"ECE": CE[0], "y_bar": CE[1], "p_bar": CE[2], "bin_freq": CE[3], "p_bar_cont": np.mean(p_max, -1), "accuracy": np.mean(y_correct)}
+    return CE


 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
@@ -230,7 +237,14 @@ class ECE(evaluate.EvaluationModule):
         )

     def init_kwargs(
-        self, n_bins=10, bin_range=None, scheme="equal-range", proxy="upper-edge", p=1, **kwargs
+        self,
+        n_bins: int = 10,
+        bin_range: Optional[int] = [0, 1],
+        scheme: str = "equal-range",
+        proxy: str = "upper-edge",
+        p=1,
+        detail: bool = False,
+        **kwargs,
     ):
         # super(evaluate.EvaluationModule, self).__init__(**kwargs)
         self.n_bins = n_bins
@@ -238,6 +252,7 @@ class ECE(evaluate.EvaluationModule):
         self.scheme = scheme
         self.proxy = proxy
         self.p = p
+        self.detail = detail

     def _compute(self, predictions, references, **kwargs):

@@ -266,6 +281,36 @@ class ECE(evaluate.EvaluationModule):

         """Returns the scores"""
         ECE = top_1_CE(references, predictions, **self.__dict__)
+        if self.detail:
+            return ECE
         return {
             "ECE": ECE,
         }
+
+
+def test_ECE():
+    N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
+    K = 5  # K class problem
+
+    def random_mc_instance(concentration=1, onehot=False):
+        reference = np.argmax(
+            np.random.dirichlet(([concentration for _ in range(K)])), -1
+        )  # class targets
+        prediction = np.random.dirichlet(([concentration for _ in range(K)]))  # probabilities
+        if onehot:
+            reference = np.eye(K)[np.argmax(reference, -1)]
+        return reference, prediction
+
+    references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
+    references = np.array(references, dtype=np.int64)
+    predictions = np.array(predictions, dtype=np.float32)
+    res = ECE()._compute(predictions, references)
+    print(f"ECE: {res['ECE']}")
+
+    res = ECE()._compute(predictions, references, detail=True)
+    import pdb; pdb.set_trace()  # breakpoint 25274412 //
+
+    print(f"ECE: {res['ECE']}")
+
+if __name__ == '__main__':
+    test_ECE()
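
As background for the detail output: with p=1, equal-range bins and the upper-edge proxy, CE_estimate amounts to the usual expected calibration error, a bin-frequency-weighted gap between per-bin accuracy and per-bin confidence. A self-contained numeric sketch (illustrative only; the names and binning below are assumptions, not ece.py internals):

import numpy as np

p_max = np.array([0.62, 0.95, 0.71, 0.55])  # top-1 confidences
y_correct = np.array([1, 1, 0, 1])          # whether the top-1 prediction was right
bins = np.linspace(0, 1, 11)                # 10 equal-range bins

bin_ids = np.digitize(p_max, bins[1:-1])    # bin index per sample
ece = 0.0
for b in range(10):
    in_bin = bin_ids == b
    if not in_bin.any():
        continue
    acc_b = y_correct[in_bin].mean()        # empirical accuracy in the bin (p_bar-like)
    conf_b = bins[b + 1]                    # upper-edge proxy for bin confidence (y_bar-like)
    ece += in_bin.mean() * abs(acc_b - conf_b)  # weighted by bin frequency (bin_freq-like)
print(ece)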
tests.py CHANGED
@@ -1,39 +1,12 @@
 import numpy as np

 test_cases = [
+    {"predictions": [[0, 1], [1, 0]], "references": [1, 0], "result": {"ECE": 0}},
+    {"predictions": [[0, 1], [1, 0]], "references": [0, 1], "result": {"ECE": 1}},
     {
-        "predictions": [0, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0}
+        "predictions": [[0, 0.1, 0.9], [0.2, 0.8, 0]],
+        "references": [2, 0],  # kwargs?
+        "result": {"ECE": >0<1},
     },
-    {
-        "predictions": [1, 1],
-        "references": [1, 1],
-        "result": {"metric_score": 1}
-    },
-    {
-        "predictions": [1, 0],
-        "references": [1, 1],
-        "result": {"metric_score": 0.5}
-    }
 ]

-
-def test_ECE():
-    N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
-    K = 5  # K class problem
-
-    def random_mc_instance(concentration=1, onehot=False):
-        reference = np.argmax(
-            np.random.dirichlet(([concentration for _ in range(K)])), -1
-        )  # class targets
-        prediction = np.random.dirichlet(([concentration for _ in range(K)]))  # probabilities
-        if onehot:
-            reference = np.eye(K)[np.argmax(reference, -1)]
-        return reference, prediction
-
-    references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
-    references = np.array(references, dtype=np.int64)
-    predictions = np.array(predictions, dtype=np.float32)
-    res = ECE()._compute(predictions, references)
-    print(f"ECE: {res['ECE']}")
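
A hedged sketch of consuming the updated test_cases (assumes ece.py is importable; the third case's expected value is left open above, so only the first two are exercised):

from ece import ECE

for case in test_cases[:2]:
    res = ECE()._compute(case["predictions"], case["references"])
    print(res["ECE"], "expected:", case["result"]["ECE"])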