Spaces:

jordyvl
/

ece

Runtime error

App Files Files Community

jordyvl committed on Jun 30, 2022

Commit

9d4511f

•

1 Parent(s): 2afab11

plt.hist might not be the right plotting function: it overrides the existing bins

Browse files

Files changed (3) hide show

README.md +1 -1
local_app.py +44 -20
tests.py +11 -6

README.md CHANGED Viewed

@@ -65,7 +65,7 @@ The module returns a dictionary with a key-value pair, e.g., {"ECE": 0.64}.
 <!---
 *Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
 -->
-```
 N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
 K = 5  # K class problem

 <!---
 *Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
 -->
+```python
 N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
 K = 5  # K class problem

local_app.py CHANGED Viewed

@@ -11,11 +11,12 @@ from ece import ECE  # loads local instead
 import matplotlib.pyplot as plt
 """
 import seaborn as sns
 sns.set_style('white')
-sns.set_context("paper", font_scale=1)  # 2
 """
 # plt.rcParams['figure.figsize'] = [10, 7]
 plt.rcParams["figure.dpi"] = 300
@@ -61,6 +62,7 @@ metric = ECE()
 Switch inputs and compute_fn
 """
 def default_plot():
     fig = plt.figure()
     ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
@@ -85,7 +87,25 @@ def default_plot():
     plt.tight_layout()
     return fig
 def reliability_plot(results):
     fig = plt.figure()
     ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
     ax2 = plt.subplot2grid((3, 1), (2, 0))
@@ -96,36 +116,36 @@ def reliability_plot(results):
         results["y_bar"][-1],
     ]  # np.linspace(0, 1, n_bins)
     # if upper edge then minus binsize; same for center [but half]
     ranged = np.linspace(bin_range[0], bin_range[1], n_bins)
     ax1.plot(
         ranged,
         ranged,
-        color="darkgreen",
         ls="dotted",
         label="Perfect",
     )
-    # ax1.plot(results["y_bar"], results["y_bar"], color="darkblue", label="Perfect")
     anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
     bin_freqs = np.zeros(n_bins)
     bin_freqs[anindices] = results["bin_freq"]
-    ax2.hist(results["y_bar"], results["y_bar"], weights=bin_freqs)
-    # widths = np.diff(results["y_bar"])
-    for j, bin in enumerate(results["y_bar"]):
-        perfect = results["y_bar"][j]
-        empirical = results["p_bar"][j]
-        if np.isnan(empirical):
-            continue
-        #width=-ranged[j],
-        ax1.bar([perfect], height=[empirical],  align="edge", color="lightblue")
-        """
-        if perfect == empirical:
-            continue
-        """
     acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
     conf_plt = ax2.axvline(
         x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
@@ -134,14 +154,18 @@ def reliability_plot(results):
     # Bin differences
     ax1.set_ylabel("Conditional Expectation")
-    ax1.set_ylim([-0.05, 1.05])  # respective to bin range
-    ax1.legend(loc="lower right")
     ax1.set_title("Reliability Diagram")
     # Bin frequencies
     ax2.set_xlabel("Confidence")
     ax2.set_ylabel("Count")
     ax2.legend(loc="upper left")  # , ncol=2
     plt.tight_layout()
     return fig
@@ -173,7 +197,7 @@ def compute_and_plot(data, n_bins, bin_range, scheme, proxy, p):
 outputs = [gr.outputs.Textbox(label="ECE"), gr.Plot(label="Reliability diagram")]
-#outputs[1].value = default_plot().__dict__
 iface = gr.Interface(
     fn=compute_and_plot,

 import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
 """
 import seaborn as sns
 sns.set_style('white')
+sns.set_context("paper", font_scale=1)
 """
 # plt.rcParams['figure.figsize'] = [10, 7]
 plt.rcParams["figure.dpi"] = 300
 Switch inputs and compute_fn
 """
 def default_plot():
     fig = plt.figure()
     ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
     plt.tight_layout()
     return fig
def over_under_confidence(results):
    """Pick one bar colour per bin of a reliability diagram.

    For every position ``j`` of ``results["y_bar"]`` the value
    ``results["y_bar"][j]`` (the "perfect" reference) is compared with
    ``results["p_bar"][j]`` (the "empirical" value):

    * ``"limegreen"``  -- both values agree up to floating-point tolerance,
    * ``"dodgerblue"`` -- the empirical value is below the reference,
    * ``"orangered"``  -- every other case.

    Args:
        results: Mapping with indexable ``"y_bar"`` and ``"p_bar"`` entries.
            ``"p_bar"`` must be at least as long as ``"y_bar"``; extra
            trailing ``"p_bar"`` entries are ignored.

    Returns:
        A list of colour names, one per entry of ``results["y_bar"]``.

    Raises:
        IndexError: If ``"p_bar"`` is shorter than ``"y_bar"``.
    """
    # Function-scope import keeps this block self-contained.
    from math import isclose

    p_bar = results["p_bar"]
    colors = []
    for j, perfect in enumerate(results["y_bar"]):
        empirical = p_bar[j]
        # Exact ``==`` on floats misclassifies values that differ only by
        # rounding (e.g. 0.3 vs 0.1 + 0.2), so compare with a tolerance.
        if isclose(perfect, empirical, rel_tol=1e-9, abs_tol=1e-12):
            bin_color = "limegreen"
        elif empirical < perfect:
            bin_color = "dodgerblue"
        else:
            # NOTE: NaN compares false to everything, so a NaN entry also
            # ends up here, exactly as it did with the original ``==`` check.
            bin_color = "orangered"
        colors.append(bin_color)
    return colors
 def reliability_plot(results):
+    #DEV: might still need to write tests in case of equal mass binning
     fig = plt.figure()
     ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
     ax2 = plt.subplot2grid((3, 1), (2, 0))
         results["y_bar"][-1],
     ]  # np.linspace(0, 1, n_bins)
     # if upper edge then minus binsize; same for center [but half]
+    # rwidth is dependent on the binning
+    B, bins, patches = ax1.hist(
+        results["y_bar"], weights=results["p_bar"][:-1] #rwidth=len(results["p_bar"]/len(results["p_bar"]-1 )) #, range=(0,1),
+    )  # , rwidth=1, align="right") #
+    colors = over_under_confidence(results)
+    for b in range(len(B)):
+        patches[b].set_facecolor(colors[b])  # color based on over/underconfidence
     ranged = np.linspace(bin_range[0], bin_range[1], n_bins)
     ax1.plot(
         ranged,
         ranged,
+        color="limegreen",
         ls="dotted",
         label="Perfect",
     )
+    ax1handles = [
+        mpatches.Patch(color="orangered", label="Overconfident"),
+        mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
+        mpatches.Patch(color="dodgerblue", label="Underconfident"),
+    ]
     anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
     bin_freqs = np.zeros(n_bins)
     bin_freqs[anindices] = results["bin_freq"]
+    ax2.hist(results["y_bar"], weights=bin_freqs, color="midnightblue") #bins=results["y_bar"],
+    # DEV: nicer would be to plot like a polygon
+    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
     acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
     conf_plt = ax2.axvline(
         x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
     # Bin differences
     ax1.set_ylabel("Conditional Expectation")
+    ax1.set_ylim([0, 1.05])  # respective to bin range
+    ax1.legend(loc="lower right", handles=ax1handles)
     ax1.set_title("Reliability Diagram")
+    # ax1.set_xticks([0]+results["y_bar"])
+    ax1.set_xlim([-0.05, 1.05])  # respective to bin range
     # Bin frequencies
     ax2.set_xlabel("Confidence")
     ax2.set_ylabel("Count")
     ax2.legend(loc="upper left")  # , ncol=2
+    # ax2.set_xticks([0, ]+results["y_bar"])
+    ax2.set_xlim([-0.05, 1.05])  # respective to bin range
     plt.tight_layout()
     return fig
 outputs = [gr.outputs.Textbox(label="ECE"), gr.Plot(label="Reliability diagram")]
+# outputs[1].value = default_plot().__dict__
 iface = gr.Interface(
     fn=compute_and_plot,

tests.py CHANGED Viewed

@@ -1,12 +1,17 @@
-import numpy as np
 test_cases = [
     {"predictions": [[0, 1], [1, 0]], "references": [1, 0], "result": {"ECE": 0}},
     {"predictions": [[0, 1], [1, 0]], "references": [0, 1], "result": {"ECE": 1}},
     {
-        "predictions": [[0, 0.1, 0.9], [0.2, 0.8, 0]],
-        "references": [2, 0],  # kwargs?
-        "result": {"ECE": >0<1},
     },
-]

 test_cases = [
     {"predictions": [[0, 1], [1, 0]], "references": [1, 0], "result": {"ECE": 0}},
     {"predictions": [[0, 1], [1, 0]], "references": [0, 1], "result": {"ECE": 1}},
     {
+        "predictions": [[0.6, 0.2, 0.2], [0, 0.95, 0.05], [0.75, 0.05 ,0.2]],
+        "references": [0, 1, 2],
+        "result": {"ECE": ((abs((0==0)-0.7) + abs((1==1)-1) + abs((2==0)-0.8))/3)},
+        #all predictions in separate bins
+    },
+    {
+        "predictions": [[0.6, 0.2, 0.2], [0, 0.95, 0.05], [0.7, 0.1 ,0.2]],
+        "references": [0, 1, 2],
+        "result": {"ECE": abs((0==0)-0.7 + (2==0)-0.7)/3 + abs((1==1)-1)/3},
+        #some predictions in same bin
     },
+# DEV: make more advanced tests including differing kwargs