Spaces:

lmarena-ai
/

preference-proxy-evaluations

Running

App Files Files Community

Evan Frick committed on Oct 22

Commit

ebb58d8

•

1 Parent(s): 28a71da

qol

Browse files

Files changed (1) hide show

app.py +17 -4

app.py CHANGED Viewed

@@ -24,6 +24,10 @@ def load_data(file_path):
 def contains_list(column):
     return column.apply(lambda x: isinstance(x, list)).any()
 def main():
     # Load the JSON data
     data = load_data('results.json')
@@ -57,10 +61,19 @@ def main():
                 if isinstance(submetrics, dict):
                     for metric_name, value in submetrics.items():
                         # Create a compound key
-                        key = f"{subkey} - {metric_name}"
-                        flattened_metrics[key] = value
                 else:
                     flattened_metrics[subkey] = submetrics
             records.append({
                 "Model": model,
                 "Type": model_type,
@@ -81,7 +94,7 @@ def main():
     df = df.loc[:, ~df.apply(contains_list)]
     if "human" not in selected_benchmark:
-        df = df[sorted(df.columns, key=lambda s: s.lower() if s != "Type" else "A")]
     # Set 'Model' as the index
     df.set_index(["Model"], inplace=True)
@@ -122,7 +135,7 @@ def main():
     # Display the DataFrame
-    st.dataframe(df_display.sort_values(df_display.columns[1], ascending=False).style.background_gradient(cmap='summer_r', axis=0)
  if len(df_display) else df_display, use_container_width=True, height=500)
     # Optional: Allow user to download the data as CSV

 def contains_list(column):
     return column.apply(lambda x: isinstance(x, list)).any()
+INVERT = {'brier', 'loss'}
+SCALE = {'accuracy', 'row-wise pearson', 'confidence_agreement', 'spearman', 'kendalltau', 'arena_under_curve', 'mean_max_score', 'mean_end_score'}
 def main():
     # Load the JSON data
     data = load_data('results.json')
                 if isinstance(submetrics, dict):
                     for metric_name, value in submetrics.items():
                         # Create a compound key
+                        if metric_name in SCALE:
+                            value = 100 * value
+                        if metric_name in INVERT:
+                            key = f"{subkey} - (1 - {metric_name})"
+                            flattened_metrics[key] = 1 - value
+                        else:
+                            key = f"{subkey} - {metric_name}"
+                            flattened_metrics[key] = value
                 else:
                     flattened_metrics[subkey] = submetrics
             records.append({
                 "Model": model,
                 "Type": model_type,
     df = df.loc[:, ~df.apply(contains_list)]
     if "human" not in selected_benchmark:
+        df = df[sorted(df.columns, key=lambda s: s.replace("(1", "l").lower() if s != "Type" else "A")]
     # Set 'Model' as the index
     df.set_index(["Model"], inplace=True)
     # Display the DataFrame
+    st.dataframe(df_display.sort_values(df_display.columns[1], ascending=False).style.background_gradient(cmap='summer_r', axis=0).format(precision=4)
  if len(df_display) else df_display, use_container_width=True, height=500)
     # Optional: Allow user to download the data as CSV