justinxzhao committed
Commit bd4620c
Parent: 30e2346

Add leaderboard graph and update about page.

Files changed (2)
  1. app.py +69 -7
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,6 +4,7 @@ from PIL import Image
 import base64
 from io import BytesIO
 import random
+import plotly.graph_objects as go
 
 # Define constants
 MAJOR_A_WIN = "A>>B"
@@ -173,6 +174,63 @@ tabs = st.tabs(
 # Define content for each tab
 with tabs[0]:
     _, mid_column, _ = st.columns([0.2, 0.6, 0.2])
+    mid_column.markdown("#### Leaderboard Graph")
+
+    df = df_leaderboard.copy()
+    df["Score"] = df["Council Arena EI Score (95% CI)"].apply(
+        lambda x: float(x.split(" ")[0])
+    )
+    df["Lower"] = df["Council Arena EI Score (95% CI)"].apply(
+        lambda x: float(x.split(" ")[1][1:-1])
+    )
+    df["Upper"] = df["Council Arena EI Score (95% CI)"].apply(
+        lambda x: float(x.split(" ")[2][:-1])
+    )
+
+    # Sort the DataFrame by Score in descending order
+    df = df.sort_values(by="Score", ascending=False)
+
+    # Create the bar chart
+    fig = go.Figure()
+
+    # Generate rainbow colors
+    num_bars = len(df)
+    colors = [f"hsl({int(360 / num_bars * i)}, 100%, 50%)" for i in range(num_bars)]
+
+    fig.add_trace(
+        go.Bar(
+            x=df["Score"],
+            y=df["LLM"],
+            orientation="h",
+            error_x=dict(
+                type="data",
+                array=df["Upper"],
+                arrayminus=-1 * df["Lower"],
+                thickness=0.5,
+                width=3,
+                color="black",
+            ),
+            marker=dict(color=colors, opacity=0.8),
+        )
+    )
+
+    fig.update_layout(
+        xaxis=dict(title="Council Emotional Intelligence Score", showgrid=True),
+        yaxis_title="LLM",
+        yaxis=dict(autorange="reversed"),
+        template="presentation",
+        width=1000,
+        height=700,
+    )
+
+    # Display the plot in Streamlit
+    mid_column.plotly_chart(fig)
+
+    mid_column.divider()
+
+    mid_column.markdown("#### Leaderboard Table")
+
+    # Display the table.
     mid_column.dataframe(df_leaderboard)
 
 
@@ -503,14 +561,18 @@ with tabs[2]:
     st.write("Check out the paper for more detailed analysis!")
 
 with tabs[-1]:
-    st.write(
-        """
-        Please reach out if you are interested in collaborating!
+    st.markdown(
+        """**Motivation**:
+
+Good LLM evaluations are [really hard](https://www.jasonwei.net/blog/evals), and newly released models often claim to be the best at something, citing their position on a benchmark or leaderboard. But what if we let the models themselves decide who's the best?
 
-        **Our Team:**
-        - Justin Zhao (justinxzhao@gmail.com)
-        - Flor Plaza (flor.[email protected])
-        - Amanda Cercas Curry (amanda.[email protected])
+**Main collaborators**:
+- [Justin Zhao](https://x.com/justinxzhao)
+- [Flor Plaza](https://x.com/florplaza22)
+- [Sam Paech](https://x.com/sam_paech)
+- [Federico Bianchi](https://x.com/federicobianchy)
+- [Sahand Sabour](https://x.com/SahandSabour)
+- [Amanda Cercas Curry](https://x.com/CurriedAmanda)
         """
     )
 
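For anyone reading the diff without the app in front of them: the three `.apply` lambdas assume every cell of the `Council Arena EI Score (95% CI)` column is a single space-separated string carrying the point estimate plus both confidence-interval offsets. A minimal sketch of that assumed format and the index arithmetic (the sample value is hypothetical, not taken from the live leaderboard):

```python
# Hypothetical cell value; only the parsing mirrors the lambdas in the diff.
cell = "57.3 (-1.2, +1.4)"

parts = cell.split(" ")        # ["57.3", "(-1.2,", "+1.4)"]
score = float(parts[0])        # 57.3 -> bar length
lower = float(parts[1][1:-1])  # -1.2 -> strips "(" and the trailing ","
upper = float(parts[2][:-1])   #  1.4 -> strips ")"

# Plotly error bars take positive magnitudes in both directions, which is why
# the diff passes arrayminus=-1 * df["Lower"] alongside array=df["Upper"].
assert upper == 1.4 and -1 * lower == 1.2
```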
requirements.txt CHANGED
@@ -1 +1,2 @@
-streamlit
+streamlit
+plotly
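With `plotly` added to the requirements, the change should be reproducible locally in the usual Streamlit way, e.g. `pip install -r requirements.txt` followed by `streamlit run app.py` (assuming `app.py` remains the Space's entry point, as the diff suggests).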