Spaces:
Runtime error
Runtime error
justinxzhao
committed on
Commit
•
bd4620c
1
Parent(s):
30e2346
Add leaderboard graph and update about page.
Browse files- app.py +69 -7
- requirements.txt +2 -1
app.py
CHANGED
@@ -4,6 +4,7 @@ from PIL import Image
|
|
4 |
import base64
|
5 |
from io import BytesIO
|
6 |
import random
|
|
|
7 |
|
8 |
# Define constants
|
9 |
MAJOR_A_WIN = "A>>B"
|
@@ -173,6 +174,63 @@ tabs = st.tabs(
|
|
173 |
# Define content for each tab
|
174 |
with tabs[0]:
|
175 |
_, mid_column, _ = st.columns([0.2, 0.6, 0.2])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
mid_column.dataframe(df_leaderboard)
|
177 |
|
178 |
|
@@ -503,14 +561,18 @@ with tabs[2]:
|
|
503 |
st.write("Check out the paper for more detailed analysis!")
|
504 |
|
505 |
with tabs[-1]:
|
506 |
-
st.
|
507 |
-
"""
|
508 |
-
|
|
|
509 |
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
|
|
|
|
|
|
514 |
"""
|
515 |
)
|
516 |
|
|
|
4 |
import base64
|
5 |
from io import BytesIO
|
6 |
import random
|
7 |
+
import plotly.graph_objects as go
|
8 |
|
9 |
# Define constants
|
10 |
MAJOR_A_WIN = "A>>B"
|
|
|
174 |
# Define content for each tab
|
175 |
with tabs[0]:
|
176 |
_, mid_column, _ = st.columns([0.2, 0.6, 0.2])
|
177 |
+
mid_column.markdown("#### Leaderboard Graph")
|
178 |
+
|
179 |
+
df = df_leaderboard.copy()
|
180 |
+
df["Score"] = df["Council Arena EI Score (95% CI)"].apply(
|
181 |
+
lambda x: float(x.split(" ")[0])
|
182 |
+
)
|
183 |
+
df["Lower"] = df["Council Arena EI Score (95% CI)"].apply(
|
184 |
+
lambda x: float(x.split(" ")[1][1:-1])
|
185 |
+
)
|
186 |
+
df["Upper"] = df["Council Arena EI Score (95% CI)"].apply(
|
187 |
+
lambda x: float(x.split(" ")[2][:-1])
|
188 |
+
)
|
189 |
+
|
190 |
+
# Sort the DataFrame by Score in descending order
|
191 |
+
df = df.sort_values(by="Score", ascending=False)
|
192 |
+
|
193 |
+
# Create the bar chart
|
194 |
+
fig = go.Figure()
|
195 |
+
|
196 |
+
# Generate rainbow colors
|
197 |
+
num_bars = len(df)
|
198 |
+
colors = [f"hsl({int(360 / num_bars * i)}, 100%, 50%)" for i in range(num_bars)]
|
199 |
+
|
200 |
+
fig.add_trace(
|
201 |
+
go.Bar(
|
202 |
+
x=df["Score"],
|
203 |
+
y=df["LLM"],
|
204 |
+
orientation="h",
|
205 |
+
error_x=dict(
|
206 |
+
type="data",
|
207 |
+
array=df["Upper"],
|
208 |
+
arrayminus=-1 * df["Lower"],
|
209 |
+
thickness=0.5,
|
210 |
+
width=3,
|
211 |
+
color="black",
|
212 |
+
),
|
213 |
+
marker=dict(color=colors, opacity=0.8),
|
214 |
+
)
|
215 |
+
)
|
216 |
+
|
217 |
+
fig.update_layout(
|
218 |
+
xaxis=dict(title="Council Emotional Intelligence Score", showgrid=True),
|
219 |
+
yaxis_title="LLM",
|
220 |
+
yaxis=dict(autorange="reversed"),
|
221 |
+
template="presentation",
|
222 |
+
width=1000,
|
223 |
+
height=700,
|
224 |
+
)
|
225 |
+
|
226 |
+
# Display the plot in Streamlit
|
227 |
+
mid_column.plotly_chart(fig)
|
228 |
+
|
229 |
+
mid_column.divider()
|
230 |
+
|
231 |
+
mid_column.markdown("#### Leaderboard Table")
|
232 |
+
|
233 |
+
# Display the table.
|
234 |
mid_column.dataframe(df_leaderboard)
|
235 |
|
236 |
|
|
|
561 |
st.write("Check out the paper for more detailed analysis!")
|
562 |
|
563 |
with tabs[-1]:
|
564 |
+
st.markdown(
|
565 |
+
"""**Motivation**:
|
566 |
+
|
567 |
+
Good LLM evaluations are [really hard](https://www.jasonwei.net/blog/evals), and newly released models often make their own claims about being the best at something, often citing their position on a benchmark or a leaderboard. But what if we let the models themselves decide who's the best?
|
568 |
|
569 |
+
**Main collaborators**:
|
570 |
+
- [Justin Zhao](https://x.com/justinxzhao)
|
571 |
+
- [Flor Plaza](https://x.com/florplaza22)
|
572 |
+
- [Sam Paech](https://x.com/sam_paech)
|
573 |
+
- [Federico Bianchi](https://x.com/federicobianchy)
|
574 |
+
- [Sahand Sabour](https://x.com/SahandSabour)
|
575 |
+
- [Amanda Cercas Curry](https://x.com/CurriedAmanda)
|
576 |
"""
|
577 |
)
|
578 |
|
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
streamlit
|
|
|
|
1 |
+
streamlit
|
2 |
+
plotly
|