Commit 7ee6d4e • justinxzhao committed • Parent(s): a056e0b

Add icon, and reorganize data samples.

Files changed:
- app.py +73 -32
- data/leaderboard_6_11.csv +22 -0
- img/lmc_icon.png +0 -0
app.py CHANGED
@@ -1,5 +1,8 @@
 import streamlit as st
 import pandas as pd
+from PIL import Image
+import base64
+from io import BytesIO
 
 # Define constants
 MAJOR_A_WIN = "A>>B"
@@ -39,10 +42,22 @@ def is_consistent(rating, reverse_rating):
     return False
 
 
+# Function to convert PIL image to base64
+def pil_to_base64(img):
+    buffered = BytesIO()
+    img.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return img_str
+
+
 # Load your dataframes
 df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
 df_responses = pd.read_json("data/responses.jsonl", lines=True)
 df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
+df_leaderboard = (
+    pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
+)
+df_leaderboard = df_leaderboard.rename(columns={"EI Score": "EI Score (95% CI)"})
 
 # Prepare the scenario selector options
 df_test_set["scenario_option"] = (
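Note: the `pil_to_base64` helper added in this hunk can be exercised outside Streamlit. A minimal sketch, assuming only Pillow is installed; the generated 4x4 image is a hypothetical stand-in for img/lmc_icon.png and is not part of the commit:

# Sketch: round-trip an in-memory image through pil_to_base64.
import base64
from io import BytesIO
from PIL import Image

def pil_to_base64(img):
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()

icon = Image.new("RGB", (4, 4), "red")  # hypothetical stand-in image
b64 = pil_to_base64(icon)
assert base64.b64decode(b64)[:8] == b"\x89PNG\r\n\x1a\n"  # valid PNG header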
@@ -88,7 +103,7 @@ with col3:
 # Custom CSS to center title and header
 center_css = """
 <style>
-h1, h2, h3, h4, h5, h6 {
+h1, h2{
     text-align: center;
 }
 </style>
@@ -96,22 +111,38 @@ h1, h2, h3, h4, h5, h6 {
 
 st.markdown(center_css, unsafe_allow_html=True)
 
+# Load an image
+image = Image.open("img/lmc_icon.png")
+
+# Convert the image to base64
+img_base64 = pil_to_base64(image)
+
+# HTML to center the image and embed base64 image
+centered_image_html = f"""
+<div style="text-align: center;">
+    <img src="data:image/png;base64,{img_base64}" width="50"/>
+</div>
+"""
+
+# Rendering the centered image
+st.markdown(centered_image_html, unsafe_allow_html=True)
+
 st.title("Language Model Council")
-st.subheader("
+st.subheader("Benchmarking Foundation Models on Highly Subjective Tasks by Consensus")
 
 # Create horizontal tabs
 tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])
 
 # Define content for each tab
 with tabs[0]:
-    st.
-    # Add your leaderboard results content here
-    leaderboard = {"Name": ["Alice", "Bob", "Charlie"], "Score": [95, 85, 75]}
-    st.table(leaderboard)
+    st.dataframe(df_leaderboard)
 
 with tabs[1]:
+    st.markdown("### 1. Select a scenario.")
     # Create the selectors
-    selected_scenario = st.selectbox(
+    selected_scenario = st.selectbox(
+        "Select Scenario", scenario_options, label_visibility="hidden"
+    )
 
     # Get the selected scenario details
     if selected_scenario:
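Note: the markup added above embeds the icon as a base64 data URI inside centered HTML, a common workaround since st.image takes no alignment argument. A sketch of the same pattern factored into a helper; render_centered_icon is hypothetical and not part of this commit:

import streamlit as st

def render_centered_icon(img_base64: str, width: int = 50) -> None:
    # Embed the image as a data URI inside centered HTML, as app.py does inline.
    st.markdown(
        f'<div style="text-align: center;">'
        f'<img src="data:image/png;base64,{img_base64}" width="{width}"/>'
        f"</div>",
        unsafe_allow_html=True,
    )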
@@ -130,12 +161,16 @@ with tabs[1]:
 
     st.divider()
 
+    st.markdown("### 2. View responses.")
+
     # Create two columns for model selectors
     col1, col2 = st.columns(2)
 
     with col1:
         fixed_model = "qwen1.5-32B-Chat"
-        st.selectbox(
+        st.selectbox(
+            "Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
+        )
 
         # Get the response string for the fixed model
         if selected_scenario:
@@ -164,8 +199,35 @@ with tabs[1]:
 
     st.divider()
 
+    # Add bar charts for value counts of pairwise choices over all judges
+    st.markdown("### 3. Response judging")
+
+    st.markdown("#### All council members")
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        pairwise_counts_left = df_response_judging[
+            (df_response_judging["first_completion_by"] == fixed_model)
+            & (df_response_judging["second_completion_by"] == selected_model)
+        ]["pairwise_choice"].value_counts()
+        st.bar_chart(pairwise_counts_left)
+
+    with col2:
+        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        pairwise_counts_right = df_response_judging[
+            (df_response_judging["first_completion_by"] == selected_model)
+            & (df_response_judging["second_completion_by"] == fixed_model)
+        ]["pairwise_choice"].value_counts()
+
+        st.bar_chart(pairwise_counts_right)
+
     # Create the llm_judge selector
-    selected_judge = st.selectbox(
+    # st.write("**Select an individual judge for detailed inpsection.**")
+    st.markdown("#### Individudal LLM judges")
+    selected_judge = st.selectbox(
+        "Select Judge", judge_options, label_visibility="hidden"
+    )
 
     # Get the judging details for the selected judge and models
     if selected_judge and selected_scenario:
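Note: each bar chart added above plots value counts of pairwise verdicts for one ordering of the model pair. A toy sketch of that filter; the column names follow the commit, but the frame below is made-up illustration, not project data:

import pandas as pd

df = pd.DataFrame({
    "first_completion_by":  ["qwen1.5-32B-Chat", "qwen1.5-32B-Chat", "gpt-4o"],
    "second_completion_by": ["gpt-4o", "gpt-4o", "qwen1.5-32B-Chat"],
    "pairwise_choice":      ["A>>B", "A>B", "B>A"],
})
counts = df[
    (df["first_completion_by"] == "qwen1.5-32B-Chat")
    & (df["second_completion_by"] == "gpt-4o")
]["pairwise_choice"].value_counts()
print(counts)  # A>>B: 1, A>B: 1 -- the series st.bar_chart renders

Running both orderings separately, as the commit does, surfaces position bias: the same pair can be judged differently depending on which model's response appears first.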
@@ -193,7 +255,7 @@ with tabs[1]:
 
     # Display the judging details
     with col1:
-        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        # st.write(f"**{fixed_model}** vs **{selected_model}**")
         if not judging_details_left.empty:
             st.write(
                 f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
@@ -203,7 +265,7 @@ with tabs[1]:
             st.write("No judging details found for the selected combination.")
 
     with col2:
-        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        # st.write(f"**{selected_model}** vs **{fixed_model}**")
         if not judging_details_right.empty:
             st.write(
                 f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
@@ -212,27 +274,6 @@ with tabs[1]:
         else:
             st.write("No judging details found for the selected combination.")
 
-    st.divider()
-
-    # Add bar charts for value counts of pairwise choices over all judges
-    col1, col2 = st.columns(2)
-
-    with col1:
-        pairwise_counts_left = df_response_judging[
-            (df_response_judging["first_completion_by"] == fixed_model)
-            & (df_response_judging["second_completion_by"] == selected_model)
-        ]["pairwise_choice"].value_counts()
-
-        st.bar_chart(pairwise_counts_left)
-
-    with col2:
-        pairwise_counts_right = df_response_judging[
-            (df_response_judging["first_completion_by"] == selected_model)
-            & (df_response_judging["second_completion_by"] == fixed_model)
-        ]["pairwise_choice"].value_counts()
-
-        st.bar_chart(pairwise_counts_right)
-
 with tabs[2]:
     st.write("This is the about us page.")
     # Add your about us content here
data/leaderboard_6_11.csv ADDED
@@ -0,0 +1,22 @@
+Rank,LLM,Organization,EI Score,Release date (MM/YY),Chatbot Arena Elo,"MMLU
+(5-shot)",Size,License
+2,gpt-4o-2024-05-13,Open AI,"59.2 (-1.2, 1.7)",05/24,1287,88.7,π,Proprietary
+3,gpt-4-turbo-04-09,Open AI,"57.5 (-1.2, 1.7)",04/24,1256,π,π,Proprietary
+18,gpt-4-0613,Open AI,"26.9 (-1.4, 1.4)",06/23,1246,86.4,π,Proprietary
+19,gpt-3.5-turbo-0125,Open AI,"18.2 (-1.1, 1.1)",01/24,1102,70.0,π,Proprietary
+15,mistral-large-latest,Mistral,"33.9 (-1.5, 1.3)",02/24,1156,81.2,π,Proprietary
+17,open-mixtral-8x22b,Mistral,"29.3 (-1.6, 1.5)",04/24,1146,77.8,176 B,Apache 2.0
+14,open-mixtral-8x7b,Mistral,"34.4 (-1.4, 1.5)",12/23,1114,70.6,56 B,Apache 2.0
+8,llama-3-70b-chat-hf,Meta,"45.1 (-1.5, 1.4)",04/24,1208,82.0,70 B,Llama 3 Community
+16,llama-3-8b-chat-hf,Meta,"30.0 (-1.4, 1.4)",04/24,1153,68.4,8 B,Llama 3 Community
+20,gemini-1.5-pro-preview-0409,Google,"11.6 (-0.9, 0.8)",05/24,1268,81.9,π,Proprietary
+4,gemini-1.0-pro,Google,"50.6 (-1.2, 1.5)",04/24,1208,71.8,π,Proprietary
+10,dbrx,Databricks,"38.8 (-1.5, 1.9)",03/24,1103,73.7,132 B,DBRX LICENSE
+12,command-r-plus,Cohere,"35.6 (-1.7, 1.7)",04/24,1189,75.7,104 B,CC-BY-NC-4.0
+13,command-r,Cohere,"34.7 (-1.7, 1.5)",04/24,1147,68.2,35 B,CC-BY-NC-4.0
+5,claude-3-opus-20240229,Anthropic,"50.1 (-1.5, 1.4)",03/24,1248,86.8,π,Proprietary
+9,claude-3-sonnet-20240229,Anthropic,"42.5 (-1.5, 1.6)",03/24,1201,79.0,π,Proprietary
+11,claude-3-haiku-20240307,Anthropic,"38.6 (-1.7, 2.2)",03/24,1178,75.2,π,Proprietary
+1,qwen1.5-110B-chat,Alibaba,"65.6 (-1.2, 1.8)",02/24,1164,80.2,100 B,Qianwen LICENSE
+7,qwen1.5-72B-chat,Alibaba,"48.7 (-1.4, 1.6)",02/24,1152,77.4,72 B,Qianwen LICENSE
+6,qwen1.5-32B-chat,Alibaba,"50.0 (0.0, 0.0)",02/24,1126,74.3,32 B,Qianwen LICENSE
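Note: the header's "MMLU\n(5-shot)" field contains a quoted newline, so it spans two physical lines but parses as a single column; that is why the pd.read_csv call added to app.py works on this file as-is. A quick check, assuming the file above:

import pandas as pd

df = pd.read_csv("data/leaderboard_6_11.csv")
print("MMLU\n(5-shot)" in df.columns)  # True: quoted newline kept in one header
df = df.sort_values("Rank").reset_index(drop=True)  # mirrors the load in app.py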
img/lmc_icon.png ADDED