justinxzhao committed
Commit 7ee6d4e
1 Parent(s): a056e0b

Add icon, and reorganize data samples.

Files changed (3):
  1. app.py +73 -32
  2. data/leaderboard_6_11.csv +22 -0
  3. img/lmc_icon.png +0 -0
app.py CHANGED

@@ -1,5 +1,8 @@
 import streamlit as st
 import pandas as pd
+from PIL import Image
+import base64
+from io import BytesIO

 # Define constants
 MAJOR_A_WIN = "A>>B"
@@ -39,10 +42,22 @@ def is_consistent(rating, reverse_rating):
     return False


+# Function to convert PIL image to base64
+def pil_to_base64(img):
+    buffered = BytesIO()
+    img.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return img_str
+
+
 # Load your dataframes
 df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
 df_responses = pd.read_json("data/responses.jsonl", lines=True)
 df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
+df_leaderboard = (
+    pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
+)
+df_leaderboard = df_leaderboard.rename(columns={"EI Score": "EI Score (95% CI)"})

 # Prepare the scenario selector options
 df_test_set["scenario_option"] = (
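Note: the pil_to_base64 helper added above inlines the icon as a base64 data URI, so the HTML <img> tag needs no static file route. A minimal round-trip sketch of the same encoding (standalone; the 1x1 test image is illustrative and not part of this commit):

import base64
from io import BytesIO

from PIL import Image


def pil_to_base64(img):
    # Serialize the PIL image to an in-memory PNG, then base64-encode it.
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()


# Encode a 1x1 red pixel, decode it back, and confirm the pixels survive.
original = Image.new("RGB", (1, 1), color=(255, 0, 0))
restored = Image.open(BytesIO(base64.b64decode(pil_to_base64(original))))
assert list(restored.getdata()) == list(original.getdata())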
@@ -88,7 +103,7 @@ with col3:
 # Custom CSS to center title and header
 center_css = """
 <style>
-h1, h2, h3, h4, h5, h6 {
+h1, h2 {
     text-align: center;
 }
 </style>
@@ -96,22 +111,38 @@ h1, h2, h3, h4, h5, h6 {

 st.markdown(center_css, unsafe_allow_html=True)

+# Load an image
+image = Image.open("img/lmc_icon.png")
+
+# Convert the image to base64
+img_base64 = pil_to_base64(image)
+
+# HTML to center the image and embed the base64 image
+centered_image_html = f"""
+<div style="text-align: center;">
+    <img src="data:image/png;base64,{img_base64}" width="50"/>
+</div>
+"""
+
+# Render the centered image
+st.markdown(centered_image_html, unsafe_allow_html=True)
+
 st.title("Language Model Council")
-st.subheader("Applied to emotional intelligence")
+st.subheader("Benchmarking Foundation Models on Highly Subjective Tasks by Consensus")

 # Create horizontal tabs
 tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

 # Define content for each tab
 with tabs[0]:
-    st.write("This is the leaderboard results page.")
-    # Add your leaderboard results content here
-    leaderboard = {"Name": ["Alice", "Bob", "Charlie"], "Score": [95, 85, 75]}
-    st.table(leaderboard)
+    st.dataframe(df_leaderboard)

 with tabs[1]:
+    st.markdown("### 1. Select a scenario.")
     # Create the selectors
-    selected_scenario = st.selectbox("Select Scenario", scenario_options)
+    selected_scenario = st.selectbox(
+        "Select Scenario", scenario_options, label_visibility="hidden"
+    )

     # Get the selected scenario details
     if selected_scenario:
@@ -130,12 +161,16 @@ with tabs[1]:

     st.divider()

+    st.markdown("### 2. View responses.")
+
     # Create two columns for model selectors
     col1, col2 = st.columns(2)

     with col1:
         fixed_model = "qwen1.5-32B-Chat"
-        st.selectbox("Select Model", [fixed_model], key="fixed_model")
+        st.selectbox(
+            "Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
+        )

         # Get the response string for the fixed model
         if selected_scenario:
@@ -164,8 +199,35 @@ with tabs[1]:

     st.divider()

+    # Add bar charts for value counts of pairwise choices over all judges
+    st.markdown("### 3. Response judging")
+
+    st.markdown("#### All council members")
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        pairwise_counts_left = df_response_judging[
+            (df_response_judging["first_completion_by"] == fixed_model)
+            & (df_response_judging["second_completion_by"] == selected_model)
+        ]["pairwise_choice"].value_counts()
+        st.bar_chart(pairwise_counts_left)
+
+    with col2:
+        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        pairwise_counts_right = df_response_judging[
+            (df_response_judging["first_completion_by"] == selected_model)
+            & (df_response_judging["second_completion_by"] == fixed_model)
+        ]["pairwise_choice"].value_counts()
+
+        st.bar_chart(pairwise_counts_right)
+
     # Create the llm_judge selector
-    selected_judge = st.selectbox("Select Judge", judge_options)
+    # st.write("**Select an individual judge for detailed inspection.**")
+    st.markdown("#### Individual LLM judges")
+    selected_judge = st.selectbox(
+        "Select Judge", judge_options, label_visibility="hidden"
+    )

     # Get the judging details for the selected judge and models
     if selected_judge and selected_scenario:
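Note: each bar chart above receives a pandas Series built by value_counts over the pairwise_choice column, one bar per rating label. A toy illustration (standalone sketch; rating labels beyond the MAJOR_A_WIN = "A>>B" constant visible in this file are assumed to follow the same A/B scheme):

import pandas as pd

# Toy judging records for one model pair in one presentation order.
df = pd.DataFrame({"pairwise_choice": ["A>>B", "A>B", "A>B", "A=B", "B>A"]})

# value_counts returns a label -> count Series, which st.bar_chart
# plots directly as one bar per rating label.
counts = df["pairwise_choice"].value_counts()
assert counts["A>B"] == 2
assert counts["A>>B"] == 1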
 
@@ -193,7 +255,7 @@ with tabs[1]:

     # Display the judging details
     with col1:
-        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        # st.write(f"**{fixed_model}** vs **{selected_model}**")
         if not judging_details_left.empty:
             st.write(
                 f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
@@ -203,7 +265,7 @@ with tabs[1]:
         st.write("No judging details found for the selected combination.")

     with col2:
-        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        # st.write(f"**{selected_model}** vs **{fixed_model}**")
         if not judging_details_right.empty:
             st.write(
                 f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
@@ -212,27 +274,6 @@ with tabs[1]:
         else:
             st.write("No judging details found for the selected combination.")

-    st.divider()
-
-    # Add bar charts for value counts of pairwise choices over all judges
-    col1, col2 = st.columns(2)
-
-    with col1:
-        pairwise_counts_left = df_response_judging[
-            (df_response_judging["first_completion_by"] == fixed_model)
-            & (df_response_judging["second_completion_by"] == selected_model)
-        ]["pairwise_choice"].value_counts()
-
-        st.bar_chart(pairwise_counts_left)
-
-    with col2:
-        pairwise_counts_right = df_response_judging[
-            (df_response_judging["first_completion_by"] == selected_model)
-            & (df_response_judging["second_completion_by"] == fixed_model)
-        ]["pairwise_choice"].value_counts()
-
-        st.bar_chart(pairwise_counts_right)
-
 with tabs[2]:
     st.write("This is the about us page.")
     # Add your about us content here
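Note: the hunk headers above reference is_consistent(rating, reverse_rating), and the paired left/right tallies exist precisely to surface order effects: a judge is self-consistent when swapping the response order mirrors the verdict. A plausible sketch under that assumption (only MAJOR_A_WIN appears verbatim in the diff; the other constants and the mirror table are a reconstruction, not the repository's actual implementation):

# Assumed grading constants; only MAJOR_A_WIN is visible in the diff.
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
TIE = "A=B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"

# A verdict and its reverse-order counterpart should be mirror images.
MIRROR = {
    MAJOR_A_WIN: MAJOR_B_WIN,
    MINOR_A_WIN: MINOR_B_WIN,
    TIE: TIE,
    MINOR_B_WIN: MINOR_A_WIN,
    MAJOR_B_WIN: MAJOR_A_WIN,
}


def is_consistent(rating, reverse_rating):
    return MIRROR.get(rating) == reverse_rating


assert is_consistent(MAJOR_A_WIN, MAJOR_B_WIN)
assert not is_consistent(MINOR_A_WIN, MINOR_A_WIN)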
data/leaderboard_6_11.csv ADDED

@@ -0,0 +1,22 @@
+Rank,LLM,Organization,EI Score,Release date (MM/YY),Chatbot Arena Elo,"MMLU (5-shot)",Size,License
+2,gpt-4o-2024-05-13,Open AI,"59.2 (-1.2, 1.7)",05/24,1287,88.7,🔒,Proprietary
+3,gpt-4-turbo-04-09,Open AI,"57.5 (-1.2, 1.7)",04/24,1256,🔒,🔒,Proprietary
+18,gpt-4-0613,Open AI,"26.9 (-1.4, 1.4)",06/23,1246,86.4,🔒,Proprietary
+19,gpt-3.5-turbo-0125,Open AI,"18.2 (-1.1, 1.1)",01/24,1102,70.0,🔒,Proprietary
+15,mistral-large-latest,Mistral,"33.9 (-1.5, 1.3)",02/24,1156,81.2,🔒,Proprietary
+17,open-mixtral-8x22b,Mistral,"29.3 (-1.6, 1.5)",04/24,1146,77.8,176 B,Apache 2.0
+14,open-mixtral-8x7b,Mistral,"34.4 (-1.4, 1.5)",12/23,1114,70.6,56 B,Apache 2.0
+8,llama-3-70b-chat-hf,Meta,"45.1 (-1.5, 1.4)",04/24,1208,82.0,70 B,Llama 3 Community
+16,llama-3-8b-chat-hf,Meta,"30.0 (-1.4, 1.4)",04/24,1153,68.4,8 B,Llama 3 Community
+20,gemini-1.5-pro-preview-0409,Google,"11.6 (-0.9, 0.8)",05/24,1268,81.9,🔒,Proprietary
+4,gemini-1.0-pro,Google,"50.6 (-1.2, 1.5)",04/24,1208,71.8,🔒,Proprietary
+10,dbrx,Databricks,"38.8 (-1.5, 1.9)",03/24,1103,73.7,132 B,DBRX LICENSE
+12,command-r-plus,Cohere,"35.6 (-1.7, 1.7)",04/24,1189,75.7,104 B,CC-BY-NC-4.0
+13,command-r,Cohere,"34.7 (-1.7, 1.5)",04/24,1147,68.2,35 B,CC-BY-NC-4.0
+5,claude-3-opus-20240229,Anthropic,"50.1 (-1.5, 1.4)",03/24,1248,86.8,🔒,Proprietary
+9,claude-3-sonnet-20240229,Anthropic,"42.5 (-1.5, 1.6)",03/24,1201,79.0,🔒,Proprietary
+11,claude-3-haiku-20240307,Anthropic,"38.6 (-1.7, 2.2)",03/24,1178,75.2,🔒,Proprietary
+1,qwen1.5-110B-chat,Alibaba,"65.6 (-1.2, 1.8)",02/24,1164,80.2,100 B,Qianwen LICENSE
+7,qwen1.5-72B-chat,Alibaba,"48.7 (-1.4, 1.6)",02/24,1152,77.4,72 B,Qianwen LICENSE
+6,qwen1.5-32B-chat,Alibaba,"50.0 (0.0, 0.0)",02/24,1126,74.3,32 B,Qianwen LICENSE
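Note: each EI Score cell packs a point estimate plus 95% confidence-interval offsets into a single string (e.g. "59.2 (-1.2, 1.7)"), which is why app.py renames the column to "EI Score (95% CI)" for display. A hypothetical parser in case the numbers are ever needed separately (split_ei_score is an illustrative name, not part of the commit):

import re

import pandas as pd

EI_RE = re.compile(r"(-?\d+(?:\.\d+)?) \((-?\d+(?:\.\d+)?), (-?\d+(?:\.\d+)?)\)")


def split_ei_score(cell):
    # "59.2 (-1.2, 1.7)" -> (59.2, -1.2, 1.7)
    match = EI_RE.fullmatch(cell)
    if match is None:
        raise ValueError(f"Unexpected EI Score format: {cell!r}")
    return tuple(float(group) for group in match.groups())


df = pd.read_csv("data/leaderboard_6_11.csv")
parsed = df["EI Score"].apply(split_ei_score)
df["EI"] = parsed.str[0]
df["CI low"] = parsed.str[1]
df["CI high"] = parsed.str[2]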
img/lmc_icon.png ADDED