Commit 7ee6d4e • justinxzhao committed
Parent(s): a056e0b

Add icon, and reorganize data samples.

Files changed:
- app.py (+73 -32)
- data/leaderboard_6_11.csv (+22 -0)
- img/lmc_icon.png (+0 -0)
app.py CHANGED
@@ -1,5 +1,8 @@
 import streamlit as st
 import pandas as pd
+from PIL import Image
+import base64
+from io import BytesIO
 
 # Define constants
 MAJOR_A_WIN = "A>>B"
@@ -39,10 +42,22 @@ def is_consistent(rating, reverse_rating):
     return False
 
 
+# Function to convert PIL image to base64
+def pil_to_base64(img):
+    buffered = BytesIO()
+    img.save(buffered, format="PNG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return img_str
+
+
 # Load your dataframes
 df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
 df_responses = pd.read_json("data/responses.jsonl", lines=True)
 df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
+df_leaderboard = (
+    pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
+)
+df_leaderboard = df_leaderboard.rename(columns={"EI Score": "EI Score (95% CI)"})
 
 # Prepare the scenario selector options
 df_test_set["scenario_option"] = (
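Note: `pil_to_base64` round-trips the icon through PIL only to re-encode it as PNG bytes. For a file that is already a PNG on disk, the same base64 payload can be produced from the raw bytes, with no PIL dependency. A minimal sketch of that alternative (ours, not part of the commit):

```python
import base64
from pathlib import Path

# Base64-encode the icon's raw bytes; functionally equivalent to
# pil_to_base64(Image.open(...)) when the source file is already a PNG.
img_str = base64.b64encode(Path("img/lmc_icon.png").read_bytes()).decode()
```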
@@ -88,7 +103,7 @@ with col3:
 # Custom CSS to center title and header
 center_css = """
 <style>
-h1, h2, h3, h4, h5, h6 {
+h1, h2{
     text-align: center;
 }
 </style>
@@ -96,22 +111,38 @@ h1, h2, h3, h4, h5, h6 {
 
 st.markdown(center_css, unsafe_allow_html=True)
 
+# Load an image
+image = Image.open("img/lmc_icon.png")
+
+# Convert the image to base64
+img_base64 = pil_to_base64(image)
+
+# HTML to center the image and embed base64 image
+centered_image_html = f"""
+<div style="text-align: center;">
+    <img src="data:image/png;base64,{img_base64}" width="50"/>
+</div>
+"""
+
+# Rendering the centered image
+st.markdown(centered_image_html, unsafe_allow_html=True)
+
 st.title("Language Model Council")
-st.subheader("
+st.subheader("Benchmarking Foundation Models on Highly Subjective Tasks by Consensus")
 
 # Create horizontal tabs
 tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])
 
 # Define content for each tab
 with tabs[0]:
-    st.
-    # Add your leaderboard results content here
-    leaderboard = {"Name": ["Alice", "Bob", "Charlie"], "Score": [95, 85, 75]}
-    st.table(leaderboard)
+    st.dataframe(df_leaderboard)
 
 with tabs[1]:
+    st.markdown("### 1. Select a scenario.")
     # Create the selectors
-    selected_scenario = st.selectbox(
+    selected_scenario = st.selectbox(
+        "Select Scenario", scenario_options, label_visibility="hidden"
+    )
 
     # Get the selected scenario details
     if selected_scenario:
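Note: the commit embeds the icon as a base64 data URI inside raw HTML because `st.image` has no alignment option. A columns-based approximation that avoids `unsafe_allow_html` (a sketch, not the commit's approach; the column ratios are arbitrary):

```python
import streamlit as st

# Approximate horizontal centering by placing the icon
# in the middle of three unevenly sized columns.
left, mid, right = st.columns([2, 1, 2])
with mid:
    st.image("img/lmc_icon.png", width=50)
```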
@@ -130,12 +161,16 @@ with tabs[1]:
 
     st.divider()
 
+    st.markdown("### 2. View responses.")
+
     # Create two columns for model selectors
     col1, col2 = st.columns(2)
 
     with col1:
         fixed_model = "qwen1.5-32B-Chat"
-        st.selectbox(
+        st.selectbox(
+            "Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
+        )
 
         # Get the response string for the fixed model
         if selected_scenario:
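Note: the explicit `key="fixed_model"` matters because Streamlit derives a widget's identity from its construction parameters: two selectboxes built with identical parameters raise `DuplicateWidgetID`, and an explicit `key` both prevents that and gives the widget a stable entry in `st.session_state`. A sketch of the pattern (the second column's selector is not shown in this diff, so `model_options` is our placeholder):

```python
import streamlit as st

fixed_model = "qwen1.5-32B-Chat"
model_options = ["model-a", "model-b"]  # placeholder list (ours)

col1, col2 = st.columns(2)
with col1:
    # Same label as the widget in col2, so distinct keys keep the IDs unique.
    st.selectbox("Select Model", [fixed_model], key="fixed_model")
with col2:
    selected_model = st.selectbox("Select Model", model_options, key="challenger_model")
```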
@@ -164,8 +199,35 @@ with tabs[1]:
 
     st.divider()
 
+    # Add bar charts for value counts of pairwise choices over all judges
+    st.markdown("### 3. Response judging")
+
+    st.markdown("#### All council members")
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        pairwise_counts_left = df_response_judging[
+            (df_response_judging["first_completion_by"] == fixed_model)
+            & (df_response_judging["second_completion_by"] == selected_model)
+        ]["pairwise_choice"].value_counts()
+        st.bar_chart(pairwise_counts_left)
+
+    with col2:
+        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        pairwise_counts_right = df_response_judging[
+            (df_response_judging["first_completion_by"] == selected_model)
+            & (df_response_judging["second_completion_by"] == fixed_model)
+        ]["pairwise_choice"].value_counts()
+
+        st.bar_chart(pairwise_counts_right)
+
     # Create the llm_judge selector
-
+    # st.write("**Select an individual judge for detailed inspection.**")
+    st.markdown("#### Individual LLM judges")
+    selected_judge = st.selectbox(
+        "Select Judge", judge_options, label_visibility="hidden"
+    )
 
     # Get the judging details for the selected judge and models
     if selected_judge and selected_scenario:
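Note: the two charts keep the A-first and B-first orderings separate, which makes any position bias among the judges visible. If a single combined tally is wanted, verdicts from the reversed ordering can be mirrored first. A sketch (the helper is ours, and the full label set is an assumption; only "A>>B" appears in this diff):

```python
import pandas as pd

# Mirror verdicts from the swapped ordering: "A>>B" with the models
# reversed is the same outcome as "B>>A". Label set assumed.
MIRROR = {"A>>B": "B>>A", "A>B": "B>A", "A=B": "A=B", "B>A": "A>B", "B>>A": "A>>B"}

def combined_counts(df, model_a, model_b):
    left = df[
        (df["first_completion_by"] == model_a)
        & (df["second_completion_by"] == model_b)
    ]["pairwise_choice"]
    right = df[
        (df["first_completion_by"] == model_b)
        & (df["second_completion_by"] == model_a)
    ]["pairwise_choice"].map(MIRROR)
    return pd.concat([left, right]).value_counts()
```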
@@ -193,7 +255,7 @@ with tabs[1]:
 
     # Display the judging details
     with col1:
-        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        # st.write(f"**{fixed_model}** vs **{selected_model}**")
         if not judging_details_left.empty:
             st.write(
                 f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
@@ -203,7 +265,7 @@ with tabs[1]:
         st.write("No judging details found for the selected combination.")
 
     with col2:
-        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        # st.write(f"**{selected_model}** vs **{fixed_model}**")
         if not judging_details_right.empty:
             st.write(
                 f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
@@ -212,27 +274,6 @@ with tabs[1]:
     else:
         st.write("No judging details found for the selected combination.")
 
-    st.divider()
-
-    # Add bar charts for value counts of pairwise choices over all judges
-    col1, col2 = st.columns(2)
-
-    with col1:
-        pairwise_counts_left = df_response_judging[
-            (df_response_judging["first_completion_by"] == fixed_model)
-            & (df_response_judging["second_completion_by"] == selected_model)
-        ]["pairwise_choice"].value_counts()
-
-        st.bar_chart(pairwise_counts_left)
-
-    with col2:
-        pairwise_counts_right = df_response_judging[
-            (df_response_judging["first_completion_by"] == selected_model)
-            & (df_response_judging["second_completion_by"] == fixed_model)
-        ]["pairwise_choice"].value_counts()
-
-        st.bar_chart(pairwise_counts_right)
-
 with tabs[2]:
     st.write("This is the about us page.")
     # Add your about us content here
data/leaderboard_6_11.csv ADDED
@@ -0,0 +1,22 @@
+Rank,LLM,Organization,EI Score,Release date (MM/YY),Chatbot Arena Elo,"MMLU
+(5-shot)",Size,License
+2,gpt-4o-2024-05-13,Open AI,"59.2 (-1.2, 1.7)",05/24,1287,88.7,—,Proprietary
+3,gpt-4-turbo-04-09,Open AI,"57.5 (-1.2, 1.7)",04/24,1256,—,—,Proprietary
+18,gpt-4-0613,Open AI,"26.9 (-1.4, 1.4)",06/23,1246,86.4,—,Proprietary
+19,gpt-3.5-turbo-0125,Open AI,"18.2 (-1.1, 1.1)",01/24,1102,70.0,—,Proprietary
+15,mistral-large-latest,Mistral,"33.9 (-1.5, 1.3)",02/24,1156,81.2,—,Proprietary
+17,open-mixtral-8x22b,Mistral,"29.3 (-1.6, 1.5)",04/24,1146,77.8,176 B,Apache 2.0
+14,open-mixtral-8x7b,Mistral,"34.4 (-1.4, 1.5)",12/23,1114,70.6,56 B,Apache 2.0
+8,llama-3-70b-chat-hf,Meta,"45.1 (-1.5, 1.4)",04/24,1208,82.0,70 B,Llama 3 Community
+16,llama-3-8b-chat-hf,Meta,"30.0 (-1.4, 1.4)",04/24,1153,68.4,8 B,Llama 3 Community
+20,gemini-1.5-pro-preview-0409,Google,"11.6 (-0.9, 0.8)",05/24,1268,81.9,—,Proprietary
+4,gemini-1.0-pro,Google,"50.6 (-1.2, 1.5)",04/24,1208,71.8,—,Proprietary
+10,dbrx,Databricks,"38.8 (-1.5, 1.9)",03/24,1103,73.7,132 B,DBRX LICENSE
+12,command-r-plus,Cohere,"35.6 (-1.7, 1.7)",04/24,1189,75.7,104 B,CC-BY-NC-4.0
+13,command-r,Cohere,"34.7 (-1.7, 1.5)",04/24,1147,68.2,35 B,CC-BY-NC-4.0
+5,claude-3-opus-20240229,Anthropic,"50.1 (-1.5, 1.4)",03/24,1248,86.8,—,Proprietary
+9,claude-3-sonnet-20240229,Anthropic,"42.5 (-1.5, 1.6)",03/24,1201,79.0,—,Proprietary
+11,claude-3-haiku-20240307,Anthropic,"38.6 (-1.7, 2.2)",03/24,1178,75.2,—,Proprietary
+1,qwen1.5-110B-chat,Alibaba,"65.6 (-1.2, 1.8)",02/24,1164,80.2,100 B,Qianwen LICENSE
+7,qwen1.5-72B-chat,Alibaba,"48.7 (-1.4, 1.6)",02/24,1152,77.4,72 B,Qianwen LICENSE
+6,qwen1.5-32B-chat,Alibaba,"50.0 (0.0, 0.0)",02/24,1126,74.3,32 B,Qianwen LICENSE
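Note: the header row above is split across two physical lines because the quoted field "MMLU\n(5-shot)" contains a literal newline; the file is still valid CSV, and pandas reads it as a single column. A quick check (ours):

```python
import pandas as pd

df = pd.read_csv("data/leaderboard_6_11.csv")
# The embedded newline survives as part of one column name.
assert "MMLU\n(5-shot)" in df.columns
```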
img/lmc_icon.png ADDED