justinxzhao committed
Commit 29e2769 · 1 parent: bfcc00c

Add index.html shim, and add hero.svg

Files changed:
- app.py +150 -56
- img/hero.png +0 -0
- img/hero.svg +0 -0
- index.html +7 -0
app.py
CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
 from PIL import Image
 import base64
 from io import BytesIO
+import random
 
 # Define constants
 MAJOR_A_WIN = "A>>B"
@@ -50,6 +51,14 @@ def pil_to_base64(img):
     return img_str
 
 
+# Function to convert PIL image to base64
+def pil_svg_to_base64(img):
+    buffered = BytesIO()
+    img.save(buffered, format="SVG")
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return img_str
+
+
 # Load your dataframes
 df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
 df_responses = pd.read_json("data/responses.jsonl", lines=True)
@@ -57,7 +66,9 @@ df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)
 df_leaderboard = (
     pd.read_csv("data/leaderboard_6_11.csv").sort_values("Rank").reset_index(drop=True)
 )
-df_leaderboard = df_leaderboard.rename(
+df_leaderboard = df_leaderboard.rename(
+    columns={"EI Score": "Council Arena EI Score (95% CI)"}
+)
 
 # Prepare the scenario selector options
 df_test_set["scenario_option"] = (
@@ -84,7 +95,6 @@ div.stButton > button {
 }
 </style>
 """
-
 st.markdown(full_width_button_css, unsafe_allow_html=True)
 
 # Create a button that triggers the JavaScript function
@@ -104,8 +114,11 @@ with col2:
     st.write("Button 2 clicked")
 
 with col3:
-
-
+    st.link_button(
+        "Github",
+        "https://github.com/llm-council/llm-council",
+        use_container_width=True,
+    )
 
 # Custom CSS to center title and header
 center_css = """
@@ -118,35 +131,48 @@ h1, h2, h6{
 
 st.markdown(center_css, unsafe_allow_html=True)
 
-#
-image = Image.open("img/lmc_icon.png")
-
-#
-
-
-#
-
-
-
-
-"""
-
-# Rendering the centered image
-st.markdown(centered_image_html, unsafe_allow_html=True)
-
+# Centered icon.
+# image = Image.open("img/lmc_icon.png")
+# img_base64 = pil_to_base64(image)
+# centered_image_html = f"""
+# <div style="text-align: center;">
+#     <img src="data:image/png;base64,{img_base64}" width="50"/>
+# </div>
+# """
+# st.markdown(centered_image_html, unsafe_allow_html=True)
+
+# Title and subtitle.
 st.title("Language Model Council")
 st.markdown(
-    "###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus"
+    "###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
 )
 
-
-
-
+# Render hero image.
+with open("img/hero.svg", "r") as file:
+    svg_content = file.read()
 
-
-
-
-
+left_co, cent_co, last_co = st.columns([0.2, 0.6, 0.2])
+with cent_co:
+    st.image(svg_content, use_column_width=True)
+
+
+with cent_co.expander("Abstract"):
+    st.markdown(
+        """The rapid advancement of Large Language Models (LLMs) necessitates robust
+and challenging benchmarks. Leaderboards like Chatbot Arena rank LLMs based
+on how well their responses align with human preferences. However, many tasks
+such as those related to emotional intelligence, creative writing, or persuasiveness,
+are highly subjective and often lack majoritarian human agreement. Judges may
+have irreconcilable disagreements about what constitutes a better response. To
+address the challenge of ranking LLMs on highly subjective tasks, we propose
+a novel benchmarking framework, the Language Model Council (LMC). The
+LMC operates through a democratic process to: 1) formulate a test set through
+equal participation, 2) administer the test among council members, and 3) evaluate
+responses as a collective jury. We deploy a council of 20 newest LLMs on an
+open-ended emotional intelligence task: responding to interpersonal dilemmas.
+Our results show that the LMC produces rankings that are more separable, robust,
+and less biased than those from any individual LLM judge, and is more consistent
+with a human-established leaderboard compared to other benchmarks."""
     )
 st.markdown(
     "This leaderboard comes from deploying a Council of 20 LLMs on an **open-ended emotional intelligence task: responding to interpersonal dilemmas**."
@@ -175,22 +201,66 @@ def colored_text_box(text, background_color, text_color="black"):
     return html_code
 
 
+# Ensure to initialize session state variables if they do not exist
+if "selected_scenario" not in st.session_state:
+    st.session_state.selected_scenario = None
+
+if "selected_model" not in st.session_state:
+    st.session_state.selected_model = None
+
+if "selected_judge" not in st.session_state:
+    st.session_state.selected_judge = None
+
+
+# Define callback functions to update session state
+def update_scenario():
+    st.session_state.selected_scenario = st.session_state.scenario_selector
+
+
+def update_model():
+    st.session_state.selected_model = st.session_state.model_selector
+
+
+def update_judge():
+    st.session_state.selected_judge = st.session_state.judge_selector
+
+
+def randomize_selection():
+    st.session_state.selected_scenario = random.choice(scenario_options)
+    st.session_state.selected_model = random.choice(model_options)
+    st.session_state.selected_judge = random.choice(judge_options)
+
+
 with tabs[1]:
+    # Add randomize button at the top of the app
+    _, mid_column, _ = st.columns([0.4, 0.2, 0.4])
+    mid_column.button(
+        ":game_die: Randomize!", on_click=randomize_selection, type="primary"
+    )
+
     st.markdown("### 1. Select a scenario.")
     # Create the selectors
-    selected_scenario = st.selectbox(
-        "Select Scenario",
+    st.session_state.selected_scenario = st.selectbox(
+        "Select Scenario",
+        scenario_options,
+        label_visibility="hidden",
+        key="scenario_selector",
+        on_change=update_scenario,
+        index=(
+            scenario_options.index(st.session_state.selected_scenario)
+            if st.session_state.selected_scenario
+            else 0
+        ),
     )
 
     # Get the selected scenario details
-    if selected_scenario:
-        selected_emobench_id = int(selected_scenario.split(": ")[0])
+    if st.session_state.selected_scenario:
+        selected_emobench_id = int(st.session_state.selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]
 
        # Display the detailed dilemma and additional information
-        # st.write(scenario_details["detailed_dilemma"])
        st.markdown(
            colored_text_box(
                scenario_details["detailed_dilemma"], "#eeeeeeff", "black"
@@ -217,14 +287,13 @@ with tabs[1]:
        )
 
        # Get the response string for the fixed model
-        if selected_scenario:
+        if st.session_state.selected_scenario:
            response_details_fixed = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
                & (df_responses["llm_responder"] == fixed_model)
            ].iloc[0]
 
            # Display the response string
-            # st.write(response_details_fixed["response_string"])
            st.markdown(
                colored_text_box(
                    response_details_fixed["response_string"], "#eeeeeeff", "black"
@@ -233,19 +302,26 @@ with tabs[1]:
            )
 
    with col2:
-        selected_model = st.selectbox(
-            "Select Model",
+        st.session_state.selected_model = st.selectbox(
+            "Select Model",
+            model_options,
+            key="model_selector",
+            on_change=update_model,
+            index=(
+                model_options.index(st.session_state.selected_model)
+                if st.session_state.selected_model
+                else 0
+            ),
        )
 
        # Get the response string for the selected model
-        if selected_model and selected_scenario:
+        if st.session_state.selected_model and st.session_state.selected_scenario:
            response_details_dynamic = df_responses[
                (df_responses["emobench_id"] == selected_emobench_id)
-                & (df_responses["llm_responder"] == selected_model)
+                & (df_responses["llm_responder"] == st.session_state.selected_model)
            ].iloc[0]
 
            # Display the response string
-            # st.write(response_details_dynamic["response_string"])
            st.markdown(
                colored_text_box(
                    response_details_dynamic["response_string"], "#eeeeeeff", "black"
@@ -262,43 +338,65 @@ with tabs[1]:
    col1, col2 = st.columns(2)
 
    with col1:
-        st.write(f"**{fixed_model}** vs **{selected_model}**")
+        st.write(f"**{fixed_model}** vs **{st.session_state.selected_model}**")
        pairwise_counts_left = df_response_judging[
            (df_response_judging["first_completion_by"] == fixed_model)
-            & (
+            & (
+                df_response_judging["second_completion_by"]
+                == st.session_state.selected_model
+            )
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_left)
 
    with col2:
-        st.write(f"**{selected_model}** vs **{fixed_model}**")
+        st.write(f"**{st.session_state.selected_model}** vs **{fixed_model}**")
        pairwise_counts_right = df_response_judging[
-            (
+            (
+                df_response_judging["first_completion_by"]
+                == st.session_state.selected_model
+            )
            & (df_response_judging["second_completion_by"] == fixed_model)
        ]["pairwise_choice"].value_counts()
        st.bar_chart(pairwise_counts_right)
 
    # Create the llm_judge selector
-    st.markdown("####
-    selected_judge = st.selectbox(
-        "Select Judge",
+    st.markdown("#### Individual LLM judges")
+    st.session_state.selected_judge = st.selectbox(
+        "Select Judge",
+        judge_options,
+        label_visibility="hidden",
+        key="judge_selector",
+        on_change=update_judge,
+        index=(
+            judge_options.index(st.session_state.selected_judge)
+            if st.session_state.selected_judge
+            else 0
+        ),
    )
 
    # Get the judging details for the selected judge and models
-    if selected_judge and selected_scenario:
+    if st.session_state.selected_judge and st.session_state.selected_scenario:
        col1, col2 = st.columns(2)
 
        judging_details_left = df_response_judging[
-            (df_response_judging["llm_judge"] == selected_judge)
+            (df_response_judging["llm_judge"] == st.session_state.selected_judge)
            & (df_response_judging["first_completion_by"] == fixed_model)
-            & (
+            & (
+                df_response_judging["second_completion_by"]
+                == st.session_state.selected_model
+            )
        ].iloc[0]
 
        judging_details_right = df_response_judging[
-            (df_response_judging["llm_judge"] == selected_judge)
-            & (
+            (df_response_judging["llm_judge"] == st.session_state.selected_judge)
+            & (
+                df_response_judging["first_completion_by"]
+                == st.session_state.selected_model
+            )
            & (df_response_judging["second_completion_by"] == fixed_model)
        ].iloc[0]
 
+        # Render consistency.
        if is_consistent(
            judging_details_left["pairwise_choice"],
            judging_details_right["pairwise_choice"],
@@ -309,12 +407,10 @@ with tabs[1]:
 
        # Display the judging details
        with col1:
-            # st.write(f"**{fixed_model}** vs **{selected_model}**")
            if not judging_details_left.empty:
                st.write(
                    f"**Pairwise Choice:** {judging_details_left['pairwise_choice']}"
                )
-                # st.code(judging_details_left["judging_response_string"])
                st.markdown(
                    colored_text_box(
                        judging_details_left["judging_response_string"],
@@ -327,12 +423,10 @@ with tabs[1]:
                st.write("No judging details found for the selected combination.")
 
        with col2:
-            # st.write(f"**{selected_model}** vs **{fixed_model}**")
            if not judging_details_right.empty:
                st.write(
                    f"**Pairwise Choice:** {judging_details_right['pairwise_choice']}"
                )
-                # st.code(judging_details_right["judging_response_string"])
                st.markdown(
                    colored_text_box(
                        judging_details_right["judging_response_string"],
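The core pattern this commit introduces in app.py is keeping each selectbox in sync with st.session_state so that a single "Randomize!" button callback can drive all of the selectors. Below is a minimal, self-contained sketch of that pattern for one selector; the option list and labels are placeholder values, not the app's real data.

```python
import random

import streamlit as st

# Placeholder options; the real app derives these from its dataframes.
scenario_options = ["1: Dilemma A", "2: Dilemma B", "3: Dilemma C"]

# Initialize the derived session-state slot on the first run.
if "selected_scenario" not in st.session_state:
    st.session_state.selected_scenario = None


def update_scenario():
    # Widget callback: copy the selectbox value into the derived slot.
    st.session_state.selected_scenario = st.session_state.scenario_selector


def randomize_selection():
    # Button callback: pick a random option; the selectbox re-renders with it.
    st.session_state.selected_scenario = random.choice(scenario_options)


st.button(":game_die: Randomize!", on_click=randomize_selection, type="primary")

st.session_state.selected_scenario = st.selectbox(
    "Select Scenario",
    scenario_options,
    key="scenario_selector",
    on_change=update_scenario,
    index=(
        scenario_options.index(st.session_state.selected_scenario)
        if st.session_state.selected_scenario
        else 0
    ),
)

st.write("Current selection:", st.session_state.selected_scenario)
```

Keeping the widget key (scenario_selector) separate from the derived state (selected_scenario) lets the Randomize callback update the selection without writing directly to a widget's own key, which Streamlit disallows once the widget has been instantiated in the same run.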
img/hero.png
ADDED
img/hero.svg
ADDED
index.html
ADDED
@@ -0,0 +1,7 @@
+<iframe
+  id="your-iframe-id"
+  src="https://llm-council-emotional-intelligence-arena.hf.space"
+  frameborder="0"
+  width="100%"
+  height="100%"
+></iframe>