Spaces:

llm-council
/

emotional-intelligence-arena

Running

File size: 9,183 Bytes

import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO

# Pairwise verdict labels used in the judging data
MAJOR_A_WIN = "A>>B"
MINOR_A_WIN = "A>B"
MINOR_B_WIN = "B>A"
MAJOR_B_WIN = "B>>A"
TIE = "A=B"

# Verdicts grouped by which side they favour (margin is ignored for consistency)
_A_WINS = frozenset({MAJOR_A_WIN, MINOR_A_WIN})
_B_WINS = frozenset({MAJOR_B_WIN, MINOR_B_WIN})


def is_consistent(rating, reverse_rating):
    """Return True when two verdicts on position-swapped responses agree.

    ``rating`` is the verdict for one presentation order and
    ``reverse_rating`` the verdict for the swapped order. Because the
    responses trade places, agreement means the winning *side* flips:
    an A-win must pair with a B-win (major/minor margin is ignored), and a
    tie must pair with a tie. Any other combination, including values
    that are not one of the known labels, is reported as inconsistent.

    Args:
        rating: Verdict label for the original order.
        reverse_rating: Verdict label for the swapped order.

    Returns:
        bool: True if the pair is consistent, False otherwise.
    """
    if rating in _A_WINS:
        return reverse_rating in _B_WINS
    if rating in _B_WINS:
        return reverse_rating in _A_WINS
    # Only a tie on both sides remains as a consistent outcome.
    return (rating, reverse_rating) == (TIE, TIE)


# Encode a PIL image as base64 text
def pil_to_base64(img):
    """Return *img* serialized as PNG and encoded as a base64 string.

    Args:
        img: Object exposing ``save(fp, format=...)``, such as a PIL image.

    Returns:
        str: Base64 text of the PNG bytes written by ``img.save``.
    """
    with BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode()


# Read the JSON-lines inputs: scenarios, model responses, and judge verdicts
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
df_responses = pd.read_json("data/responses.jsonl", lines=True)
df_response_judging = pd.read_json("data/response_judging.jsonl", lines=True)

# Leaderboard: order by rank, renumber the index from zero, and relabel the
# score column for display
df_leaderboard = (
    pd.read_csv("data/leaderboard_6_11.csv")
    .sort_values("Rank")
    .reset_index(drop=True)
    .rename(columns={"EI Score": "EI Score (95% CI)"})
)

# Scenario selector labels have the form "<emobench_id>: <scenario>"
df_test_set["scenario_option"] = (
    df_test_set["emobench_id"].astype(str).add(": ").add(df_test_set["scenario"])
)
scenario_options = df_test_set["scenario_option"].tolist()

# Distinct responder models, in order of first appearance
model_options = df_responses["llm_responder"].unique().tolist()

# Distinct judge models, in order of first appearance
judge_options = df_response_judging["llm_judge"].unique().tolist()

st.set_page_config(page_title="Language Model Council", page_icon="🧊", layout="wide")

# Three side-by-side columns, one per link button
col1, col2, col3 = st.columns(3)

# Inject CSS so each button stretches to the width of its column
st.markdown(
    """
<style>
div.stButton > button {
    width: 100%;
}
</style>
""",
    unsafe_allow_html=True,
)

# One button per column; clicking it writes the paired message underneath
button_specs = (
    (col1, "Blog", "Button 1 clicked"),
    (col2, "Paper", "Button 2 clicked"),
    (col3, "Github", "Button 3 clicked"),
)
for button_column, button_label, clicked_message in button_specs:
    with button_column:
        if st.button(button_label):
            st.write(clicked_message)

# Inject CSS that centers the title (h1) and header (h2) text
st.markdown(
    """
<style>
h1, h2{
    text-align: center;
}
</style>
""",
    unsafe_allow_html=True,
)

# Load the icon and embed it inline as a base64 data URI, centered on the page
image = Image.open("img/lmc_icon.png")
img_base64 = pil_to_base64(image)
st.markdown(
    f"""
<div style="text-align: center;">
    <img src="data:image/png;base64,{img_base64}" width="50"/>
</div>
""",
    unsafe_allow_html=True,
)

st.title("Language Model Council")
st.subheader("Benchmarking Foundation Models on Highly Subjective Tasks by Consensus")

# Horizontal tabs; the first one shows the leaderboard table
tabs = st.tabs(["Leaderboard Results", "Data Samples", "About Us"])

leaderboard_tab = tabs[0]
with leaderboard_tab:
    st.dataframe(df_leaderboard)

def _first_row_or_none(frame):
    """Return the first row of *frame* as a Series, or None when it has no rows."""
    return None if frame.empty else frame.iloc[0]


def _response_rows(emobench_id, responder):
    """Return the rows of df_responses for scenario *emobench_id* written by *responder*."""
    return df_responses[
        (df_responses["emobench_id"] == emobench_id)
        & (df_responses["llm_responder"] == responder)
    ]


def _judging_rows(first_model, second_model, judge=None):
    """Return judging rows with *first_model* shown first and *second_model* second.

    When *judge* is given, only rows from that judge are kept.
    """
    mask = (df_response_judging["first_completion_by"] == first_model) & (
        df_response_judging["second_completion_by"] == second_model
    )
    if judge is not None:
        mask = mask & (df_response_judging["llm_judge"] == judge)
    return df_response_judging[mask]


def _show_response(response_details):
    """Write the response text, or a fallback note when no row was found."""
    if response_details is None:
        st.write("No response found for the selected combination.")
    else:
        st.write(response_details["response_string"])


def _show_judging_details(judging_details):
    """Write one judge's verdict and reasoning, or a fallback note when missing."""
    if judging_details is None:
        st.write("No judging details found for the selected combination.")
    else:
        st.write(f"**Pairwise Choice:** {judging_details['pairwise_choice']}")
        st.code(judging_details["judging_response_string"])


with tabs[1]:
    st.markdown("### 1. Select a scenario.")
    selected_scenario = st.selectbox(
        "Select Scenario", scenario_options, label_visibility="hidden"
    )

    if selected_scenario:
        # Options are formatted "<emobench_id>: <scenario>", so the id is the
        # text before the first ": ".
        selected_emobench_id = int(selected_scenario.split(": ")[0])
        scenario_details = df_test_set[
            df_test_set["emobench_id"] == selected_emobench_id
        ].iloc[0]

        # Display the detailed dilemma and additional information
        st.write(scenario_details["detailed_dilemma"])
        with st.expander("Additional Information"):
            st.write(f"**LLM Author:** {scenario_details['llm_author']}")
            st.write(f"**Problem:** {scenario_details['problem']}")
            st.write(f"**Relationship:** {scenario_details['relationship']}")
            st.write(f"**Scenario:** {scenario_details['scenario']}")

    st.divider()

    st.markdown("### 2. View responses.")

    # Left column: the fixed reference model; right column: a user-chosen model
    col1, col2 = st.columns(2)

    with col1:
        fixed_model = "qwen1.5-32B-Chat"
        st.selectbox(
            "Select Model", [fixed_model], key="fixed_model", label_visibility="hidden"
        )

        if selected_scenario:
            # The lookup may match nothing; show a note instead of raising.
            response_details_fixed = _first_row_or_none(
                _response_rows(selected_emobench_id, fixed_model)
            )
            _show_response(response_details_fixed)

    with col2:
        selected_model = st.selectbox(
            "Select Model", model_options, key="dynamic_model"
        )

        if selected_model and selected_scenario:
            response_details_dynamic = _first_row_or_none(
                _response_rows(selected_emobench_id, selected_model)
            )
            _show_response(response_details_dynamic)

    st.divider()

    # Bar charts of pairwise-choice counts over all judges, one per ordering
    st.markdown("### 3. Response judging")

    # NOTE(review): none of the judging lookups below filter on the selected
    # scenario, so counts and details span every row for the model pair.
    # Confirm whether df_response_judging carries a scenario id to filter on.
    st.markdown("#### All council members")
    col1, col2 = st.columns(2)

    with col1:
        st.write(f"**{fixed_model}** vs **{selected_model}**")
        pairwise_counts_left = _judging_rows(fixed_model, selected_model)[
            "pairwise_choice"
        ].value_counts()
        st.bar_chart(pairwise_counts_left)

    with col2:
        st.write(f"**{selected_model}** vs **{fixed_model}**")
        pairwise_counts_right = _judging_rows(selected_model, fixed_model)[
            "pairwise_choice"
        ].value_counts()
        st.bar_chart(pairwise_counts_right)

    st.markdown("#### Individual LLM judges")
    selected_judge = st.selectbox(
        "Select Judge", judge_options, label_visibility="hidden"
    )

    if selected_judge and selected_scenario:
        col1, col2 = st.columns(2)

        # Either ordering may have no row (for example when the same model is
        # selected on both sides), so each lookup yields None rather than
        # indexing into an empty frame.
        judging_details_left = _first_row_or_none(
            _judging_rows(fixed_model, selected_model, selected_judge)
        )
        judging_details_right = _first_row_or_none(
            _judging_rows(selected_model, fixed_model, selected_judge)
        )

        # Consistency is only meaningful when both orderings were judged.
        if judging_details_left is not None and judging_details_right is not None:
            if is_consistent(
                judging_details_left["pairwise_choice"],
                judging_details_right["pairwise_choice"],
            ):
                st.success("The judge ratings are consistent.", icon="✅")
            else:
                st.warning("The judge ratings are inconsistent.", icon="⚠️")

        with col1:
            _show_judging_details(judging_details_left)

        with col2:
            _show_judging_details(judging_details_right)

about_tab = tabs[2]
with about_tab:
    st.write("This is the about us page.")
    # Body text of the About Us tab, rendered as markdown by st.write
    about_us_markdown = """
    **Our Mission:**
    To provide the best service and data insights.

    **Our Team:**
    - Alice
    - Bob
    - Charlie
    """
    st.write(about_us_markdown)