import gradio as gr
import pandas as pd
import numpy as np
import random
import plotly.graph_objects as go
from bs4 import BeautifulSoup
import plotly.express as px

file_result_score = 'ko_bench.csv'
file_full_lb = 'mt_bench_240805.csv'
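# Input data (inferred from the columns used below):
# - ko_bench.csv: per-model, per-turn Ko-Bench results (model, judge_model, turn, score, and the eight category columns)
# - mt_bench_240805.csv: MT-Bench leaderboard snapshot (Model, key, MT-bench (score), Organization, License, Link)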
def add_hf_link(row):
    # Build a Hugging Face URL for open models; Gemini models link to the Gemini API page instead
    organization, model = row['model'].split('__')
    if organization.lower() not in ['google', 'openai', 'anthropic']:
        row['link'] = f"https://huggingface.co/{organization}/{model}"
    if organization.lower() == 'google' and 'gemini' in model:
        row['link'] = "https://ai.google.dev/gemini-api"
    return row
# read csv
df_result_score = pd.read_csv(file_result_score)
df_full_lb = pd.read_csv(file_full_lb)

# base dataframes
df = df_result_score.copy()
df['model'] = df['model'].str.split('__').str[1]

df_rs = df_result_score.copy()
df_rs['link'] = ''
df_rs = df_rs.apply(add_hf_link, axis=1)
df_rs['organization'] = df_rs['model'].str.split('__').str[0]
df_rs['model'] = df_rs['model'].str.split('__').str[1]

df_full_lboard = df_full_lb.copy()
df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True)  # rename MT-Bench's GPT-4-1106-preview to gpt-4-0125-preview
df_rs.replace("", np.nan, inplace=True)  # normalize empty strings to NaN before aggregation
def custom_mean(series):
    # Aggregate a grouped column: pass 'link'/'organization' through, average everything else numerically
    if series.name == 'link' or series.name == 'organization':
        return series.values[0]
    numeric_series = pd.to_numeric(series, errors='coerce')  # coerce the series to numeric
    return numeric_series.mean() if not numeric_series.isna().all() else np.nan  # average if at least one non-NaN value remains
def get_mt_bench(model):  # look up a model's MT-Bench score, matching model names case-insensitively
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['MT-bench (score)'].values[0]
    return ''
def get_organization(row):  # look up a model's organization (case-insensitive), with manual overrides for models missing from MT-Bench
    model = row['model']
    if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
        return 'Mistral'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'KISTI'
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['Organization'].values[0]
    if row['organization'] != '' and pd.notna(row['organization']):
        organization = row['organization'].lower()
        if organization == 'qwen':
            return 'Alibaba'
        elif organization == 'google':
            return 'Google'
        elif organization == 'lgai-exaone':
            return 'LGAI'
    return row['organization']
def get_license(model):  # look up a model's license (case-insensitive), with manual overrides for models missing from MT-Bench
    if pd.Series(model).str.contains('mistral-large|WizardLM-2-8x22B|ko-gemma-2', case=False, regex=True).any():
        return 'Apache-2.0'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'llama3'
    elif pd.Series(model).str.contains('Ko-Llama-3-8B-Instruct', case=False, regex=True).any():
        return 'Llama Community'
    elif pd.Series(model).str.contains('claude|gemini|EXAONE-3.0-7.8B-Instruct', case=False, regex=True).any():
        return 'Proprietary'
    elif pd.Series(model).str.contains('qwen', case=False, regex=True).any():
        if pd.Series(model).str.contains('max', case=False, regex=True).any():
            return 'Proprietary'
        else:
            return 'Qianwen LICENSE'
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['License'].values[0]
    return ''
def get_link(row):  # fill in the model link from the MT-Bench leaderboard (matched case-insensitively on 'key') when it is not already set
    if row['link'] != '' and pd.notna(row['link']):
        return row
    model_lower = row['model'].lower()
    matching_rows = df_full_lboard[df_full_lboard['key'].str.lower() == model_lower]
    if not matching_rows.empty:
        row['link'] = matching_rows['Link'].values[0]
    return row
def add_link(row):  # wrap the model name in an HTML anchor when a link is available
    if pd.isna(row['link']):
        row['link'] = ''
    if row['link'] != '':
        row['model'] = f"<a href={row['link']}>{row['model']}</a>"
    return row
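# Leaderboard tables built below: df_full_rs feeds the combined Ko-Bench tab, df_openai and df_keval the per-judge tabs,
# and df (per-turn category scores) backs the Model Detail View radar charts.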
# dataframe_full (combined Ko-Bench leaderboard)
df_full_rs = df_rs.copy()
df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
df_full_rs = df_full_rs.drop(columns=['turn'])  # drop 'turn' so the groupby below averages the turn 1 and turn 2 scores per model
df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model', 'judge_model']}).reset_index()
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)
df_full_rs['Ko-Bench/openai'] = ''  # add Ko-Bench/openai and Ko-Bench/keval columns
df_full_rs['Ko-Bench/keval'] = ''
for idx, j_model in df_full_rs['judge_model'].items():
    if j_model == 'keval':
        df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
    else:
        df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
df_full_rs = df_full_rs.drop(columns=['judge_model'])
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index()  # merge the per-judge rows into one row per model
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)
df_full_rs['MT-Bench'] = ''  # add MT-Bench column
df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
df_full_rs['Organization'] = ''  # add Organization column
df_full_rs['Organization'] = df_full_rs.apply(get_organization, axis=1)
df_full_rs['License'] = ''  # add License column
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
plot_models = df_full_rs['model'].unique()  # model list for the Model Detail View tab
df_full_rs = df_full_rs.apply(get_link, axis=1)
df_full_rs = df_full_rs.apply(add_link, axis=1)
df_full_rs = df_full_rs.drop(columns=['Ko-Bench', 'link', 'organization'])
# per-judge dataframes: add the MT-Bench column to df_rs before splitting by judge
df_rs['MT-Bench'] = ''  # add MT-Bench column
df_rs['MT-Bench'] = df_rs['model'].apply(get_mt_bench)
df_rs['MT-Bench'] = df_rs['MT-Bench'].str.replace('-', '', regex=False)
df_rs.replace("", np.nan, inplace=True)  # normalize empty strings to NaN before aggregation
# dataframe_openai (OpenAI-judged leaderboard)
df_openai = df_rs[df_rs['judge_model'] != 'keval'].copy()
df_openai = df_openai.drop(columns=['judge_model', 'turn'])  # drop 'turn' so the groupby below averages the turn 1 and turn 2 scores per model
df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
df_openai = df_openai.round(2)
df_openai = df_openai.apply(get_link, axis=1)
df_openai = df_openai.apply(add_link, axis=1)
df_openai = df_openai.drop(columns=['link', 'organization'])
df_openai = df_openai.sort_values(by='score', ascending=False)
df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
# dataframe_keval (keval-judged leaderboard)
df_keval = df_rs[df_rs['judge_model'] == 'keval'].copy()
df_keval = df_keval.drop(columns=['judge_model', 'turn'])  # drop 'turn' so the groupby below averages the turn 1 and turn 2 scores per model
df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
df_keval = df_keval.round(2)
df_keval = df_keval.apply(get_link, axis=1)
df_keval = df_keval.apply(add_link, axis=1)
df_keval = df_keval.drop(columns=['link', 'organization'])
df_keval = df_keval.sort_values(by='score', ascending=False)
df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
# model detail view
plot_models_list = plot_models.tolist()
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
colors_openai = ['#ff0000', '#ff1493', '#115e02', '#21ad05']
colors_keval = ['#ff0000', '#ff1493', '#0000ff', '#0592eb']
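# Trace colors for the radar charts below: red/pink for the top-ranked model's turn 1/2,
# green shades (OpenAI judge) or blue shades (keval judge) for the selected model's turn 1/2.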
random.seed(42)
def search_dataframe(query):  # search df for rows containing the query
    if not query:
        return df  # return the full DataFrame when there is no query
    filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)]
    return filtered_df
def radar_chart(categories, Top1_turn1, Top1_turn2, Selected_model_turn1, Selected_model_turn2, category_labels, judge):  # draw a radar chart for the given per-category scores
    #categories = categories.split(',')
    # flatten the single-row lists returned by df.loc[...].values.tolist()
    Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
    Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]
    Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
    Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
    values_lists = [
        list(map(float, Top1_turn1)),
        list(map(float, Top1_turn2)),
        list(map(float, Selected_model_turn1)),
        list(map(float, Selected_model_turn2))
    ]
    if judge == "openai":
        colors = colors_openai
        title_text = "< Openai >"
    else:
        colors = colors_keval
        title_text = "< Keval >"
    fig = go.Figure()
    for i, values in enumerate(values_lists):
        if len(categories) != len(values):
            return f"Error in dataset {i+1}: Number of categories and values must be the same."
        fig.add_trace(go.Scatterpolar(
            r=values + [values[0]],              # close the loop of the radar chart
            theta=categories + [categories[0]],  # close the loop of the radar chart
            mode='lines',
            name=category_labels[i],             # label for the dataset
            line=dict(color=colors[i])
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, max(max(values) for values in values_lists)],
                showline=True,
            ),
            angularaxis=dict(
                rotation=0,
                direction='clockwise'
            )
        ),
        showlegend=True,
        #width=650,   # plot width
        #height=650,  # plot height
        margin=dict(l=1000, r=20, t=20, b=20),
        #autosize=False,
        paper_bgcolor='white',
        plot_bgcolor='lightgrey',
        title=dict(
            text=title_text,   # chart title
            x=0.5,             # title x position (0=left, 0.5=center, 1=right)
            xanchor='center',  # title x anchor (center, left, right)
            y=0.95,            # title y position (0=bottom, 1=top)
            yanchor='top'      # title y anchor (top, middle, bottom)
        )
    )
    return fig
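# The two helpers below compare the selected model against each judge's top-ranked model, turn by turn.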
def search_openai_plot(dropdown_model):  # radar chart for the OpenAI judge
    openai_top_model = df_openai.iat[0, df_openai.columns.get_loc('model')]
    openai_top_model = BeautifulSoup(openai_top_model, 'html.parser').get_text()  # strip the HTML anchor from the model name
    condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == openai_top_model)
    top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
    condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == openai_top_model)
    top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
    condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    openai_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
    condition4 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
    category_labels = []
    category_labels.append(openai_top_model + " /Turn 1")
    category_labels.append(openai_top_model + " /Turn 2")
    category_labels.append(dropdown_model + " /Turn 1")
    category_labels.append(dropdown_model + " /Turn 2")
    fig = radar_chart(CATEGORIES, top1_openai_turn1, top1_openai_turn2, openai_turn1, openai_turn2, category_labels, "openai")
    return fig
def search_keval_plot(dropdown_model):  # radar chart for the keval judge
    keval_top_model = df_keval.iat[0, df_keval.columns.get_loc('model')]
    keval_top_model = BeautifulSoup(keval_top_model, 'html.parser').get_text()  # strip the HTML anchor from the model name
    condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == keval_top_model)
    top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
    condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == keval_top_model)
    top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
    condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    keval_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
    condition4 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
    category_labels = []
    category_labels.append(keval_top_model + " /Turn 1")
    category_labels.append(keval_top_model + " /Turn 2")
    category_labels.append(dropdown_model + " /Turn 1")
    category_labels.append(dropdown_model + " /Turn 2")
    fig = radar_chart(CATEGORIES, top1_keval_turn1, top1_keval_turn2, keval_turn1, keval_turn2, category_labels, "keval")
    return fig
# average comparison plot: per-model Ko-Bench scores from the GPT-4o and keval judges
def plot_average():
    fig = go.Figure()
    colors = [px.colors.qualitative.Set2, px.colors.qualitative.Pastel2]
    turn_df = df_full_rs
    # gpt-4o
    fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/openai'], mode='lines+markers',
                             name='gpt-4o(Average)',
                             line=dict(color=colors[0][0], dash='dash'),
                             marker=dict(symbol='x', size=10)))
    # keval
    fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/keval'], mode='lines+markers',
                             name='keval(Average)',
                             line=dict(color=colors[0][1]),
                             marker=dict(symbol='circle', size=10)))
    fig.update_layout(
        title='Comparison of OpenAI ko_bench and keval ko_bench (Average)',
        xaxis_title='Model',
        yaxis_title='Score',
        legend_title='Metric',
        hovermode='x unified',
        template='plotly_white'
    )
    fig.update_yaxes(range=[0, 10])
    fig.update_layout(legend_traceorder="reversed")
    return fig
# Gradio UI
with gr.Blocks(css='assets/leaderboard.css') as demo:
    gr.Markdown("")
    gr.Markdown("# Ko-Bench Leaderboard")
    gr.Markdown("")
    gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-turn conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
    gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
    gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
    gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model to grade the model responses.")
    gr.Markdown("")
    gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
    gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
    gr.Markdown("")
    with gr.Row():
        with gr.TabItem("Ko-Bench"):
            gr.Dataframe(value=df_full_rs,
                         datatype=['html' if col == 'model' else 'markdown' for col in df_full_rs.columns])
    with gr.Row():
        with gr.TabItem("Average"):
            gr.Plot(plot_average)
        with gr.TabItem("Openai Judgment"):
            gr.Dataframe(value=df_openai,
                         datatype=['html' if col == 'model' else 'markdown' for col in df_openai.columns])
        with gr.TabItem("Keval Judgment"):
            gr.Dataframe(value=df_keval,
                         datatype=['html' if col == 'model' else 'markdown' for col in df_keval.columns])
        with gr.TabItem("Model Detail View"):
            with gr.Blocks():
                with gr.Row():
                    dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
                with gr.Row():
                    dataframe = gr.Dataframe(label="Model Detail View")
                    dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
                with gr.Row():
                    plot_openai = gr.Plot(label="Openai Plot")
                    dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
                    plot_keval = gr.Plot(label="Keval Plot")
                    dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)

demo.launch(share=True, server_name="0.0.0.0", debug=True)