import gradio as gr
import pandas as pd
import numpy as np
import random
import plotly.graph_objects as go
from bs4 import BeautifulSoup
import plotly.express as px

file_result_score = 'ko_bench.csv'
file_full_lb = 'mt_bench_240805.csv'
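# Input data (inferred from the columns used below):
# - ko_bench.csv: per-model, per-turn Ko-Bench results (model, judge_model, turn, score, and the eight category columns)
# - mt_bench_240805.csv: MT-Bench leaderboard snapshot (Model, key, MT-bench (score), Organization, License, Link)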
def add_hf_link(row):
    # Build a Hugging Face URL for open models; Gemini models link to the Gemini API page instead
    organization, model = row['model'].split('__')
    if organization.lower() not in ['google', 'openai', 'anthropic']:
        row['link'] = f"https://huggingface.co/{organization}/{model}"
    if organization.lower() == 'google' and 'gemini' in model:
        row['link'] = "https://ai.google.dev/gemini-api"
    return row
# read csv
df_result_score = pd.read_csv(file_result_score)
df_full_lb = pd.read_csv(file_full_lb)

# base dataframes
df = df_result_score.copy()
df['model'] = df['model'].str.split('__').str[1]

df_rs = df_result_score.copy()
df_rs['link'] = ''
df_rs = df_rs.apply(add_hf_link, axis=1)
df_rs['organization'] = df_rs['model'].str.split('__').str[0]
df_rs['model'] = df_rs['model'].str.split('__').str[1]

df_full_lboard = df_full_lb.copy()
df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True)  # rename MT-Bench's GPT-4-1106-preview to gpt-4-0125-preview
df_rs.replace("", np.nan, inplace=True)  # normalize empty strings to NaN before aggregation
def custom_mean(series):
    # Aggregate a grouped column: pass 'link'/'organization' through, average everything else numerically
    if series.name == 'link' or series.name == 'organization':
        return series.values[0]
    numeric_series = pd.to_numeric(series, errors='coerce')  # coerce the series to numeric
    return numeric_series.mean() if not numeric_series.isna().all() else np.nan  # average if at least one non-NaN value remains
def get_mt_bench(model):  # look up a model's MT-Bench score, matching model names case-insensitively
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['MT-bench (score)'].values[0]
    return ''
def get_organization(row):  # look up a model's organization (case-insensitive), with manual overrides for models missing from MT-Bench
    model = row['model']
    if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
        return 'Mistral'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'KISTI'
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['Organization'].values[0]
    if row['organization'] != '' and pd.notna(row['organization']):
        organization = row['organization'].lower()
        if organization == 'qwen':
            return 'Alibaba'
        elif organization == 'google':
            return 'Google'
        elif organization == 'lgai-exaone':
            return 'LGAI'
    return row['organization']
def get_license(model):  # look up a model's license (case-insensitive), with manual overrides for models missing from MT-Bench
    if pd.Series(model).str.contains('mistral-large|WizardLM-2-8x22B|ko-gemma-2', case=False, regex=True).any():
        return 'Apache-2.0'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'llama3'
    elif pd.Series(model).str.contains('Ko-Llama-3-8B-Instruct', case=False, regex=True).any():
        return 'Llama Community'
    elif pd.Series(model).str.contains('claude|gemini|EXAONE-3.0-7.8B-Instruct', case=False, regex=True).any():
        return 'Proprietary'
    elif pd.Series(model).str.contains('qwen', case=False, regex=True).any():
        if pd.Series(model).str.contains('max', case=False, regex=True).any():
            return 'Proprietary'
        else:
            return 'Qianwen LICENSE'
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['License'].values[0]
    return ''
def get_link(row):  # fill in the model link from the MT-Bench leaderboard (matched case-insensitively on 'key') when it is not already set
    if row['link'] != '' and pd.notna(row['link']):
        return row
    model_lower = row['model'].lower()
    matching_rows = df_full_lboard[df_full_lboard['key'].str.lower() == model_lower]
    if not matching_rows.empty:
        row['link'] = matching_rows['Link'].values[0]
    return row
def add_link(row):  # wrap the model name in an HTML anchor when a link is available
    if pd.isna(row['link']):
        row['link'] = ''
    if row['link'] != '':
        row['model'] = f"<a href={row['link']}>{row['model']}</a>"
    return row
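# Leaderboard tables built below: df_full_rs feeds the combined Ko-Bench tab, df_openai and df_keval the per-judge tabs,
# and df (per-turn category scores) backs the Model Detail View radar charts.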
# dataframe_full (combined Ko-Bench leaderboard)
df_full_rs = df_rs.copy()
df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
df_full_rs = df_full_rs.drop(columns=['turn'])  # drop 'turn' so the groupby below averages the turn 1 and turn 2 scores per model
df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model', 'judge_model']}).reset_index()
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)
df_full_rs['Ko-Bench/openai'] = ''  # add Ko-Bench/openai and Ko-Bench/keval columns
df_full_rs['Ko-Bench/keval'] = ''
for idx, j_model in df_full_rs['judge_model'].items():
    if j_model == 'keval':
        df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
    else:
        df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
df_full_rs = df_full_rs.drop(columns=['judge_model'])
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index()  # merge the per-judge rows into one row per model
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)
df_full_rs['MT-Bench'] = ''  # add MT-Bench column
df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
df_full_rs['Organization'] = ''  # add Organization column
df_full_rs['Organization'] = df_full_rs.apply(get_organization, axis=1)
df_full_rs['License'] = ''  # add License column
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
plot_models = df_full_rs['model'].unique()  # model list for the Model Detail View tab
df_full_rs = df_full_rs.apply(get_link, axis=1)
df_full_rs = df_full_rs.apply(add_link, axis=1)
df_full_rs = df_full_rs.drop(columns=['Ko-Bench', 'link', 'organization'])
# per-judge dataframes: add the MT-Bench column to df_rs before splitting by judge
df_rs['MT-Bench'] = ''  # add MT-Bench column
df_rs['MT-Bench'] = df_rs['model'].apply(get_mt_bench)
df_rs['MT-Bench'] = df_rs['MT-Bench'].str.replace('-', '', regex=False)
df_rs.replace("", np.nan, inplace=True)  # normalize empty strings to NaN before aggregation
# dataframe_openai (OpenAI-judged leaderboard)
df_openai = df_rs[df_rs['judge_model'] != 'keval'].copy()
df_openai = df_openai.drop(columns=['judge_model', 'turn'])  # drop 'turn' so the groupby below averages the turn 1 and turn 2 scores per model
df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
df_openai = df_openai.round(2)
df_openai = df_openai.apply(get_link, axis=1)
df_openai = df_openai.apply(add_link, axis=1)
df_openai = df_openai.drop(columns=['link', 'organization'])
df_openai = df_openai.sort_values(by='score', ascending=False)
df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
# dataframe_keval (keval-judged leaderboard)
df_keval = df_rs[df_rs['judge_model'] == 'keval'].copy()
df_keval = df_keval.drop(columns=['judge_model', 'turn'])  # drop 'turn' so the groupby below averages the turn 1 and turn 2 scores per model
df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
df_keval = df_keval.round(2)
df_keval = df_keval.apply(get_link, axis=1)
df_keval = df_keval.apply(add_link, axis=1)
df_keval = df_keval.drop(columns=['link', 'organization'])
df_keval = df_keval.sort_values(by='score', ascending=False)
df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
# model detail view
plot_models_list = plot_models.tolist()
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
colors_openai = ['#ff0000', '#ff1493', '#115e02', '#21ad05']
colors_keval = ['#ff0000', '#ff1493', '#0000ff', '#0592eb']
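# Trace colors for the radar charts below: red/pink for the top-ranked model's turn 1/2,
# green shades (OpenAI judge) or blue shades (keval judge) for the selected model's turn 1/2.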
random.seed(42)
def search_dataframe(query):  # search df for rows containing the query
    if not query:
        return df  # return the full DataFrame when there is no query
    filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)]
    return filtered_df
def radar_chart(categories, Top1_turn1, Top1_turn2, Selected_model_turn1, Selected_model_turn2, category_labels, judge):  # draw a radar chart for the given per-category scores
    #categories = categories.split(',')
    # flatten the single-row lists returned by df.loc[...].values.tolist()
    Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
    Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]
    Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
    Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
    values_lists = [
        list(map(float, Top1_turn1)),
        list(map(float, Top1_turn2)),
        list(map(float, Selected_model_turn1)),
        list(map(float, Selected_model_turn2))
    ]
    if judge == "openai":
        colors = colors_openai
        title_text = "< Openai >"
    else:
        colors = colors_keval
        title_text = "< Keval >"
    fig = go.Figure()
    for i, values in enumerate(values_lists):
        if len(categories) != len(values):
            return f"Error in dataset {i+1}: Number of categories and values must be the same."
        fig.add_trace(go.Scatterpolar(
            r=values + [values[0]],              # close the loop of the radar chart
            theta=categories + [categories[0]],  # close the loop of the radar chart
            mode='lines',
            name=category_labels[i],             # label for the dataset
            line=dict(color=colors[i])
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, max(max(values) for values in values_lists)],
                showline=True,
            ),
            angularaxis=dict(
                rotation=0,
                direction='clockwise'
            )
        ),
        showlegend=True,
        #width=650,   # plot width
        #height=650,  # plot height
        margin=dict(l=1000, r=20, t=20, b=20),
        #autosize=False,
        paper_bgcolor='white',
        plot_bgcolor='lightgrey',
        title=dict(
            text=title_text,   # chart title
            x=0.5,             # title x position (0=left, 0.5=center, 1=right)
            xanchor='center',  # title x anchor (center, left, right)
            y=0.95,            # title y position (0=bottom, 1=top)
            yanchor='top'      # title y anchor (top, middle, bottom)
        )
    )
    return fig
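# The two helpers below compare the selected model against each judge's top-ranked model, turn by turn.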
def search_openai_plot(dropdown_model):  # radar chart for the OpenAI judge
    openai_top_model = df_openai.iat[0, df_openai.columns.get_loc('model')]
    openai_top_model = BeautifulSoup(openai_top_model, 'html.parser').get_text()  # strip the HTML anchor from the model name
    condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == openai_top_model)
    top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
    condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == openai_top_model)
    top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
    condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    openai_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
    condition4 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
    category_labels = []
    category_labels.append(openai_top_model + " /Turn 1")
    category_labels.append(openai_top_model + " /Turn 2")
    category_labels.append(dropdown_model + " /Turn 1")
    category_labels.append(dropdown_model + " /Turn 2")
    fig = radar_chart(CATEGORIES, top1_openai_turn1, top1_openai_turn2, openai_turn1, openai_turn2, category_labels, "openai")
    return fig
def search_keval_plot(dropdown_model):  # radar chart for the keval judge
    keval_top_model = df_keval.iat[0, df_keval.columns.get_loc('model')]
    keval_top_model = BeautifulSoup(keval_top_model, 'html.parser').get_text()  # strip the HTML anchor from the model name
    condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == keval_top_model)
    top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
    condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == keval_top_model)
    top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
    condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    keval_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
    condition4 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
    category_labels = []
    category_labels.append(keval_top_model + " /Turn 1")
    category_labels.append(keval_top_model + " /Turn 2")
    category_labels.append(dropdown_model + " /Turn 1")
    category_labels.append(dropdown_model + " /Turn 2")
    fig = radar_chart(CATEGORIES, top1_keval_turn1, top1_keval_turn2, keval_turn1, keval_turn2, category_labels, "keval")
    return fig
# average comparison plot: per-model Ko-Bench scores from the GPT-4o and keval judges
def plot_average():
    fig = go.Figure()
    colors = [px.colors.qualitative.Set2, px.colors.qualitative.Pastel2]
    turn_df = df_full_rs
    # gpt-4o
    fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/openai'], mode='lines+markers',
                             name='gpt-4o(Average)',
                             line=dict(color=colors[0][0], dash='dash'),
                             marker=dict(symbol='x', size=10)))
    # keval
    fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/keval'], mode='lines+markers',
                             name='keval(Average)',
                             line=dict(color=colors[0][1]),
                             marker=dict(symbol='circle', size=10)))
    fig.update_layout(
        title='Comparison of OpenAI ko_bench and keval ko_bench (Average)',
        xaxis_title='Model',
        yaxis_title='Score',
        legend_title='Metric',
        hovermode='x unified',
        template='plotly_white'
    )
    fig.update_yaxes(range=[0, 10])
    fig.update_layout(legend_traceorder="reversed")
    return fig
# Gradio UI
with gr.Blocks(css='assets/leaderboard.css') as demo:
    gr.Markdown("")
    gr.Markdown("# Ko-Bench Leaderboard")
    gr.Markdown("")
    gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-turn conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
    gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
    gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
    gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model to grade the model responses.")
    gr.Markdown("")
    gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
    gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
    gr.Markdown("")
    with gr.Row():
        with gr.TabItem("Ko-Bench"):
            gr.Dataframe(value=df_full_rs,
                         datatype=['html' if col == 'model' else 'markdown' for col in df_full_rs.columns])
    with gr.Row():
        with gr.TabItem("Average"):
            gr.Plot(plot_average)
        with gr.TabItem("Openai Judgment"):
            gr.Dataframe(value=df_openai,
                         datatype=['html' if col == 'model' else 'markdown' for col in df_openai.columns])
        with gr.TabItem("Keval Judgment"):
            gr.Dataframe(value=df_keval,
                         datatype=['html' if col == 'model' else 'markdown' for col in df_keval.columns])
        with gr.TabItem("Model Detail View"):
            with gr.Blocks():
                with gr.Row():
                    dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
                with gr.Row():
                    dataframe = gr.Dataframe(label="Model Detail View")
                    dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
                with gr.Row():
                    plot_openai = gr.Plot(label="Openai Plot")
                    dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
                    plot_keval = gr.Plot(label="Keval Plot")
                    dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)

demo.launch(share=True, server_name="0.0.0.0", debug=True)