davidkim205 committed on
Commit
174062d
•
1 Parent(s): 731e515

add claude-3-5 results

Browse files
Files changed (2) hide show
  1. app.py +157 -48
  2. ko_bench.csv +92 -88
app.py CHANGED
@@ -3,27 +3,43 @@ import pandas as pd
3
  import numpy as np
4
  import random
5
  import plotly.graph_objects as go
 
 
6
 
7
  file_result_score = 'ko_bench.csv'
8
 
9
  file_full_lb = 'mt_bench_240805.csv'
10
 
11
 
 
 
 
 
 
 
 
 
12
  # read csv
13
  df_result_score = pd.read_csv(file_result_score)
14
  df_full_lb = pd.read_csv(file_full_lb)
15
 
16
-
17
  # dataframe
18
  df = pd.DataFrame(df_result_score)
 
 
19
  df_rs = pd.DataFrame(df_result_score)
 
 
 
 
20
  df_full_lboard = pd.DataFrame(df_full_lb)
21
 
22
  df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True) # MT-bench์˜ GPT-4-1106-preview ๋ฅผ gpt-4-0125-preview๋กœ ๋ณ€๊ฒฝ
23
- models = df_full_lboard['Model'].unique() # ์—ด ์ถ”๊ฐ€๋ฅผ ์œ„ํ•œ models ๋ฆฌ์ŠคํŠธ
24
  df_rs.replace("", np.nan, inplace=True) # ๋ชจ๋ธ๋ณ„ turn1,2 score ํ•ฉ๋ณ‘
25
 
26
  def custom_mean(series):
 
 
27
  numeric_series = pd.to_numeric(series, errors='coerce') # ์‹œ๋ฆฌ์ฆˆ๋ฅผ ์ˆซ์ž๋กœ ๋ณ€ํ™˜
28
  return numeric_series.mean() if not numeric_series.isna().all() else np.nan # NaN์ด ์•„๋‹Œ ๊ฐ’์ด ํ•˜๋‚˜๋ผ๋„ ์žˆ์œผ๋ฉด ํ‰๊ท  ๊ณ„์‚ฐ
29
 
@@ -34,7 +50,8 @@ def get_mt_bench(model): # 대소문자 무시하고 모델을 매칭하기 위
34
  return matching_rows['MT-bench (score)'].values[0]
35
  return ''
36
 
37
- def get_organization(model): # 대소문자 무시하고 모델을 매칭하기 위한 함수 정의
 
38
  if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
39
  return 'Mistral'
40
  elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
@@ -44,13 +61,32 @@ def get_organization(model): # 대소문자 무시하고 모델을 매칭하기
44
  matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
45
  if not matching_rows.empty:
46
  return matching_rows['Organization'].values[0]
47
- return ''
 
 
 
 
 
 
 
 
 
 
48
 
49
  def get_license(model): # 대소문자 무시하고 모델을 매칭하기 위한 함수 정의
50
- if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
51
  return 'Apache-2.0'
52
  elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
53
  return 'llama3'
 
 
 
 
 
 
 
 
 
54
 
55
  model_lower = model.lower()
56
  matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
@@ -58,10 +94,26 @@ def get_license(model): # 대소문자 무시하고 모델을 매칭하기 위
58
  return matching_rows['License'].values[0]
59
  return ''
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  # dataframe_full
63
  df_full_rs = df_rs.copy()
64
- df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
65
  df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
66
 
67
  df_full_rs = df_full_rs.drop(columns=['turn']) # 모델별 turn1,2 score 합병
@@ -69,16 +121,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean
69
  df_full_rs = df_full_rs.round(2)
70
  df_full_rs.replace("", np.nan, inplace=True)
71
 
72
- df_full_rs['KO-Bench/openai'] = '' # KO-Bench/openai, KO-Bench/keval 열 추가
73
- df_full_rs['KO-Bench/keval'] = ''
74
  for idx, j_model in df_full_rs['judge_model'].items():
75
  if j_model == 'keval':
76
- df_full_rs.at[idx, 'KO-Bench/keval'] = df_full_rs.at[idx, 'KO-Bench']
77
  else :
78
- df_full_rs.at[idx, 'KO-Bench/openai'] = df_full_rs.at[idx, 'KO-Bench']
79
  df_full_rs = df_full_rs.drop(columns=['judge_model'])
80
 
81
- df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # KO-Bench/openai, KO-Bench/keval 행 합병
82
  df_full_rs = df_full_rs.round(2)
83
  df_full_rs.replace("", np.nan, inplace=True)
84
 
@@ -87,17 +139,20 @@ df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
87
  df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
88
 
89
  df_full_rs['Organization'] = '' # Organization 열 추가
90
- df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
91
 
92
  df_full_rs['License'] = '' # License 열 추가
93
  df_full_rs['License'] = df_full_rs['model'].apply(get_license)
94
 
95
- df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
96
  df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
97
- df_full_rs = df_full_rs.drop(columns=['KO-Bench'])
98
 
99
  plot_models = df_full_rs['model'].unique() # model detail view๋ฅผ ์œ„ํ•œ models ๋ฆฌ์ŠคํŠธ
100
 
 
 
 
 
101
 
102
  # dataframe
103
  df_rs['MT-Bench'] = '' # MT-Bench 열 추가
@@ -115,6 +170,10 @@ df_openai = df_openai.drop(columns=['judge_model', 'turn']) # 모델별 turn1,2
115
  df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
116
  df_openai = df_openai.round(2)
117
 
 
 
 
 
118
  df_openai = df_openai.sort_values(by='score', ascending=False)
119
  df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
120
 
@@ -127,6 +186,10 @@ df_keval = df_keval.drop(columns=['judge_model', 'turn']) # 모델별 turn1,2 sc
127
  df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
128
  df_keval = df_keval.round(2)
129
 
 
 
 
 
130
  df_keval = df_keval.sort_values(by='score', ascending=False)
131
  df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
132
 
@@ -206,10 +269,13 @@ def radar_chart(categories, Top1_turn1, Top1_turn2, Selected_model_turn1, Select
206
  return fig
207
 
208
  def search_openai_plot(dropdown_model): # openai plot 함수 정의
209
- condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == df_openai.iat[0, df_openai.columns.get_loc('model')])
 
 
 
210
  top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
211
 
212
- condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == df_openai.iat[0, df_openai.columns.get_loc('model')])
213
  top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
214
 
215
  condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
@@ -219,8 +285,8 @@ def search_openai_plot(dropdown_model): # openai plot 함수 정의
219
  openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
220
 
221
  category_labels = []
222
- category_labels.append(df_openai.iat[0, df_openai.columns.get_loc('model')] + " /Turn 1")
223
- category_labels.append(df_openai.iat[0, df_openai.columns.get_loc('model')] + " /Turn 2")
224
  category_labels.append(dropdown_model + " /Turn 1")
225
  category_labels.append(dropdown_model + " /Turn 2")
226
 
@@ -228,10 +294,13 @@ def search_openai_plot(dropdown_model): # openai plot 함수 정의
228
  return fig
229
 
230
  def search_keval_plot(dropdown_model): # keval plot 함수 정의
231
- condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == df_keval.iat[0, df_keval.columns.get_loc('model')])
 
 
 
232
  top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
233
 
234
- condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == df_keval.iat[0, df_keval.columns.get_loc('model')])
235
  top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
236
 
237
  condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
@@ -241,8 +310,8 @@ def search_keval_plot(dropdown_model): # keval plot 함수 정의
241
  keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
242
 
243
  category_labels = []
244
- category_labels.append(df_keval.iat[0, df_keval.columns.get_loc('model')] + " /Turn 1")
245
- category_labels.append(df_keval.iat[0, df_keval.columns.get_loc('model')] + " /Turn 2")
246
  category_labels.append(dropdown_model + " /Turn 1")
247
  category_labels.append(dropdown_model + " /Turn 2")
248
 
@@ -250,37 +319,77 @@ def search_keval_plot(dropdown_model): # keval plot 함수 정의
250
  return fig
251
 
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  #gradio
254
- with gr.Blocks() as demo:
255
  gr.Markdown("")
256
- gr.Markdown("# 🏆 KO-Bench Leaderboard")
257
  gr.Markdown("")
258
- gr.Markdown("")
259
- gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
260
  gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
261
- gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
262
- gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
263
- gr.Markdown("")
264
  gr.Markdown("")
 
 
265
  gr.Markdown("")
266
 
267
- with gr.TabItem("KO-Bench"):
268
- gr.Dataframe(value=df_full_rs)
269
- with gr.TabItem("Openai Judgment"):
270
- gr.Dataframe(value=df_openai)
271
- with gr.TabItem("Keval Judgment"):
272
- gr.Dataframe(value=df_keval)
273
- with gr.TabItem("Model Detail View"):
274
- with gr.Blocks():
275
  with gr.Row():
276
- dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
277
- with gr.Row():
278
- dataframe = gr.Dataframe(label="Model Detail View")
279
- dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
280
- with gr.Row():
281
- plot_openai = gr.Plot(label="Openai Plot")
282
- dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
283
- plot_keval = gr.Plot(label="Keval Plot")
284
- dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
285
-
286
- demo.launch(share=True, server_name="0.0.0.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import numpy as np
4
  import random
5
  import plotly.graph_objects as go
6
+ from bs4 import BeautifulSoup
7
+ import plotly.express as px
8
 
9
  file_result_score = 'ko_bench.csv'
10
 
11
  file_full_lb = 'mt_bench_240805.csv'
12
 
13
 
14
+ def add_hf_link(row):
15
+ organization, model = row['model'].split('__')
16
+ if organization.lower() not in ['google', 'openai', 'anthropic']:
17
+ row['link'] = f"https://huggingface.co/{organization}/{model}"
18
+ if organization.lower() == 'google' and 'gemini' in model:
19
+ row['link'] = "https://ai.google.dev/gemini-api"
20
+ return row
21
+
22
  # read csv
23
  df_result_score = pd.read_csv(file_result_score)
24
  df_full_lb = pd.read_csv(file_full_lb)
25
 
 
26
  # dataframe
27
  df = pd.DataFrame(df_result_score)
28
+ df['model'] = df['model'].str.split('__').str[1]
29
+
30
  df_rs = pd.DataFrame(df_result_score)
31
+ df_rs['link'] = ''
32
+ df_rs = df_rs.apply(add_hf_link, axis=1)
33
+ df_rs['organization'] = df_rs['model'].str.split('__').str[0]
34
+ df_rs['model'] = df_rs['model'].str.split('__').str[1]
35
  df_full_lboard = pd.DataFrame(df_full_lb)
36
 
37
  df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True) # MT-bench์˜ GPT-4-1106-preview ๋ฅผ gpt-4-0125-preview๋กœ ๋ณ€๊ฒฝ
 
38
  df_rs.replace("", np.nan, inplace=True) # 모델별 turn1,2 score 합병
39
 
40
  def custom_mean(series):
41
+ if series.name == 'link' or series.name == 'organization':
42
+ return series.values[0]
43
  numeric_series = pd.to_numeric(series, errors='coerce') # ์‹œ๋ฆฌ์ฆˆ๋ฅผ ์ˆซ์ž๋กœ ๋ณ€ํ™˜
44
  return numeric_series.mean() if not numeric_series.isna().all() else np.nan # NaN์ด ์•„๋‹Œ ๊ฐ’์ด ํ•˜๋‚˜๋ผ๋„ ์žˆ์œผ๋ฉด ํ‰๊ท  ๊ณ„์‚ฐ
45
 
 
50
  return matching_rows['MT-bench (score)'].values[0]
51
  return ''
52
 
53
+ def get_organization(row): # 대소문자 무시하고 모델을 매칭하기 위한 함수 정의
54
+ model = row['model']
55
  if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
56
  return 'Mistral'
57
  elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
 
61
  matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
62
  if not matching_rows.empty:
63
  return matching_rows['Organization'].values[0]
64
+
65
+ if row['organization'] != '' and pd.notna(row['organization']):
66
+ organization = row['organization'].lower()
67
+ if organization == 'qwen':
68
+ return 'Alibaba'
69
+ elif organization == 'google':
70
+ return 'Google'
71
+ elif organization == 'lgai-exaone':
72
+ return 'LGAI'
73
+
74
+ return row['organization']
75
 
76
  def get_license(model): # 대소문자 무시하고 모델을 매칭하기 위한 함수 정의
77
+ if pd.Series(model).str.contains('mistral-large|WizardLM-2-8x22B|ko-gemma-2', case=False, regex=True).any():
78
  return 'Apache-2.0'
79
  elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
80
  return 'llama3'
81
+ elif pd.Series(model).str.contains('Ko-Llama-3-8B-Instruct', case=False, regex=True).any():
82
+ return 'Llama Community'
83
+ elif pd.Series(model).str.contains('claude|gemini|EXAONE-3.0-7.8B-Instruct', case=False, regex=True).any():
84
+ return 'Proprietary'
85
+ elif pd.Series(model).str.contains('qwen', case=False, regex=True).any():
86
+ if pd.Series(model).str.contains('max', case=False, regex=True).any():
87
+ return 'Proprietary'
88
+ else:
89
+ return 'Qianwen LICENSE'
90
 
91
  model_lower = model.lower()
92
  matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
 
94
  return matching_rows['License'].values[0]
95
  return ''
96
 
97
+ def get_link(row): # 대소문자 무시하고 모델을 매칭하기 위한 함수 정의
98
+ if row['link'] != '' and pd.notna(row['link']):
99
+ return row
100
+
101
+ model_lower = row['model'].lower()
102
+ matching_rows = df_full_lboard[df_full_lboard['key'].str.lower() == model_lower]
103
+ if not matching_rows.empty:
104
+ row['link'] = matching_rows['Link'].values[0]
105
+ return row
106
+
107
+ def add_link(row):
108
+ if pd.isna(row['link']):
109
+ row['link'] = ''
110
+ if row['link'] != '':
111
+ row['model'] = f"<a href={row['link']}>{row['model']}</a>"
112
+ return row
113
 
114
  # dataframe_full
115
  df_full_rs = df_rs.copy()
116
+ df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
117
  df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
118
 
119
  df_full_rs = df_full_rs.drop(columns=['turn']) # 모델별 turn1,2 score 합병
 
121
  df_full_rs = df_full_rs.round(2)
122
  df_full_rs.replace("", np.nan, inplace=True)
123
 
124
+ df_full_rs['Ko-Bench/openai'] = '' # Ko-Bench/openai, Ko-Bench/keval 열 추가
125
+ df_full_rs['Ko-Bench/keval'] = ''
126
  for idx, j_model in df_full_rs['judge_model'].items():
127
  if j_model == 'keval':
128
+ df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
129
  else :
130
+ df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
131
  df_full_rs = df_full_rs.drop(columns=['judge_model'])
132
 
133
+ df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # Ko-Bench/openai, Ko-Bench/keval 행 합병
134
  df_full_rs = df_full_rs.round(2)
135
  df_full_rs.replace("", np.nan, inplace=True)
136
 
 
139
  df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
140
 
141
  df_full_rs['Organization'] = '' # Organization 열 추가
142
+ df_full_rs['Organization'] = df_full_rs.apply(get_organization, axis=1 )
143
 
144
  df_full_rs['License'] = '' # License 열 추가
145
  df_full_rs['License'] = df_full_rs['model'].apply(get_license)
146
 
147
+ df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
148
  df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
 
149
 
150
  plot_models = df_full_rs['model'].unique() # model detail view๋ฅผ ์œ„ํ•œ models ๋ฆฌ์ŠคํŠธ
151
 
152
+ df_full_rs = df_full_rs.apply(get_link, axis=1)
153
+ df_full_rs = df_full_rs.apply(add_link, axis=1)
154
+
155
+ df_full_rs = df_full_rs.drop(columns=['Ko-Bench', 'link', 'organization'])
156
 
157
  # dataframe
158
  df_rs['MT-Bench'] = '' # MT-Bench 열 추가
 
170
  df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
171
  df_openai = df_openai.round(2)
172
 
173
+ df_openai = df_openai.apply(get_link, axis=1)
174
+ df_openai = df_openai.apply(add_link, axis=1)
175
+ df_openai = df_openai.drop(columns=['link', 'organization'])
176
+
177
  df_openai = df_openai.sort_values(by='score', ascending=False)
178
  df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
179
 
 
186
  df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
187
  df_keval = df_keval.round(2)
188
 
189
+ df_keval = df_keval.apply(get_link, axis=1)
190
+ df_keval = df_keval.apply(add_link, axis=1)
191
+ df_keval = df_keval.drop(columns=['link', 'organization'])
192
+
193
  df_keval = df_keval.sort_values(by='score', ascending=False)
194
  df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
195
 
 
269
  return fig
270
 
271
  def search_openai_plot(dropdown_model): # openai plot 함수 정의
272
+ openai_top_model = df_openai.iat[0, df_openai.columns.get_loc('model')]
273
+ openai_top_model = BeautifulSoup(openai_top_model, 'html.parser').get_text()
274
+
275
+ condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == openai_top_model)
276
  top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
277
 
278
+ condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == openai_top_model)
279
  top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
280
 
281
  condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
 
285
  openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
286
 
287
  category_labels = []
288
+ category_labels.append(openai_top_model + " /Turn 1")
289
+ category_labels.append(openai_top_model + " /Turn 2")
290
  category_labels.append(dropdown_model + " /Turn 1")
291
  category_labels.append(dropdown_model + " /Turn 2")
292
 
 
294
  return fig
295
 
296
  def search_keval_plot(dropdown_model): # keval plot 함수 정의
297
+ keval_top_model = df_keval.iat[0, df_keval.columns.get_loc('model')]
298
+ keval_top_model = BeautifulSoup(keval_top_model, 'html.parser').get_text()
299
+
300
+ condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == keval_top_model)
301
  top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
302
 
303
+ condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == keval_top_model)
304
  top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
305
 
306
  condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
 
310
  keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
311
 
312
  category_labels = []
313
+ category_labels.append(keval_top_model + " /Turn 1")
314
+ category_labels.append(keval_top_model + " /Turn 2")
315
  category_labels.append(dropdown_model + " /Turn 1")
316
  category_labels.append(dropdown_model + " /Turn 2")
317
 
 
319
  return fig
320
 
321
 
322
+ # average
323
+ def plot_average():
324
+ fig = go.Figure()
325
+ colors = [px.colors.qualitative.Set2, px.colors.qualitative.Pastel2]
326
+ turn_df = df_full_rs
327
+
328
+ # gpt-4o
329
+ fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/openai'], mode='lines+markers',
330
+ name=f'gpt-4o(Average)',
331
+ line=dict(color=colors[0][0], dash='dash'),
332
+ marker=dict(symbol='x', size=10)))
333
+
334
+ # keval
335
+ fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/keval'], mode='lines+markers',
336
+ name=f'keval(Average)',
337
+ line=dict(color=colors[0][1]),
338
+ marker=dict(symbol='circle', size=10)))
339
+
340
+ fig.update_layout(
341
+ title=f'Comparison of OpenAI ko_bench and keval ko_bench (Average)',
342
+ xaxis_title='Model',
343
+ yaxis_title='Score',
344
+ legend_title='Metric',
345
+ hovermode='x unified',
346
+ template='plotly_white'
347
+ )
348
+ fig.update_yaxes(range=[0, 10])
349
+ fig.update_layout(legend_traceorder="reversed")
350
+ return fig
351
+
352
+
353
  #gradio
354
+ with gr.Blocks(css='assets/leaderboard.css') as demo:
355
  gr.Markdown("")
356
+ gr.Markdown("# 🏆 Ko-Bench Leaderboard")
357
  gr.Markdown("")
358
+ gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
 
359
  gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
360
+ gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
361
+ gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
 
362
  gr.Markdown("")
363
+ gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
364
+ gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
365
  gr.Markdown("")
366
 
367
+ with gr.Row():
368
+ with gr.TabItem("Ko-Bench"):
369
+ gr.Dataframe(value=df_full_rs,
370
+ datatype=['html' if col == 'model' else 'markdown' for col in df_full_rs.columns])
 
 
 
 
371
  with gr.Row():
372
+ with gr.TabItem("Average"):
373
+ gr.Plot(plot_average)
374
+ with gr.TabItem("Openai Judgment"):
375
+ gr.Dataframe(value=df_openai,
376
+ datatype=['html' if col == 'model' else 'markdown' for col in df_openai.columns])
377
+ with gr.TabItem("Keval Judgment"):
378
+ gr.Dataframe(value=df_keval,
379
+ datatype=['html' if col == 'model' else 'markdown' for col in df_keval.columns])
380
+ with gr.TabItem("Model Detail View"):
381
+ with gr.Blocks():
382
+ with gr.Row():
383
+ dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
384
+ with gr.Row():
385
+ dataframe = gr.Dataframe(label="Model Detail View")
386
+ dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
387
+ with gr.Row():
388
+ plot_openai = gr.Plot(label="Openai Plot")
389
+ dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
390
+ plot_keval = gr.Plot(label="Keval Plot")
391
+ dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
392
+
393
+
394
+
395
+ demo.launch(share=True, server_name="0.0.0.0", debug=True)
ko_bench.csv CHANGED
@@ -1,89 +1,93 @@
1
  judge_model,turn,model,score,Coding,Extraction,Humanities,Math,Reasoning,Roleplay,STEM,Writing
2
- gpt-4o,1,GPT-4o-2024-05-13,9.4,8.7,9.6,9.6,9.9,9.0,9.2,9.7,9.3
3
- gpt-4o,1,gpt-4-0125-preview,8.9,7.7,9.8,9.1,9.7,7.8,9.2,8.7,9.4
4
- gpt-4o,1,GPT-4o-mini-2024-07-18,8.8,7.3,9.2,9.4,10.0,6.9,8.7,9.6,9.1
5
- gpt-4o,1,claude-3-5-sonnet-20240620,8.6,8.1,9.7,9.3,8.7,5.8,8.2,9.4,9.5
6
- gpt-4o,1,Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
7
- gpt-4o,1,Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
8
- gpt-4o,1,gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
9
- gpt-4o,1,gemini-1.5-pro,8.2,5.5,9.7,8.7,7.5,6.5,9.1,9.4,9.2
10
- gpt-4o,1,ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
11
- gpt-4o,1,gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
12
- gpt-4o,1,WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
13
- gpt-4o,1,gpt-3.5-turbo-0125,6.7,5.2,9.0,7.7,6.4,3.3,7.2,6.5,8.6
14
- gpt-4o,1,Meta-Llama-3.1-70B-Instruct,6.6,6.4,8.7,8.0,4.5,4.0,7.9,7.4,5.9
15
- gpt-4o,1,Qwen2-7B-Instruct,6.5,3.9,9.0,8.0,5.6,3.6,7.0,6.6,8.2
16
- gpt-4o,1,EXAONE-3.0-7.8B-Instruct,6.2,4.9,7.4,7.1,7.3,5.1,6.4,4.1,7.6
17
- gpt-4o,1,Qwen1.5-32B-Chat,6.1,4.0,8.6,8.5,4.7,2.6,6.3,7.5,6.7
18
- gpt-4o,1,KONI-Llama3-8B-Instruct-20240729,5.8,3.5,5.0,8.5,5.4,3.2,5.4,7.5,7.6
19
- gpt-4o,1,Ko-Llama-3-8B-Instruct,5.7,4.6,7.0,7.7,2.8,2.5,6.2,6.9,7.6
20
- gpt-4o,1,Meta-Llama-3.1-8B-Instruct,5.4,4.6,7.4,6.3,5.2,3.3,5.2,5.4,6.0
21
- gpt-4o,1,Qwen1.5-14B-Chat,5.4,3.3,7.2,6.8,4.2,2.0,5.7,6.7,7.2
22
- gpt-4o,1,WizardLM-13B-V1.2,4.8,3.4,8.2,6.1,2.2,3.4,5.0,4.3,6.1
23
- gpt-4o,1,Mistral-7B-Instruct-v0.2,2.6,3.0,3.7,2.0,1.7,1.3,4.5,1.4,3.1
24
- gpt-4o,2,GPT-4o-2024-05-13,8.3,7.9,8.9,9.2,8.1,7.0,8.9,8.7,7.5
25
- gpt-4o,2,gpt-4-0125-preview,8.0,7.2,8.5,8.9,6.8,7.3,8.7,8.1,8.6
26
- gpt-4o,2,GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
27
- gpt-4o,2,Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
28
- gpt-4o,2,gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
29
- gpt-4o,2,gemini-1.5-pro,7.0,6.3,7.7,8.3,6.1,5.0,8.5,7.8,6.5
30
- gpt-4o,2,claude-3-5-sonnet-20240620,6.9,6.0,9.0,7.3,6.2,5.8,7.3,6.5,7.5
31
- gpt-4o,2,Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
32
- gpt-4o,2,ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
33
- gpt-4o,2,WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
34
- gpt-4o,2,gemma-2-9b-it,6.2,4.8,7.6,8.3,4.9,3.9,7.0,7.4,6.1
35
- gpt-4o,2,Qwen1.5-32B-Chat,5.8,4.3,8.2,7.6,3.8,3.0,6.8,5.9,6.9
36
- gpt-4o,2,Meta-Llama-3.1-70B-Instruct,5.7,5.5,8.0,7.4,3.6,2.9,6.6,5.7,5.7
37
- gpt-4o,2,EXAONE-3.0-7.8B-Instruct,5.6,5.8,6.2,5.5,7.0,4.0,5.7,4.3,6.5
38
- gpt-4o,2,gpt-3.5-turbo-0125,5.4,5.8,5.7,7.2,4.4,3.0,6.6,4.4,6.4
39
- gpt-4o,2,Qwen2-7B-Instruct,5.3,5.0,7.0,6.6,5.1,2.7,5.6,4.8,5.9
40
- gpt-4o,2,Qwen1.5-14B-Chat,4.9,3.5,5.1,7.4,4.1,2.7,5.9,5.0,5.9
41
- gpt-4o,2,KONI-Llama3-8B-Instruct-20240729,4.5,3.3,3.8,7.6,4.9,2.1,5.6,5.7,3.3
42
- gpt-4o,2,Mistral-7B-Instruct-v0.2,4.5,3.9,4.4,6.8,2.2,2.4,6.2,5.6,4.6
43
- gpt-4o,2,Ko-Llama-3-8B-Instruct,4.0,3.7,4.3,6.4,2.8,2.3,4.9,4.0,4.1
44
- gpt-4o,2,Meta-Llama-3.1-8B-Instruct,3.9,4.1,5.0,4.8,3.8,2.1,4.0,3.5,3.6
45
- gpt-4o,2,WizardLM-13B-V1.2,3.0,2.6,3.5,3.6,1.8,2.3,3.7,3.3,2.8
46
- keval,1,GPT-4o-2024-05-13,9.1,7.8,9.5,9.6,9.9,8.8,8.7,9.3,9.2
47
- keval,1,gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
48
- keval,1,GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
49
- keval,1,claude-3-5-sonnet-20240620,8.4,8.1,9.8,8.7,8.3,5.8,7.9,9.2,9.0
50
- keval,1,Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
51
- keval,1,gemini-1.5-pro,8.2,5.7,9.8,8.8,7.4,6.2,9.1,9.7,9.0
52
- keval,1,gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
53
- keval,1,Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
54
- keval,1,ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
55
- keval,1,gemma-2-9b-it,7.6,6.7,8.8,8.5,5.2,5.5,9.0,8.6,8.5
56
- keval,1,Meta-Llama-3.1-70B-Instruct,7.3,6.8,9.0,8.3,5.9,5.1,8.4,8.0,7.1
57
- keval,1,Qwen1.5-14B-Chat,7.2,4.7,9.7,8.8,4.5,4.8,8.1,8.9,8.4
58
- keval,1,EXAONE-3.0-7.8B-Instruct,7.2,5.7,8.8,8.1,8.2,6.0,7.7,5.6,7.3
59
- keval,1,WizardLM-2-8x22B,7.1,6.1,5.6,7.9,8.8,5.9,6.5,8.7,7.1
60
- keval,1,Qwen1.5-32B-Chat,7.0,3.9,9.9,8.9,5.8,3.6,7.1,8.6,7.9
61
- keval,1,gpt-3.5-turbo-0125,6.9,5.6,8.9,7.7,6.4,3.2,7.4,7.5,8.6
62
- keval,1,KONI-Llama3-8B-Instruct-20240729,6.8,3.4,8.6,8.5,5.5,4.1,6.9,8.8,8.4
63
- keval,1,Qwen2-7B-Instruct,6.4,3.6,9.0,7.7,5.5,3.5,7.1,6.7,8.4
64
- keval,1,Meta-Llama-3.1-8B-Instruct,6.3,4.3,8.9,7.7,5.3,3.3,7.3,6.0,7.5
65
- keval,1,Ko-Llama-3-8B-Instruct,6.0,5.0,7.4,7.6,2.9,2.9,7.0,8.0,7.6
66
- keval,1,WizardLM-13B-V1.2,6.0,3.7,9.3,7.7,2.4,3.8,7.0,6.6,7.7
67
- keval,1,Mistral-7B-Instruct-v0.2,3.0,3.0,6.7,3.0,2.0,2.0,3.3,1.9,2.4
68
- keval,2,GPT-4o-2024-05-13,8.1,7.7,8.9,9.2,7.8,6.9,8.4,8.7,7.4
69
- keval,2,gpt-4-0125-preview,7.7,6.3,8.4,8.8,6.9,6.3,8.6,8.6,8.0
70
- keval,2,GPT-4o-mini-2024-07-18,7.4,6.8,7.6,8.7,7.7,4.3,7.8,8.4,7.8
71
- keval,2,Mistral-Large-Instruct-2407,7.0,5.4,7.3,8.5,7.3,5.2,7.9,7.8,6.9
72
- keval,2,Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
73
- keval,2,gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
74
- keval,2,claude-3-5-sonnet-20240620,6.8,6.2,8.4,7.8,5.4,5.1,7.0,7.3,7.5
75
- keval,2,WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
76
- keval,2,gemini-1.5-pro,6.5,5.2,6.9,8.4,6.0,4.8,8.1,7.3,5.4
77
- keval,2,ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
78
- keval,2,gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
79
- keval,2,EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
80
- keval,2,Qwen1.5-32B-Chat,6.2,5.2,7.7,8.0,4.1,4.0,7.7,6.7,6.5
81
- keval,2,Qwen1.5-14B-Chat,6.0,4.7,6.9,7.9,4.8,3.8,7.2,6.3,6.7
82
- keval,2,Meta-Llama-3.1-70B-Instruct,6.0,6.0,7.3,7.6,5.6,2.9,7.0,6.2,5.6
83
- keval,2,Qwen2-7B-Instruct,5.6,4.9,7.0,6.5,5.1,3.1,6.3,5.0,6.5
84
- keval,2,KONI-Llama3-8B-Instruct-20240729,5.5,4.6,4.9,6.7,5.9,3.2,6.9,6.8,5.2
85
- keval,2,gpt-3.5-turbo-0125,5.3,6.2,5.5,7.0,4.5,3.3,6.2,4.5,5.4
86
- keval,2,Meta-Llama-3.1-8B-Instruct,4.8,5.0,6.0,5.5,4.4,2.6,5.9,5.0,4.4
87
- keval,2,Ko-Llama-3-8B-Instruct,4.2,3.6,4.6,6.3,2.8,2.2,6.1,3.7,4.3
88
- keval,2,WizardLM-13B-V1.2,4.1,3.7,5.4,5.8,2.8,3.0,5.6,3.3,3.4
89
- keval,2,Mistral-7B-Instruct-v0.2,4.1,3.5,6.1,6.3,2.6,2.2,3.5,3.2,5.5
 
 
 
 
 
1
  judge_model,turn,model,score,Coding,Extraction,Humanities,Math,Reasoning,Roleplay,STEM,Writing
2
+ gpt-4o,1,openai__GPT-4o-2024-05-13,9.4,8.7,9.6,9.6,9.9,9.0,9.2,9.7,9.3
3
+ gpt-4o,1,Anthropic__claude-3-5-sonnet-20240620,9.0,6.7,9.5,9.2,9.6,9.3,8.7,9.8,9.0
4
+ gpt-4o,1,openai__gpt-4-0125-preview,8.9,7.7,9.8,9.1,9.7,7.8,9.2,8.7,9.4
5
+ gpt-4o,1,openai__GPT-4o-mini-2024-07-18,8.8,7.3,9.2,9.4,10.0,6.9,8.7,9.6,9.1
6
+ gpt-4o,1,Anthropic__claude-3-opus-20240229,8.6,8.1,9.7,9.3,8.7,5.8,8.2,9.4,9.5
7
+ gpt-4o,1,mistralai__Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
8
+ gpt-4o,1,Qwen__Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
9
+ gpt-4o,1,google__gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
10
+ gpt-4o,1,google__gemini-1.5-pro,8.2,5.5,9.7,8.7,7.5,6.5,9.1,9.4,9.2
11
+ gpt-4o,1,davidkim205__ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
12
+ gpt-4o,1,google__gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
13
+ gpt-4o,1,alpindale__WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
14
+ gpt-4o,1,openai__gpt-3.5-turbo-0125,6.7,5.2,9.0,7.7,6.4,3.3,7.2,6.5,8.6
15
+ gpt-4o,1,meta-llama__Meta-Llama-3.1-70B-Instruct,6.6,6.4,8.7,8.0,4.5,4.0,7.9,7.4,5.9
16
+ gpt-4o,1,Qwen__Qwen2-7B-Instruct,6.5,3.9,9.0,8.0,5.6,3.6,7.0,6.6,8.2
17
+ gpt-4o,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,4.9,7.4,7.1,7.3,5.1,6.4,4.1,7.6
18
+ gpt-4o,1,Qwen__Qwen1.5-32B-Chat,6.1,4.0,8.6,8.5,4.7,2.6,6.3,7.5,6.7
19
+ gpt-4o,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.8,3.5,5.0,8.5,5.4,3.2,5.4,7.5,7.6
20
+ gpt-4o,1,davidkim205__Ko-Llama-3-8B-Instruct,5.7,4.6,7.0,7.7,2.8,2.5,6.2,6.9,7.6
21
+ gpt-4o,1,meta-llama__Meta-Llama-3.1-8B-Instruct,5.4,4.6,7.4,6.3,5.2,3.3,5.2,5.4,6.0
22
+ gpt-4o,1,Qwen__Qwen1.5-14B-Chat,5.4,3.3,7.2,6.8,4.2,2.0,5.7,6.7,7.2
23
+ gpt-4o,1,WizardLMTeam__WizardLM-13B-V1.2,4.8,3.4,8.2,6.1,2.2,3.4,5.0,4.3,6.1
24
+ gpt-4o,1,mistralai__Mistral-7B-Instruct-v0.2,2.6,3.0,3.7,2.0,1.7,1.3,4.5,1.4,3.1
25
+ gpt-4o,2,openai__GPT-4o-2024-05-13,8.3,7.9,8.9,9.2,8.1,7.0,8.9,8.7,7.5
26
+ gpt-4o,2,openai__gpt-4-0125-preview,8.0,7.2,8.5,8.9,6.8,7.3,8.7,8.1,8.6
27
+ gpt-4o,2,Anthropic__claude-3-5-sonnet-20240620,7.9,6.9,9.1,9.0,6.4,6.9,8.1,8.2,8.4
28
+ gpt-4o,2,openai__GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
29
+ gpt-4o,2,mistralai__Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
30
+ gpt-4o,2,google__gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
31
+ gpt-4o,2,google__gemini-1.5-pro,7.0,6.3,7.7,8.3,6.1,5.0,8.5,7.8,6.5
32
+ gpt-4o,2,Anthropic__claude-3-opus-20240229,6.9,6.0,9.0,7.3,6.2,5.8,7.3,6.5,7.5
33
+ gpt-4o,2,Qwen__Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
34
+ gpt-4o,2,davidkim205__ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
35
+ gpt-4o,2,alpindale__WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
36
+ gpt-4o,2,google__gemma-2-9b-it,6.2,4.8,7.6,8.3,4.9,3.9,7.0,7.4,6.1
37
+ gpt-4o,2,Qwen__Qwen1.5-32B-Chat,5.8,4.3,8.2,7.6,3.8,3.0,6.8,5.9,6.9
38
+ gpt-4o,2,meta-llama__Meta-Llama-3.1-70B-Instruct,5.7,5.5,8.0,7.4,3.6,2.9,6.6,5.7,5.7
39
+ gpt-4o,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,5.6,5.8,6.2,5.5,7.0,4.0,5.7,4.3,6.5
40
+ gpt-4o,2,openai__gpt-3.5-turbo-0125,5.4,5.8,5.7,7.2,4.4,3.0,6.6,4.4,6.4
41
+ gpt-4o,2,Qwen__Qwen2-7B-Instruct,5.3,5.0,7.0,6.6,5.1,2.7,5.6,4.8,5.9
42
+ gpt-4o,2,Qwen__Qwen1.5-14B-Chat,4.9,3.5,5.1,7.4,4.1,2.7,5.9,5.0,5.9
43
+ gpt-4o,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,4.5,3.3,3.8,7.6,4.9,2.1,5.6,5.7,3.3
44
+ gpt-4o,2,mistralai__Mistral-7B-Instruct-v0.2,4.5,3.9,4.4,6.8,2.2,2.4,6.2,5.6,4.6
45
+ gpt-4o,2,davidkim205__Ko-Llama-3-8B-Instruct,4.0,3.7,4.3,6.4,2.8,2.3,4.9,4.0,4.1
46
+ gpt-4o,2,meta-llama__Meta-Llama-3.1-8B-Instruct,3.9,4.1,5.0,4.8,3.8,2.1,4.0,3.5,3.6
47
+ gpt-4o,2,WizardLMTeam__WizardLM-13B-V1.2,3.0,2.6,3.5,3.6,1.8,2.3,3.7,3.3,2.8
48
+ keval,1,openai__GPT-4o-2024-05-13,9.1,7.8,9.5,9.6,9.9,8.8,8.7,9.3,9.2
49
+ keval,1,Anthropic__claude-3-5-sonnet-20240620,9.0,7.2,9.8,9.2,9.3,9.2,8.9,9.4,9.0
50
+ keval,1,openai__gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
51
+ keval,1,openai__GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
52
+ keval,1,Anthropic__claude-3-opus-20240229,8.4,8.1,9.8,8.7,8.3,5.8,7.9,9.2,9.0
53
+ keval,1,mistralai__Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
54
+ keval,1,google__gemini-1.5-pro,8.2,5.7,9.8,8.8,7.4,6.2,9.1,9.7,9.0
55
+ keval,1,google__gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
56
+ keval,1,Qwen__Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
57
+ keval,1,davidkim205__ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
58
+ keval,1,google__gemma-2-9b-it,7.6,6.7,8.8,8.5,5.2,5.5,9.0,8.6,8.5
59
+ keval,1,meta-llama__Meta-Llama-3.1-70B-Instruct,7.3,6.8,9.0,8.3,5.9,5.1,8.4,8.0,7.1
60
+ keval,1,Qwen__Qwen1.5-14B-Chat,7.2,4.7,9.7,8.8,4.5,4.8,8.1,8.9,8.4
61
+ keval,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,7.2,5.7,8.8,8.1,8.2,6.0,7.7,5.6,7.3
62
+ keval,1,alpindale__WizardLM-2-8x22B,7.1,6.1,5.6,7.9,8.8,5.9,6.5,8.7,7.1
63
+ keval,1,Qwen__Qwen1.5-32B-Chat,7.0,3.9,9.9,8.9,5.8,3.6,7.1,8.6,7.9
64
+ keval,1,openai__gpt-3.5-turbo-0125,6.9,5.6,8.9,7.7,6.4,3.2,7.4,7.5,8.6
65
+ keval,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,6.8,3.4,8.6,8.5,5.5,4.1,6.9,8.8,8.4
66
+ keval,1,Qwen__Qwen2-7B-Instruct,6.4,3.6,9.0,7.7,5.5,3.5,7.1,6.7,8.4
67
+ keval,1,meta-llama__Meta-Llama-3.1-8B-Instruct,6.3,4.3,8.9,7.7,5.3,3.3,7.3,6.0,7.5
68
+ keval,1,davidkim205__Ko-Llama-3-8B-Instruct,6.0,5.0,7.4,7.6,2.9,2.9,7.0,8.0,7.6
69
+ keval,1,WizardLMTeam__WizardLM-13B-V1.2,6.0,3.7,9.3,7.7,2.4,3.8,7.0,6.6,7.7
70
+ keval,1,mistralai__Mistral-7B-Instruct-v0.2,3.0,3.0,6.7,3.0,2.0,2.0,3.3,1.9,2.4
71
+ keval,2,openai__GPT-4o-2024-05-13,8.1,7.7,8.9,9.2,7.8,6.9,8.4,8.7,7.4
72
+ keval,2,openai__gpt-4-0125-preview,7.7,6.3,8.4,8.8,6.9,6.3,8.6,8.6,8.0
73
+ keval,2,openai__GPT-4o-mini-2024-07-18,7.4,6.8,7.6,8.7,7.7,4.3,7.8,8.4,7.8
74
+ keval,2,Anthropic__claude-3-5-sonnet-20240620,7.3,6.6,7.6,9.0,6.6,5.7,7.6,8.1,7.1
75
+ keval,2,mistralai__Mistral-Large-Instruct-2407,7.0,5.4,7.3,8.5,7.3,5.2,7.9,7.8,6.9
76
+ keval,2,Qwen__Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
77
+ keval,2,google__gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
78
+ keval,2,Anthropic__claude-3-opus-20240229,6.8,6.2,8.4,7.8,5.4,5.1,7.0,7.3,7.5
79
+ keval,2,alpindale__WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
80
+ keval,2,google__gemini-1.5-pro,6.5,5.2,6.9,8.4,6.0,4.8,8.1,7.3,5.4
81
+ keval,2,davidkim205__ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
82
+ keval,2,google__gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
83
+ keval,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
84
+ keval,2,Qwen__Qwen1.5-32B-Chat,6.2,5.2,7.7,8.0,4.1,4.0,7.7,6.7,6.5
85
+ keval,2,Qwen__Qwen1.5-14B-Chat,6.0,4.7,6.9,7.9,4.8,3.8,7.2,6.3,6.7
86
+ keval,2,meta-llama__Meta-Llama-3.1-70B-Instruct,6.0,6.0,7.3,7.6,5.6,2.9,7.0,6.2,5.6
87
+ keval,2,Qwen__Qwen2-7B-Instruct,5.6,4.9,7.0,6.5,5.1,3.1,6.3,5.0,6.5
88
+ keval,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.5,4.6,4.9,6.7,5.9,3.2,6.9,6.8,5.2
89
+ keval,2,openai__gpt-3.5-turbo-0125,5.3,6.2,5.5,7.0,4.5,3.3,6.2,4.5,5.4
90
+ keval,2,meta-llama__Meta-Llama-3.1-8B-Instruct,4.8,5.0,6.0,5.5,4.4,2.6,5.9,5.0,4.4
91
+ keval,2,davidkim205__Ko-Llama-3-8B-Instruct,4.2,3.6,4.6,6.3,2.8,2.2,6.1,3.7,4.3
92
+ keval,2,WizardLMTeam__WizardLM-13B-V1.2,4.1,3.7,5.4,5.8,2.8,3.0,5.6,3.3,3.4
93
+ keval,2,mistralai__Mistral-7B-Instruct-v0.2,4.1,3.5,6.1,6.3,2.6,2.2,3.5,3.2,5.5