davidkim205 committed on
Commit
d05d8fb
•
1 Parent(s): 2d3d046

update ko_bench

Browse files
Files changed (2) hide show
  1. app.py +71 -49
  2. ko_bench.csv +4 -0
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
- import plotly.express as px
5
  import random
6
  import plotly.graph_objects as go
7
 
@@ -62,7 +61,7 @@ def get_license(model): # ๋Œ€์†Œ๋ฌธ์ž ๋ฌด์‹œํ•˜๊ณ  ๋ชจ๋ธ์„ ๋งค์นญํ•˜๊ธฐ ์œ„
62
 
63
  # dataframe_full
64
  df_full_rs = df_rs.copy()
65
- df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
66
  df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
67
 
68
  df_full_rs = df_full_rs.drop(columns=['turn']) # ๋ชจ๋ธ๋ณ„ turn1,2 score ํ•ฉ๋ณ‘
@@ -70,16 +69,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean
70
  df_full_rs = df_full_rs.round(2)
71
  df_full_rs.replace("", np.nan, inplace=True)
72
 
73
- df_full_rs['Ko-Bench/openai'] = '' # Ko-Bench/openai, Ko-Bench/keval ์—ด ์ถ”๊ฐ€
74
- df_full_rs['Ko-Bench/keval'] = ''
75
  for idx, j_model in df_full_rs['judge_model'].items():
76
  if j_model == 'keval':
77
- df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
78
  else :
79
- df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
80
  df_full_rs = df_full_rs.drop(columns=['judge_model'])
81
 
82
- df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # Ko-Bench/openai, Ko-Bench/keval ํ–‰ ํ•ฉ๋ณ‘
83
  df_full_rs = df_full_rs.round(2)
84
  df_full_rs.replace("", np.nan, inplace=True)
85
 
@@ -93,9 +92,9 @@ df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
93
  df_full_rs['License'] = '' # License ์—ด ์ถ”๊ฐ€
94
  df_full_rs['License'] = df_full_rs['model'].apply(get_license)
95
 
96
- df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
97
  df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
98
- df_full_rs = df_full_rs.drop(columns=['Ko-Bench'])
99
 
100
  plot_models = df_full_rs['model'].unique() # model detail view๋ฅผ ์œ„ํ•œ models ๋ฆฌ์ŠคํŠธ
101
 
@@ -135,7 +134,8 @@ df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
135
  # model detail view
136
  plot_models_list = plot_models.tolist()
137
  CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
138
- category_labels = ['Selected model turn1', 'Selected model turn2', 'Top1 turn1', 'Top1 turn2']
 
139
  random.seed(42)
140
 
141
  def search_dataframe(query): # df ๊ฒ€์ƒ‰ ํ•จ์ˆ˜ ์ •์˜
@@ -144,32 +144,36 @@ def search_dataframe(query): # df ๊ฒ€์ƒ‰ ํ•จ์ˆ˜ ์ •์˜
144
  filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)]
145
  return filtered_df
146
 
147
- def radar_chart(categories, Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2): # plot ๊ทธ๋ฆฌ๋Š” ํ•จ์ˆ˜
148
  #categories = categories.split(',')
149
-
150
- Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
151
- Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
152
  Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
153
  Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]
 
 
154
 
155
  values_lists = [
156
- list(map(float, Selected_model_turn1)),
157
- list(map(float, Selected_model_turn2)),
158
  list(map(float, Top1_turn1)),
159
- list(map(float, Top1_turn2))
 
 
160
  ]
161
 
 
 
 
 
 
162
  fig = go.Figure()
163
 
164
  for i, values in enumerate(values_lists):
165
  if len(categories) != len(values):
166
  return f"Error in dataset {i+1}: Number of categories and values must be the same."
167
-
168
  fig.add_trace(go.Scatterpolar(
169
  r=values + [values[0]], # Closing the loop of the radar chart
170
  theta=categories + [categories[0]], # Closing the loop of the radar chart
171
  mode='lines',
172
- name=category_labels[i] # Label for the dataset
 
173
  ))
174
 
175
  fig.update_layout(
@@ -185,63 +189,82 @@ def radar_chart(categories, Selected_model_turn1, Selected_model_turn2, Top1_tur
185
  )
186
  ),
187
  showlegend=True,
188
- width=555, # ์ ์ ˆํ•œ ๋„ˆ๋น„ ์„ค์ •
189
- height=550, # ์ ์ ˆํ•œ ๋†’์ด ์„ค์ •
190
  margin=dict(l=1000, r=20, t=20, b=20),
191
- autosize = False,
192
  paper_bgcolor='white',
193
- plot_bgcolor='lightgrey'
 
 
 
 
 
 
 
194
  )
195
  return fig
196
 
197
  def search_openai_plot(dropdown_model): # openai plot ํ•จ์ˆ˜ ์ •์˜
198
- condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
199
- openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
 
 
 
200
 
201
- condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
202
- openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
203
 
204
- condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == df_openai.loc[0,'model'])
205
- top1_openai_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
206
 
207
- condition4 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == df_openai.loc[0,'model'])
208
- top1_openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
 
 
 
209
 
210
- fig = radar_chart(CATEGORIES, openai_turn1, openai_turn2, top1_openai_turn1, top1_openai_turn2)
211
  return fig
212
 
213
  def search_keval_plot(dropdown_model): # keval plot ํ•จ์ˆ˜ ์ •์˜
214
- condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
215
- keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
216
 
217
- condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
218
- keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
219
 
220
- condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == df_keval.loc[0,'model'])
221
- top1_keval_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
222
 
223
- condition4 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == df_keval.loc[0,'model'])
224
- top1_keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
225
 
226
- fig = radar_chart(CATEGORIES, keval_turn1, keval_turn2, top1_keval_turn1, top1_keval_turn2)
 
 
 
 
 
 
227
  return fig
228
 
229
 
230
  #gradio
231
  with gr.Blocks() as demo:
232
  gr.Markdown("")
233
- gr.Markdown("# ๐Ÿ† Ko-Bench Leaderboard")
 
234
  gr.Markdown("")
235
- gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
236
  gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
237
- gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
238
- gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
 
239
  gr.Markdown("")
240
- gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
241
- gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
242
  gr.Markdown("")
243
 
244
- with gr.TabItem("Ko-Bench"):
245
  gr.Dataframe(value=df_full_rs)
246
  with gr.TabItem("Openai Judgment"):
247
  gr.Dataframe(value=df_openai)
@@ -257,7 +280,6 @@ with gr.Blocks() as demo:
257
  with gr.Row():
258
  plot_openai = gr.Plot(label="Openai Plot")
259
  dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
260
- #with gr.Row():
261
  plot_keval = gr.Plot(label="Keval Plot")
262
  dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
263
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
 
4
  import random
5
  import plotly.graph_objects as go
6
 
 
61
 
62
  # dataframe_full
63
  df_full_rs = df_rs.copy()
64
+ df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
65
  df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
66
 
67
  df_full_rs = df_full_rs.drop(columns=['turn']) # ๋ชจ๋ธ๋ณ„ turn1,2 score ํ•ฉ๋ณ‘
 
69
  df_full_rs = df_full_rs.round(2)
70
  df_full_rs.replace("", np.nan, inplace=True)
71
 
72
+ df_full_rs['KO-Bench/openai'] = '' # KO-Bench/openai, KO-Bench/keval ์—ด ์ถ”๊ฐ€
73
+ df_full_rs['KO-Bench/keval'] = ''
74
  for idx, j_model in df_full_rs['judge_model'].items():
75
  if j_model == 'keval':
76
+ df_full_rs.at[idx, 'KO-Bench/keval'] = df_full_rs.at[idx, 'KO-Bench']
77
  else :
78
+ df_full_rs.at[idx, 'KO-Bench/openai'] = df_full_rs.at[idx, 'KO-Bench']
79
  df_full_rs = df_full_rs.drop(columns=['judge_model'])
80
 
81
+ df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # KO-Bench/openai, KO-Bench/keval ํ–‰ ํ•ฉ๋ณ‘
82
  df_full_rs = df_full_rs.round(2)
83
  df_full_rs.replace("", np.nan, inplace=True)
84
 
 
92
  df_full_rs['License'] = '' # License ์—ด ์ถ”๊ฐ€
93
  df_full_rs['License'] = df_full_rs['model'].apply(get_license)
94
 
95
+ df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
96
  df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
97
+ df_full_rs = df_full_rs.drop(columns=['KO-Bench'])
98
 
99
  plot_models = df_full_rs['model'].unique() # model detail view๋ฅผ ์œ„ํ•œ models ๋ฆฌ์ŠคํŠธ
100
 
 
134
  # model detail view
135
  plot_models_list = plot_models.tolist()
136
  CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
137
+ colors_openai = ['#ff0000', '#ff1493', '#115e02', '#21ad05']
138
+ colors_keval = ['#ff0000', '#ff1493', '#0000ff', '#0592eb']
139
  random.seed(42)
140
 
141
  def search_dataframe(query): # df ๊ฒ€์ƒ‰ ํ•จ์ˆ˜ ์ •์˜
 
144
  filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)]
145
  return filtered_df
146
 
147
+ def radar_chart(categories, Top1_turn1, Top1_turn2, Selected_model_turn1, Selected_model_turn2, category_labels, str): # plot ๊ทธ๋ฆฌ๋Š” ํ•จ์ˆ˜
148
  #categories = categories.split(',')
 
 
 
149
  Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
150
  Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]
151
+ Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
152
+ Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
153
 
154
  values_lists = [
 
 
155
  list(map(float, Top1_turn1)),
156
+ list(map(float, Top1_turn2)),
157
+ list(map(float, Selected_model_turn1)),
158
+ list(map(float, Selected_model_turn2))
159
  ]
160
 
161
+ if str == "openai": colors = colors_openai
162
+ else: colors = colors_keval
163
+ if str == "openai": title_text = "< Openai >"
164
+ else: title_text = "< Keval >"
165
+
166
  fig = go.Figure()
167
 
168
  for i, values in enumerate(values_lists):
169
  if len(categories) != len(values):
170
  return f"Error in dataset {i+1}: Number of categories and values must be the same."
 
171
  fig.add_trace(go.Scatterpolar(
172
  r=values + [values[0]], # Closing the loop of the radar chart
173
  theta=categories + [categories[0]], # Closing the loop of the radar chart
174
  mode='lines',
175
+ name=category_labels[i], # Label for the dataset
176
+ line = dict(color= colors[i])
177
  ))
178
 
179
  fig.update_layout(
 
189
  )
190
  ),
191
  showlegend=True,
192
+ #width=650, # ์ ์ ˆํ•œ ๋„ˆ๋น„ ์„ค์ •
193
+ #height=650, # ์ ์ ˆํ•œ ๋†’์ด ์„ค์ •
194
  margin=dict(l=1000, r=20, t=20, b=20),
195
+ #autosize = False,
196
  paper_bgcolor='white',
197
+ plot_bgcolor='lightgrey',
198
+ title=dict(
199
+ text=title_text, # ์ œ๋ชฉ์„ ์›ํ•˜๋Š” ํ…์ŠคํŠธ๋กœ ๋ณ€๊ฒฝ
200
+ x=0.5, # ์ œ๋ชฉ์˜ x ์œ„์น˜ (0=์™ผ์ชฝ, 0.5=์ค‘์•™, 1=์˜ค๋ฅธ์ชฝ)
201
+ xanchor='center', # ์ œ๋ชฉ์˜ x ์œ„์น˜ ๊ธฐ์ค€ (center, left, right)
202
+ y=0.95, # ์ œ๋ชฉ์˜ y ์œ„์น˜ (0=ํ•˜๋‹จ, 1=์ƒ๋‹จ)
203
+ yanchor='top' # ์ œ๋ชฉ์˜ y ์œ„์น˜ ๊ธฐ์ค€ (top, middle, bottom)
204
+ )
205
  )
206
  return fig
207
 
208
  def search_openai_plot(dropdown_model): # openai plot ํ•จ์ˆ˜ ์ •์˜
209
+ condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == df_openai.iat[0, df_openai.columns.get_loc('model')])
210
+ top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
211
+
212
+ condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == df_openai.iat[0, df_openai.columns.get_loc('model')])
213
+ top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
214
 
215
+ condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
216
+ openai_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
217
 
218
+ condition4 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
219
+ openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
220
 
221
+ category_labels = []
222
+ category_labels.append(df_openai.iat[0, df_openai.columns.get_loc('model')] + " /Turn 1")
223
+ category_labels.append(df_openai.iat[0, df_openai.columns.get_loc('model')] + " /Turn 2")
224
+ category_labels.append(dropdown_model + " /Turn 1")
225
+ category_labels.append(dropdown_model + " /Turn 2")
226
 
227
+ fig = radar_chart(CATEGORIES, top1_openai_turn1, top1_openai_turn2, openai_turn1, openai_turn2, category_labels,"openai")
228
  return fig
229
 
230
  def search_keval_plot(dropdown_model): # keval plot ํ•จ์ˆ˜ ์ •์˜
231
+ condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == df_keval.iat[0, df_keval.columns.get_loc('model')])
232
+ top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
233
 
234
+ condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == df_keval.iat[0, df_keval.columns.get_loc('model')])
235
+ top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
236
 
237
+ condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
238
+ keval_turn1 = df.loc[condition3, 'Coding':'Writing'].values.tolist()
239
 
240
+ condition4 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
241
+ keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
242
 
243
+ category_labels = []
244
+ category_labels.append(df_keval.iat[0, df_keval.columns.get_loc('model')] + " /Turn 1")
245
+ category_labels.append(df_keval.iat[0, df_keval.columns.get_loc('model')] + " /Turn 2")
246
+ category_labels.append(dropdown_model + " /Turn 1")
247
+ category_labels.append(dropdown_model + " /Turn 2")
248
+
249
+ fig = radar_chart(CATEGORIES, top1_keval_turn1, top1_keval_turn2, keval_turn1, keval_turn2, category_labels, "keval")
250
  return fig
251
 
252
 
253
  #gradio
254
  with gr.Blocks() as demo:
255
  gr.Markdown("")
256
+ gr.Markdown("# ๐Ÿ† KO-Bench Leaderboard")
257
+ gr.Markdown("")
258
  gr.Markdown("")
259
+ gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
260
  gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
261
+ gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
262
+ gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
263
+ gr.Markdown("")
264
  gr.Markdown("")
 
 
265
  gr.Markdown("")
266
 
267
+ with gr.TabItem("KO-Bench"):
268
  gr.Dataframe(value=df_full_rs)
269
  with gr.TabItem("Openai Judgment"):
270
  gr.Dataframe(value=df_openai)
 
280
  with gr.Row():
281
  plot_openai = gr.Plot(label="Openai Plot")
282
  dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
 
283
  plot_keval = gr.Plot(label="Keval Plot")
284
  dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
285
 
ko_bench.csv CHANGED
@@ -5,6 +5,7 @@ gpt-4o,1,GPT-4o-mini-2024-07-18,8.8,7.3,9.2,9.4,10.0,6.9,8.7,9.6,9.1
5
  gpt-4o,1,Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
6
  gpt-4o,1,Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
7
  gpt-4o,1,gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
 
8
  gpt-4o,1,ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
9
  gpt-4o,1,gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
10
  gpt-4o,1,WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
@@ -24,6 +25,7 @@ gpt-4o,2,gpt-4-0125-preview,8.0,7.2,8.5,8.9,6.8,7.3,8.7,8.1,8.6
24
  gpt-4o,2,GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
25
  gpt-4o,2,Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
26
  gpt-4o,2,gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
 
27
  gpt-4o,2,Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
28
  gpt-4o,2,ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
29
  gpt-4o,2,WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
@@ -43,6 +45,7 @@ keval,1,GPT-4o-2024-05-13,9.1,7.8,9.5,9.6,9.9,8.8,8.7,9.3,9.2
43
  keval,1,gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
44
  keval,1,GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
45
  keval,1,Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
 
46
  keval,1,gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
47
  keval,1,Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
48
  keval,1,ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
@@ -66,6 +69,7 @@ keval,2,Mistral-Large-Instruct-2407,7.0,5.4,7.3,8.5,7.3,5.2,7.9,7.8,6.9
66
  keval,2,Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
67
  keval,2,gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
68
  keval,2,WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
 
69
  keval,2,ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
70
  keval,2,gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
71
  keval,2,EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
 
5
  gpt-4o,1,Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
6
  gpt-4o,1,Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
7
  gpt-4o,1,gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
8
+ gpt-4o,1,gemini-1.5-pro,8.2,5.5,9.7,8.7,7.5,6.5,9.1,9.4,9.2
9
  gpt-4o,1,ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
10
  gpt-4o,1,gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
11
  gpt-4o,1,WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
 
25
  gpt-4o,2,GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
26
  gpt-4o,2,Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
27
  gpt-4o,2,gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
28
+ gpt-4o,2,gemini-1.5-pro,7.0,6.3,7.7,8.3,6.1,5.0,8.5,7.8,6.5
29
  gpt-4o,2,Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
30
  gpt-4o,2,ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
31
  gpt-4o,2,WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
 
45
  keval,1,gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
46
  keval,1,GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
47
  keval,1,Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
48
+ keval,1,gemini-1.5-pro,8.2,5.7,9.8,8.8,7.4,6.2,9.1,9.7,9.0
49
  keval,1,gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
50
  keval,1,Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
51
  keval,1,ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
 
69
  keval,2,Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
70
  keval,2,gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
71
  keval,2,WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
72
+ keval,2,gemini-1.5-pro,6.5,5.2,6.9,8.4,6.0,4.8,8.1,7.3,5.4
73
  keval,2,ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
74
  keval,2,gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
75
  keval,2,EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8