Spaces:
Running
Running
davidkim205
commited on
Commit
•
174062d
1
Parent(s):
731e515
add claud-3-5 results
Browse files- app.py +157 -48
- ko_bench.csv +92 -88
app.py
CHANGED
@@ -3,27 +3,43 @@ import pandas as pd
|
|
3 |
import numpy as np
|
4 |
import random
|
5 |
import plotly.graph_objects as go
|
|
|
|
|
6 |
|
7 |
file_result_score = 'ko_bench.csv'
|
8 |
|
9 |
file_full_lb = 'mt_bench_240805.csv'
|
10 |
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
# read csv
|
13 |
df_result_score = pd.read_csv(file_result_score)
|
14 |
df_full_lb = pd.read_csv(file_full_lb)
|
15 |
|
16 |
-
|
17 |
# dataframe
|
18 |
df = pd.DataFrame(df_result_score)
|
|
|
|
|
19 |
df_rs = pd.DataFrame(df_result_score)
|
|
|
|
|
|
|
|
|
20 |
df_full_lboard = pd.DataFrame(df_full_lb)
|
21 |
|
22 |
df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True) # MT-bench์ GPT-4-1106-preview ๋ฅผ gpt-4-0125-preview๋ก ๋ณ๊ฒฝ
|
23 |
-
models = df_full_lboard['Model'].unique() # ์ด ์ถ๊ฐ๋ฅผ ์ํ models ๋ฆฌ์คํธ
|
24 |
df_rs.replace("", np.nan, inplace=True) # ๋ชจ๋ธ๋ณ turn1,2 score ํฉ๋ณ
|
25 |
|
26 |
def custom_mean(series):
|
|
|
|
|
27 |
numeric_series = pd.to_numeric(series, errors='coerce') # ์๋ฆฌ์ฆ๋ฅผ ์ซ์๋ก ๋ณํ
|
28 |
return numeric_series.mean() if not numeric_series.isna().all() else np.nan # NaN์ด ์๋ ๊ฐ์ด ํ๋๋ผ๋ ์์ผ๋ฉด ํ๊ท ๊ณ์ฐ
|
29 |
|
@@ -34,7 +50,8 @@ def get_mt_bench(model): # ๋์๋ฌธ์ ๋ฌด์ํ๊ณ ๋ชจ๋ธ์ ๋งค์นญํ๊ธฐ ์
|
|
34 |
return matching_rows['MT-bench (score)'].values[0]
|
35 |
return ''
|
36 |
|
37 |
-
def get_organization(
|
|
|
38 |
if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
|
39 |
return 'Mistral'
|
40 |
elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
|
@@ -44,13 +61,32 @@ def get_organization(model): # ๋์๋ฌธ์ ๋ฌด์ํ๊ณ ๋ชจ๋ธ์ ๋งค์นญํ๊ธฐ
|
|
44 |
matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
|
45 |
if not matching_rows.empty:
|
46 |
return matching_rows['Organization'].values[0]
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def get_license(model): # ๋์๋ฌธ์ ๋ฌด์ํ๊ณ ๋ชจ๋ธ์ ๋งค์นญํ๊ธฐ ์ํ ํจ์ ์ ์
|
50 |
-
if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
|
51 |
return 'Apache-2.0'
|
52 |
elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
|
53 |
return 'llama3'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
model_lower = model.lower()
|
56 |
matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
|
@@ -58,10 +94,26 @@ def get_license(model): # ๋์๋ฌธ์ ๋ฌด์ํ๊ณ ๋ชจ๋ธ์ ๋งค์นญํ๊ธฐ ์
|
|
58 |
return matching_rows['License'].values[0]
|
59 |
return ''
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
# dataframe_full
|
63 |
df_full_rs = df_rs.copy()
|
64 |
-
df_full_rs.rename(columns={'score': '
|
65 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
66 |
|
67 |
df_full_rs = df_full_rs.drop(columns=['turn']) # ๋ชจ๋ธ๋ณ turn1,2 score ํฉ๋ณ
|
@@ -69,16 +121,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean
|
|
69 |
df_full_rs = df_full_rs.round(2)
|
70 |
df_full_rs.replace("", np.nan, inplace=True)
|
71 |
|
72 |
-
df_full_rs['
|
73 |
-
df_full_rs['
|
74 |
for idx, j_model in df_full_rs['judge_model'].items():
|
75 |
if j_model == 'keval':
|
76 |
-
df_full_rs.at[idx, '
|
77 |
else :
|
78 |
-
df_full_rs.at[idx, '
|
79 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
80 |
|
81 |
-
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() #
|
82 |
df_full_rs = df_full_rs.round(2)
|
83 |
df_full_rs.replace("", np.nan, inplace=True)
|
84 |
|
@@ -87,17 +139,20 @@ df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
|
|
87 |
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
|
88 |
|
89 |
df_full_rs['Organization'] = '' # Organization ์ด ์ถ๊ฐ
|
90 |
-
df_full_rs['Organization'] = df_full_rs
|
91 |
|
92 |
df_full_rs['License'] = '' # License ์ด ์ถ๊ฐ
|
93 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
94 |
|
95 |
-
df_full_rs = df_full_rs.sort_values(by='
|
96 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
97 |
-
df_full_rs = df_full_rs.drop(columns=['KO-Bench'])
|
98 |
|
99 |
plot_models = df_full_rs['model'].unique() # model detail view๋ฅผ ์ํ models ๋ฆฌ์คํธ
|
100 |
|
|
|
|
|
|
|
|
|
101 |
|
102 |
# dataframe
|
103 |
df_rs['MT-Bench'] = '' # MT-Bench ์ด ์ถ๊ฐ
|
@@ -115,6 +170,10 @@ df_openai = df_openai.drop(columns=['judge_model', 'turn']) # ๋ชจ๋ธ๋ณ turn1,2
|
|
115 |
df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
|
116 |
df_openai = df_openai.round(2)
|
117 |
|
|
|
|
|
|
|
|
|
118 |
df_openai = df_openai.sort_values(by='score', ascending=False)
|
119 |
df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
|
120 |
|
@@ -127,6 +186,10 @@ df_keval = df_keval.drop(columns=['judge_model', 'turn']) # ๋ชจ๋ธ๋ณ turn1,2 sc
|
|
127 |
df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
|
128 |
df_keval = df_keval.round(2)
|
129 |
|
|
|
|
|
|
|
|
|
130 |
df_keval = df_keval.sort_values(by='score', ascending=False)
|
131 |
df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
|
132 |
|
@@ -206,10 +269,13 @@ def radar_chart(categories, Top1_turn1, Top1_turn2, Selected_model_turn1, Select
|
|
206 |
return fig
|
207 |
|
208 |
def search_openai_plot(dropdown_model): # openai plot ํจ์ ์ ์
|
209 |
-
|
|
|
|
|
|
|
210 |
top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
|
211 |
|
212 |
-
condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] ==
|
213 |
top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
|
214 |
|
215 |
condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
|
@@ -219,8 +285,8 @@ def search_openai_plot(dropdown_model): # openai plot ํจ์ ์ ์
|
|
219 |
openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
|
220 |
|
221 |
category_labels = []
|
222 |
-
category_labels.append(
|
223 |
-
category_labels.append(
|
224 |
category_labels.append(dropdown_model + " /Turn 1")
|
225 |
category_labels.append(dropdown_model + " /Turn 2")
|
226 |
|
@@ -228,10 +294,13 @@ def search_openai_plot(dropdown_model): # openai plot ํจ์ ์ ์
|
|
228 |
return fig
|
229 |
|
230 |
def search_keval_plot(dropdown_model): # keval plot ํจ์ ์ ์
|
231 |
-
|
|
|
|
|
|
|
232 |
top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
|
233 |
|
234 |
-
condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] ==
|
235 |
top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
|
236 |
|
237 |
condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
|
@@ -241,8 +310,8 @@ def search_keval_plot(dropdown_model): # keval plot ํจ์ ์ ์
|
|
241 |
keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
|
242 |
|
243 |
category_labels = []
|
244 |
-
category_labels.append(
|
245 |
-
category_labels.append(
|
246 |
category_labels.append(dropdown_model + " /Turn 1")
|
247 |
category_labels.append(dropdown_model + " /Turn 2")
|
248 |
|
@@ -250,37 +319,77 @@ def search_keval_plot(dropdown_model): # keval plot ํจ์ ์ ์
|
|
250 |
return fig
|
251 |
|
252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
#gradio
|
254 |
-
with gr.Blocks() as demo:
|
255 |
gr.Markdown("")
|
256 |
-
gr.Markdown("# ๐
|
257 |
gr.Markdown("")
|
258 |
-
gr.Markdown("")
|
259 |
-
gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
|
260 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
261 |
-
gr.Markdown("-
|
262 |
-
gr.Markdown("-
|
263 |
-
gr.Markdown("")
|
264 |
gr.Markdown("")
|
|
|
|
|
265 |
gr.Markdown("")
|
266 |
|
267 |
-
with gr.
|
268 |
-
gr.
|
269 |
-
|
270 |
-
|
271 |
-
with gr.TabItem("Keval Judgment"):
|
272 |
-
gr.Dataframe(value=df_keval)
|
273 |
-
with gr.TabItem("Model Detail View"):
|
274 |
-
with gr.Blocks():
|
275 |
with gr.Row():
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import numpy as np
|
4 |
import random
|
5 |
import plotly.graph_objects as go
|
6 |
+
from bs4 import BeautifulSoup
|
7 |
+
import plotly.express as px
|
8 |
|
9 |
file_result_score = 'ko_bench.csv'
|
10 |
|
11 |
file_full_lb = 'mt_bench_240805.csv'
|
12 |
|
13 |
|
14 |
+
def add_hf_link(row):
    """Fill ``row['link']`` from an ``organization__model`` identifier.

    Args:
        row: Item-addressable record (a dict, or a pandas Series as passed by
            ``DataFrame.apply(..., axis=1)``) whose ``'model'`` entry is a
            string of the form ``"<organization>__<model>"``.

    Returns:
        The same ``row`` object. ``row['link']`` is set to the Hugging Face
        model page unless the organization is google/openai/anthropic; for
        google, Gemini models get the Gemini API page instead. In every other
        case the existing ``'link'`` value is left untouched.
    """
    # partition() never raises on a missing or repeated separator, unlike
    # two-target unpacking of split('__').
    organization, separator, model = row['model'].partition('__')
    if not separator:
        # No organization prefix, so no URL can be derived.
        return row

    org_key = organization.lower()
    if org_key not in ('google', 'openai', 'anthropic'):
        row['link'] = f"https://huggingface.co/{organization}/{model}"
    elif org_key == 'google' and 'gemini' in model.lower():
        # Matched case-insensitively, like the organization name above.
        row['link'] = "https://ai.google.dev/gemini-api"
    return row
|
21 |
+
|
22 |
# read csv
|
23 |
df_result_score = pd.read_csv(file_result_score)
|
24 |
df_full_lb = pd.read_csv(file_full_lb)
|
25 |
|
|
|
26 |
# dataframe
|
27 |
df = pd.DataFrame(df_result_score)
|
28 |
+
df['model'] = df['model'].str.split('__').str[1]
|
29 |
+
|
30 |
df_rs = pd.DataFrame(df_result_score)
|
31 |
+
df_rs['link'] = ''
|
32 |
+
df_rs = df_rs.apply(add_hf_link, axis=1)
|
33 |
+
df_rs['organization'] = df_rs['model'].str.split('__').str[0]
|
34 |
+
df_rs['model'] = df_rs['model'].str.split('__').str[1]
|
35 |
df_full_lboard = pd.DataFrame(df_full_lb)
|
36 |
|
37 |
df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True) # Rename MT-bench's GPT-4-1106-preview to gpt-4-0125-preview
|
|
|
38 |
df_rs.replace("", np.nan, inplace=True) # Merge turn 1/2 scores per model
|
39 |
|
40 |
def custom_mean(series):
    """Aggregate one grouped column.

    The 'link' and 'organization' columns are passed through by returning the
    group's first value. Every other column is coerced to numbers and
    averaged; NaN is returned when no value in the group is numeric.
    """
    if series.name in ('link', 'organization'):
        return series.values[0]

    # Convert the series to numbers; anything non-numeric becomes NaN.
    as_numbers = pd.to_numeric(series, errors='coerce')
    if as_numbers.isna().all():
        return np.nan
    # At least one non-NaN value exists, so compute the mean.
    return as_numbers.mean()
|
45 |
|
|
|
50 |
return matching_rows['MT-bench (score)'].values[0]
|
51 |
return ''
|
52 |
|
53 |
+
def get_organization(row): # Define a function that matches models case-insensitively
|
54 |
+
model = row['model']
|
55 |
if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
|
56 |
return 'Mistral'
|
57 |
elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
|
|
|
61 |
matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
|
62 |
if not matching_rows.empty:
|
63 |
return matching_rows['Organization'].values[0]
|
64 |
+
|
65 |
+
if row['organization'] != '' and pd.notna(row['organization']):
|
66 |
+
organization = row['organization'].lower()
|
67 |
+
if organization == 'qwen':
|
68 |
+
return 'Alibaba'
|
69 |
+
elif organization == 'google':
|
70 |
+
return 'Google'
|
71 |
+
elif organization == 'lgai-exaone':
|
72 |
+
return 'LGAI'
|
73 |
+
|
74 |
+
return row['organization']
|
75 |
|
76 |
def get_license(model): # Define a function that matches models case-insensitively
|
77 |
+
if pd.Series(model).str.contains('mistral-large|WizardLM-2-8x22B|ko-gemma-2', case=False, regex=True).any():
|
78 |
return 'Apache-2.0'
|
79 |
elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
|
80 |
return 'llama3'
|
81 |
+
elif pd.Series(model).str.contains('Ko-Llama-3-8B-Instruct', case=False, regex=True).any():
|
82 |
+
return 'Llama Community'
|
83 |
+
elif pd.Series(model).str.contains('claude|gemini|EXAONE-3.0-7.8B-Instruct', case=False, regex=True).any():
|
84 |
+
return 'Proprietary'
|
85 |
+
elif pd.Series(model).str.contains('qwen', case=False, regex=True).any():
|
86 |
+
if pd.Series(model).str.contains('max', case=False, regex=True).any():
|
87 |
+
return 'Proprietary'
|
88 |
+
else:
|
89 |
+
return 'Qianwen LICENSE'
|
90 |
|
91 |
model_lower = model.lower()
|
92 |
matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
|
|
|
94 |
return matching_rows['License'].values[0]
|
95 |
return ''
|
96 |
|
97 |
+
def get_link(row):
    """Fill a missing ``row['link']`` from the full leaderboard table.

    A row that already has a non-empty, non-NaN link is returned unchanged.
    Otherwise ``row['model']`` is lower-cased and compared against the
    lower-cased ``'key'`` column of the module-level ``df_full_lboard``; on a
    match the first matching ``'Link'`` value is stored in ``row['link']``.

    Returns:
        The same ``row`` object.
    """
    existing = row['link']
    if pd.notna(existing) and existing != '':
        return row

    # Case-insensitive match of the model name against the leaderboard keys.
    wanted = row['model'].lower()
    leaderboard_keys = df_full_lboard['key'].str.lower()
    hits = df_full_lboard[leaderboard_keys == wanted]
    if hits.empty:
        return row

    row['link'] = hits['Link'].values[0]
    return row
|
106 |
+
|
107 |
+
def add_link(row):
    """Wrap ``row['model']`` in an HTML anchor pointing at ``row['link']``.

    A NaN link is normalized to ``''``. When the link is non-empty, the model
    name is replaced by ``<a href="...">model</a>``. The href is quoted and
    both href and label are HTML-escaped, so a link or name containing
    whitespace, quotes, ``&``, ``<`` or ``>`` cannot break the markup.

    Returns:
        The same ``row`` object.
    """
    import html  # local import keeps the block self-contained

    link = row['link']
    if pd.isna(link):
        link = row['link'] = ''
    if link != '':
        href = html.escape(str(link), quote=True)
        label = html.escape(str(row['model']))
        row['model'] = f'<a href="{href}">{label}</a>'
    return row
|
113 |
|
114 |
# dataframe_full
|
115 |
df_full_rs = df_rs.copy()
|
116 |
+
df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
|
117 |
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
|
118 |
|
119 |
df_full_rs = df_full_rs.drop(columns=['turn']) # Merge turn 1/2 scores per model
|
|
|
121 |
df_full_rs = df_full_rs.round(2)
|
122 |
df_full_rs.replace("", np.nan, inplace=True)
|
123 |
|
124 |
+
df_full_rs['Ko-Bench/openai'] = '' # Add the Ko-Bench/openai and Ko-Bench/keval columns
|
125 |
+
df_full_rs['Ko-Bench/keval'] = ''
|
126 |
for idx, j_model in df_full_rs['judge_model'].items():
|
127 |
if j_model == 'keval':
|
128 |
+
df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
|
129 |
else :
|
130 |
+
df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
|
131 |
df_full_rs = df_full_rs.drop(columns=['judge_model'])
|
132 |
|
133 |
+
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # Merge the Ko-Bench/openai and Ko-Bench/keval rows
|
134 |
df_full_rs = df_full_rs.round(2)
|
135 |
df_full_rs.replace("", np.nan, inplace=True)
|
136 |
|
|
|
139 |
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
|
140 |
|
141 |
df_full_rs['Organization'] = '' # Add the Organization column
|
142 |
+
df_full_rs['Organization'] = df_full_rs.apply(get_organization, axis=1 )
|
143 |
|
144 |
df_full_rs['License'] = '' # Add the License column
|
145 |
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
|
146 |
|
147 |
+
df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
|
148 |
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
|
|
|
149 |
|
150 |
plot_models = df_full_rs['model'].unique() # List of models for the model detail view
|
151 |
|
152 |
+
df_full_rs = df_full_rs.apply(get_link, axis=1)
|
153 |
+
df_full_rs = df_full_rs.apply(add_link, axis=1)
|
154 |
+
|
155 |
+
df_full_rs = df_full_rs.drop(columns=['Ko-Bench', 'link', 'organization'])
|
156 |
|
157 |
# dataframe
|
158 |
df_rs['MT-Bench'] = '' # Add the MT-Bench column
|
|
|
170 |
df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
|
171 |
df_openai = df_openai.round(2)
|
172 |
|
173 |
+
df_openai = df_openai.apply(get_link, axis=1)
|
174 |
+
df_openai = df_openai.apply(add_link, axis=1)
|
175 |
+
df_openai = df_openai.drop(columns=['link', 'organization'])
|
176 |
+
|
177 |
df_openai = df_openai.sort_values(by='score', ascending=False)
|
178 |
df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
|
179 |
|
|
|
186 |
df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
|
187 |
df_keval = df_keval.round(2)
|
188 |
|
189 |
+
df_keval = df_keval.apply(get_link, axis=1)
|
190 |
+
df_keval = df_keval.apply(add_link, axis=1)
|
191 |
+
df_keval = df_keval.drop(columns=['link', 'organization'])
|
192 |
+
|
193 |
df_keval = df_keval.sort_values(by='score', ascending=False)
|
194 |
df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
|
195 |
|
|
|
269 |
return fig
|
270 |
|
271 |
def search_openai_plot(dropdown_model): # Define the openai plot function
|
272 |
+
openai_top_model = df_openai.iat[0, df_openai.columns.get_loc('model')]
|
273 |
+
openai_top_model = BeautifulSoup(openai_top_model, 'html.parser').get_text()
|
274 |
+
|
275 |
+
condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == openai_top_model)
|
276 |
top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
|
277 |
|
278 |
+
condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == openai_top_model)
|
279 |
top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
|
280 |
|
281 |
condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
|
|
|
285 |
openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
|
286 |
|
287 |
category_labels = []
|
288 |
+
category_labels.append(openai_top_model + " /Turn 1")
|
289 |
+
category_labels.append(openai_top_model + " /Turn 2")
|
290 |
category_labels.append(dropdown_model + " /Turn 1")
|
291 |
category_labels.append(dropdown_model + " /Turn 2")
|
292 |
|
|
|
294 |
return fig
|
295 |
|
296 |
def search_keval_plot(dropdown_model): # Define the keval plot function
|
297 |
+
keval_top_model = df_keval.iat[0, df_keval.columns.get_loc('model')]
|
298 |
+
keval_top_model = BeautifulSoup(keval_top_model, 'html.parser').get_text()
|
299 |
+
|
300 |
+
condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == keval_top_model)
|
301 |
top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()
|
302 |
|
303 |
+
condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == keval_top_model)
|
304 |
top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()
|
305 |
|
306 |
condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
|
|
|
310 |
keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
|
311 |
|
312 |
category_labels = []
|
313 |
+
category_labels.append(keval_top_model + " /Turn 1")
|
314 |
+
category_labels.append(keval_top_model + " /Turn 2")
|
315 |
category_labels.append(dropdown_model + " /Turn 1")
|
316 |
category_labels.append(dropdown_model + " /Turn 2")
|
317 |
|
|
|
319 |
return fig
|
320 |
|
321 |
|
322 |
+
# average
|
323 |
+
def plot_average():
    """Build the line chart comparing the two averaged Ko-Bench judge scores.

    Reads the module-level ``df_full_rs`` and plots, per ``'model'`` entry,
    the ``'Ko-Bench/openai'`` column (dashed line, x markers) and the
    ``'Ko-Bench/keval'`` column (solid line, circle markers). The y axis is
    fixed to 0-10 and the legend trace order is reversed.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    palette = px.colors.qualitative.Set2
    scores = df_full_rs
    figure = go.Figure()

    # (score column, legend label, line style, marker style) per judge.
    trace_specs = (
        ('Ko-Bench/openai', 'gpt-4o(Average)',
         dict(color=palette[0], dash='dash'), dict(symbol='x', size=10)),
        ('Ko-Bench/keval', 'keval(Average)',
         dict(color=palette[1]), dict(symbol='circle', size=10)),
    )
    for column, label, line_style, marker_style in trace_specs:
        figure.add_trace(
            go.Scatter(
                x=scores['model'],
                y=scores[column],
                mode='lines+markers',
                name=label,
                line=line_style,
                marker=marker_style,
            )
        )

    figure.update_layout(
        title='Comparison of OpenAI ko_bench and keval ko_bench (Average)',
        xaxis_title='Model',
        yaxis_title='Score',
        legend_title='Metric',
        hovermode='x unified',
        template='plotly_white',
    )
    figure.update_yaxes(range=[0, 10])
    figure.update_layout(legend_traceorder="reversed")
    return figure
|
351 |
+
|
352 |
+
|
353 |
#gradio
|
354 |
+
with gr.Blocks(css='assets/leaderboard.css') as demo:
|
355 |
gr.Markdown("")
|
356 |
+
gr.Markdown("# ๐ Ko-Bench Leaderboard")
|
357 |
gr.Markdown("")
|
358 |
+
gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
|
|
|
359 |
gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
|
360 |
+
gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
|
361 |
+
gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
|
|
|
362 |
gr.Markdown("")
|
363 |
+
gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
|
364 |
+
gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
|
365 |
gr.Markdown("")
|
366 |
|
367 |
+
with gr.Row():
|
368 |
+
with gr.TabItem("Ko-Bench"):
|
369 |
+
gr.Dataframe(value=df_full_rs,
|
370 |
+
datatype=['html' if col == 'model' else 'markdown' for col in df_full_rs.columns])
|
|
|
|
|
|
|
|
|
371 |
with gr.Row():
|
372 |
+
with gr.TabItem("Average"):
|
373 |
+
gr.Plot(plot_average)
|
374 |
+
with gr.TabItem("Openai Judgment"):
|
375 |
+
gr.Dataframe(value=df_openai,
|
376 |
+
datatype=['html' if col == 'model' else 'markdown' for col in df_openai.columns])
|
377 |
+
with gr.TabItem("Keval Judgment"):
|
378 |
+
gr.Dataframe(value=df_keval,
|
379 |
+
datatype=['html' if col == 'model' else 'markdown' for col in df_keval.columns])
|
380 |
+
with gr.TabItem("Model Detail View"):
|
381 |
+
with gr.Blocks():
|
382 |
+
with gr.Row():
|
383 |
+
dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
|
384 |
+
with gr.Row():
|
385 |
+
dataframe = gr.Dataframe(label="Model Detail View")
|
386 |
+
dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
|
387 |
+
with gr.Row():
|
388 |
+
plot_openai = gr.Plot(label="Openai Plot")
|
389 |
+
dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
|
390 |
+
plot_keval = gr.Plot(label="Keval Plot")
|
391 |
+
dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
|
392 |
+
|
393 |
+
|
394 |
+
|
395 |
+
demo.launch(share=True, server_name="0.0.0.0", debug=True)
|
ko_bench.csv
CHANGED
@@ -1,89 +1,93 @@
|
|
1 |
judge_model,turn,model,score,Coding,Extraction,Humanities,Math,Reasoning,Roleplay,STEM,Writing
|
2 |
-
gpt-4o,1,
|
3 |
-
gpt-4o,1,
|
4 |
-
gpt-4o,1,
|
5 |
-
gpt-4o,1,
|
6 |
-
gpt-4o,1,
|
7 |
-
gpt-4o,1,
|
8 |
-
gpt-4o,1,
|
9 |
-
gpt-4o,1,
|
10 |
-
gpt-4o,1,
|
11 |
-
gpt-4o,1,gemma-2-9b-it,7.
|
12 |
-
gpt-4o,1,
|
13 |
-
gpt-4o,1,
|
14 |
-
gpt-4o,1,
|
15 |
-
gpt-4o,1,
|
16 |
-
gpt-4o,1,
|
17 |
-
gpt-4o,1,
|
18 |
-
gpt-4o,1,
|
19 |
-
gpt-4o,1,
|
20 |
-
gpt-4o,1,
|
21 |
-
gpt-4o,1,
|
22 |
-
gpt-4o,1,
|
23 |
-
gpt-4o,1,
|
24 |
-
gpt-4o,
|
25 |
-
gpt-4o,2,
|
26 |
-
gpt-4o,2,
|
27 |
-
gpt-4o,2,
|
28 |
-
gpt-4o,2,
|
29 |
-
gpt-4o,2,
|
30 |
-
gpt-4o,2,
|
31 |
-
gpt-4o,2,
|
32 |
-
gpt-4o,2,
|
33 |
-
gpt-4o,2,
|
34 |
-
gpt-4o,2,gemma-2-9b-it,6.
|
35 |
-
gpt-4o,2,
|
36 |
-
gpt-4o,2,
|
37 |
-
gpt-4o,2,
|
38 |
-
gpt-4o,2,
|
39 |
-
gpt-4o,2,
|
40 |
-
gpt-4o,2,
|
41 |
-
gpt-4o,2,
|
42 |
-
gpt-4o,2,
|
43 |
-
gpt-4o,2,
|
44 |
-
gpt-4o,2,
|
45 |
-
gpt-4o,2,
|
46 |
-
|
47 |
-
|
48 |
-
keval,1,
|
49 |
-
keval,1,
|
50 |
-
keval,1,
|
51 |
-
keval,1,
|
52 |
-
keval,1,
|
53 |
-
keval,1,
|
54 |
-
keval,1,
|
55 |
-
keval,1,
|
56 |
-
keval,1,
|
57 |
-
keval,1,
|
58 |
-
keval,1,
|
59 |
-
keval,1,
|
60 |
-
keval,1,
|
61 |
-
keval,1,
|
62 |
-
keval,1,
|
63 |
-
keval,1,
|
64 |
-
keval,1,
|
65 |
-
keval,1,
|
66 |
-
keval,1,
|
67 |
-
keval,1,
|
68 |
-
keval,
|
69 |
-
keval,
|
70 |
-
keval,
|
71 |
-
keval,2,
|
72 |
-
keval,2,
|
73 |
-
keval,2,
|
74 |
-
keval,2,
|
75 |
-
keval,2,
|
76 |
-
keval,2,
|
77 |
-
keval,2,
|
78 |
-
keval,2,
|
79 |
-
keval,2,
|
80 |
-
keval,2,
|
81 |
-
keval,2,
|
82 |
-
keval,2,
|
83 |
-
keval,2,
|
84 |
-
keval,2,
|
85 |
-
keval,2,
|
86 |
-
keval,2,
|
87 |
-
keval,2,
|
88 |
-
keval,2,
|
89 |
-
keval,2,
|
|
|
|
|
|
|
|
|
|
1 |
judge_model,turn,model,score,Coding,Extraction,Humanities,Math,Reasoning,Roleplay,STEM,Writing
|
2 |
+
gpt-4o,1,openai__GPT-4o-2024-05-13,9.4,8.7,9.6,9.6,9.9,9.0,9.2,9.7,9.3
|
3 |
+
gpt-4o,1,Anthropic__claude-3-5-sonnet-20240620,9.0,6.7,9.5,9.2,9.6,9.3,8.7,9.8,9.0
|
4 |
+
gpt-4o,1,openai__gpt-4-0125-preview,8.9,7.7,9.8,9.1,9.7,7.8,9.2,8.7,9.4
|
5 |
+
gpt-4o,1,openai__GPT-4o-mini-2024-07-18,8.8,7.3,9.2,9.4,10.0,6.9,8.7,9.6,9.1
|
6 |
+
gpt-4o,1,Anthropic__claude-3-opus-20240229,8.6,8.1,9.7,9.3,8.7,5.8,8.2,9.4,9.5
|
7 |
+
gpt-4o,1,mistralai__Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
|
8 |
+
gpt-4o,1,Qwen__Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
|
9 |
+
gpt-4o,1,google__gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
|
10 |
+
gpt-4o,1,google__gemini-1.5-pro,8.2,5.5,9.7,8.7,7.5,6.5,9.1,9.4,9.2
|
11 |
+
gpt-4o,1,davidkim205__ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
|
12 |
+
gpt-4o,1,google__gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
|
13 |
+
gpt-4o,1,alpindale__WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
|
14 |
+
gpt-4o,1,openai__gpt-3.5-turbo-0125,6.7,5.2,9.0,7.7,6.4,3.3,7.2,6.5,8.6
|
15 |
+
gpt-4o,1,meta-llama__Meta-Llama-3.1-70B-Instruct,6.6,6.4,8.7,8.0,4.5,4.0,7.9,7.4,5.9
|
16 |
+
gpt-4o,1,Qwen__Qwen2-7B-Instruct,6.5,3.9,9.0,8.0,5.6,3.6,7.0,6.6,8.2
|
17 |
+
gpt-4o,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,4.9,7.4,7.1,7.3,5.1,6.4,4.1,7.6
|
18 |
+
gpt-4o,1,Qwen__Qwen1.5-32B-Chat,6.1,4.0,8.6,8.5,4.7,2.6,6.3,7.5,6.7
|
19 |
+
gpt-4o,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.8,3.5,5.0,8.5,5.4,3.2,5.4,7.5,7.6
|
20 |
+
gpt-4o,1,davidkim205__Ko-Llama-3-8B-Instruct,5.7,4.6,7.0,7.7,2.8,2.5,6.2,6.9,7.6
|
21 |
+
gpt-4o,1,meta-llama__Meta-Llama-3.1-8B-Instruct,5.4,4.6,7.4,6.3,5.2,3.3,5.2,5.4,6.0
|
22 |
+
gpt-4o,1,Qwen__Qwen1.5-14B-Chat,5.4,3.3,7.2,6.8,4.2,2.0,5.7,6.7,7.2
|
23 |
+
gpt-4o,1,WizardLMTeam__WizardLM-13B-V1.2,4.8,3.4,8.2,6.1,2.2,3.4,5.0,4.3,6.1
|
24 |
+
gpt-4o,1,mistralai__Mistral-7B-Instruct-v0.2,2.6,3.0,3.7,2.0,1.7,1.3,4.5,1.4,3.1
|
25 |
+
gpt-4o,2,openai__GPT-4o-2024-05-13,8.3,7.9,8.9,9.2,8.1,7.0,8.9,8.7,7.5
|
26 |
+
gpt-4o,2,openai__gpt-4-0125-preview,8.0,7.2,8.5,8.9,6.8,7.3,8.7,8.1,8.6
|
27 |
+
gpt-4o,2,Anthropic__claude-3-5-sonnet-20240620,7.9,6.9,9.1,9.0,6.4,6.9,8.1,8.2,8.4
|
28 |
+
gpt-4o,2,openai__GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
|
29 |
+
gpt-4o,2,mistralai__Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
|
30 |
+
gpt-4o,2,google__gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
|
31 |
+
gpt-4o,2,google__gemini-1.5-pro,7.0,6.3,7.7,8.3,6.1,5.0,8.5,7.8,6.5
|
32 |
+
gpt-4o,2,Anthropic__claude-3-opus-20240229,6.9,6.0,9.0,7.3,6.2,5.8,7.3,6.5,7.5
|
33 |
+
gpt-4o,2,Qwen__Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
|
34 |
+
gpt-4o,2,davidkim205__ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
|
35 |
+
gpt-4o,2,alpindale__WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
|
36 |
+
gpt-4o,2,google__gemma-2-9b-it,6.2,4.8,7.6,8.3,4.9,3.9,7.0,7.4,6.1
|
37 |
+
gpt-4o,2,Qwen__Qwen1.5-32B-Chat,5.8,4.3,8.2,7.6,3.8,3.0,6.8,5.9,6.9
|
38 |
+
gpt-4o,2,meta-llama__Meta-Llama-3.1-70B-Instruct,5.7,5.5,8.0,7.4,3.6,2.9,6.6,5.7,5.7
|
39 |
+
gpt-4o,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,5.6,5.8,6.2,5.5,7.0,4.0,5.7,4.3,6.5
|
40 |
+
gpt-4o,2,openai__gpt-3.5-turbo-0125,5.4,5.8,5.7,7.2,4.4,3.0,6.6,4.4,6.4
|
41 |
+
gpt-4o,2,Qwen__Qwen2-7B-Instruct,5.3,5.0,7.0,6.6,5.1,2.7,5.6,4.8,5.9
|
42 |
+
gpt-4o,2,Qwen__Qwen1.5-14B-Chat,4.9,3.5,5.1,7.4,4.1,2.7,5.9,5.0,5.9
|
43 |
+
gpt-4o,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,4.5,3.3,3.8,7.6,4.9,2.1,5.6,5.7,3.3
|
44 |
+
gpt-4o,2,mistralai__Mistral-7B-Instruct-v0.2,4.5,3.9,4.4,6.8,2.2,2.4,6.2,5.6,4.6
|
45 |
+
gpt-4o,2,davidkim205__Ko-Llama-3-8B-Instruct,4.0,3.7,4.3,6.4,2.8,2.3,4.9,4.0,4.1
|
46 |
+
gpt-4o,2,meta-llama__Meta-Llama-3.1-8B-Instruct,3.9,4.1,5.0,4.8,3.8,2.1,4.0,3.5,3.6
|
47 |
+
gpt-4o,2,WizardLMTeam__WizardLM-13B-V1.2,3.0,2.6,3.5,3.6,1.8,2.3,3.7,3.3,2.8
|
48 |
+
keval,1,openai__GPT-4o-2024-05-13,9.1,7.8,9.5,9.6,9.9,8.8,8.7,9.3,9.2
|
49 |
+
keval,1,Anthropic__claude-3-5-sonnet-20240620,9.0,7.2,9.8,9.2,9.3,9.2,8.9,9.4,9.0
|
50 |
+
keval,1,openai__gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
|
51 |
+
keval,1,openai__GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
|
52 |
+
keval,1,Anthropic__claude-3-opus-20240229,8.4,8.1,9.8,8.7,8.3,5.8,7.9,9.2,9.0
|
53 |
+
keval,1,mistralai__Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
|
54 |
+
keval,1,google__gemini-1.5-pro,8.2,5.7,9.8,8.8,7.4,6.2,9.1,9.7,9.0
|
55 |
+
keval,1,google__gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
|
56 |
+
keval,1,Qwen__Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
|
57 |
+
keval,1,davidkim205__ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
|
58 |
+
keval,1,google__gemma-2-9b-it,7.6,6.7,8.8,8.5,5.2,5.5,9.0,8.6,8.5
|
59 |
+
keval,1,meta-llama__Meta-Llama-3.1-70B-Instruct,7.3,6.8,9.0,8.3,5.9,5.1,8.4,8.0,7.1
|
60 |
+
keval,1,Qwen__Qwen1.5-14B-Chat,7.2,4.7,9.7,8.8,4.5,4.8,8.1,8.9,8.4
|
61 |
+
keval,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,7.2,5.7,8.8,8.1,8.2,6.0,7.7,5.6,7.3
|
62 |
+
keval,1,alpindale__WizardLM-2-8x22B,7.1,6.1,5.6,7.9,8.8,5.9,6.5,8.7,7.1
|
63 |
+
keval,1,Qwen__Qwen1.5-32B-Chat,7.0,3.9,9.9,8.9,5.8,3.6,7.1,8.6,7.9
|
64 |
+
keval,1,openai__gpt-3.5-turbo-0125,6.9,5.6,8.9,7.7,6.4,3.2,7.4,7.5,8.6
|
65 |
+
keval,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,6.8,3.4,8.6,8.5,5.5,4.1,6.9,8.8,8.4
|
66 |
+
keval,1,Qwen__Qwen2-7B-Instruct,6.4,3.6,9.0,7.7,5.5,3.5,7.1,6.7,8.4
|
67 |
+
keval,1,meta-llama__Meta-Llama-3.1-8B-Instruct,6.3,4.3,8.9,7.7,5.3,3.3,7.3,6.0,7.5
|
68 |
+
keval,1,davidkim205__Ko-Llama-3-8B-Instruct,6.0,5.0,7.4,7.6,2.9,2.9,7.0,8.0,7.6
|
69 |
+
keval,1,WizardLMTeam__WizardLM-13B-V1.2,6.0,3.7,9.3,7.7,2.4,3.8,7.0,6.6,7.7
|
70 |
+
keval,1,mistralai__Mistral-7B-Instruct-v0.2,3.0,3.0,6.7,3.0,2.0,2.0,3.3,1.9,2.4
|
71 |
+
keval,2,openai__GPT-4o-2024-05-13,8.1,7.7,8.9,9.2,7.8,6.9,8.4,8.7,7.4
|
72 |
+
keval,2,openai__gpt-4-0125-preview,7.7,6.3,8.4,8.8,6.9,6.3,8.6,8.6,8.0
|
73 |
+
keval,2,openai__GPT-4o-mini-2024-07-18,7.4,6.8,7.6,8.7,7.7,4.3,7.8,8.4,7.8
|
74 |
+
keval,2,Anthropic__claude-3-5-sonnet-20240620,7.3,6.6,7.6,9.0,6.6,5.7,7.6,8.1,7.1
|
75 |
+
keval,2,mistralai__Mistral-Large-Instruct-2407,7.0,5.4,7.3,8.5,7.3,5.2,7.9,7.8,6.9
|
76 |
+
keval,2,Qwen__Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
|
77 |
+
keval,2,google__gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
|
78 |
+
keval,2,Anthropic__claude-3-opus-20240229,6.8,6.2,8.4,7.8,5.4,5.1,7.0,7.3,7.5
|
79 |
+
keval,2,alpindale__WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
|
80 |
+
keval,2,google__gemini-1.5-pro,6.5,5.2,6.9,8.4,6.0,4.8,8.1,7.3,5.4
|
81 |
+
keval,2,davidkim205__ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
|
82 |
+
keval,2,google__gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
|
83 |
+
keval,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
|
84 |
+
keval,2,Qwen__Qwen1.5-32B-Chat,6.2,5.2,7.7,8.0,4.1,4.0,7.7,6.7,6.5
|
85 |
+
keval,2,Qwen__Qwen1.5-14B-Chat,6.0,4.7,6.9,7.9,4.8,3.8,7.2,6.3,6.7
|
86 |
+
keval,2,meta-llama__Meta-Llama-3.1-70B-Instruct,6.0,6.0,7.3,7.6,5.6,2.9,7.0,6.2,5.6
|
87 |
+
keval,2,Qwen__Qwen2-7B-Instruct,5.6,4.9,7.0,6.5,5.1,3.1,6.3,5.0,6.5
|
88 |
+
keval,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.5,4.6,4.9,6.7,5.9,3.2,6.9,6.8,5.2
|
89 |
+
keval,2,openai__gpt-3.5-turbo-0125,5.3,6.2,5.5,7.0,4.5,3.3,6.2,4.5,5.4
|
90 |
+
keval,2,meta-llama__Meta-Llama-3.1-8B-Instruct,4.8,5.0,6.0,5.5,4.4,2.6,5.9,5.0,4.4
|
91 |
+
keval,2,davidkim205__Ko-Llama-3-8B-Instruct,4.2,3.6,4.6,6.3,2.8,2.2,6.1,3.7,4.3
|
92 |
+
keval,2,WizardLMTeam__WizardLM-13B-V1.2,4.1,3.7,5.4,5.8,2.8,3.0,5.6,3.3,3.4
|
93 |
+
keval,2,mistralai__Mistral-7B-Instruct-v0.2,4.1,3.5,6.1,6.3,2.6,2.2,3.5,3.2,5.5
|