linjunyao
committed on
Commit
•
80c5be6
1
Parent(s):
b7d59b5
updated dataset; changed tab and cell formats
Browse files
- app.py +6 -6
- data/CJ-1-32B.csv +11 -0
- data/CJ-1-7B.csv +11 -0
- data/detail_a_cn.csv +0 -11
- data/detail_a_en.csv +0 -11
- data/detail_b_acc.csv +0 -9
- data/detail_b_corr.csv +0 -9
- data/overall.csv +0 -11
- judgerbench/meta_data.py +8 -9
- start_gradio_web_server.sh +1 -1
app.py
CHANGED
@@ -65,10 +65,10 @@ with gr.Blocks() as demo:
|
|
65 |
for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
|
66 |
|
67 |
tab_name = filename
|
68 |
-
if filename == "overall":
|
69 |
-
|
70 |
|
71 |
-
with gr.Tab(tab_name, elem_id=f'tab_{cur_id}', id=cur_id):
|
72 |
|
73 |
# gr.Markdown(LEADERBOARD_MD['MAIN'])
|
74 |
# _, check_box = build_l1_df(MAIN_FIELDS)
|
@@ -128,10 +128,10 @@ with gr.Blocks() as demo:
|
|
128 |
|
129 |
table_styler = (
|
130 |
table.style.apply(cell_styler, axis=None)
|
131 |
-
.format(precision=
|
132 |
)
|
133 |
else:
|
134 |
-
table_styler = table.style.format(
|
135 |
|
136 |
# with gr.Row():
|
137 |
# model_size = gr.CheckboxGroup(
|
@@ -291,7 +291,7 @@ if __name__ == '__main__':
|
|
291 |
|
292 |
parser = argparse.ArgumentParser()
|
293 |
parser.add_argument("--host", type=str, default="0.0.0.0")
|
294 |
-
parser.add_argument("--port", type=int)
|
295 |
parser.add_argument(
|
296 |
"--share",
|
297 |
action="store_true",
|
|
|
65 |
for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
|
66 |
|
67 |
tab_name = filename
|
68 |
+
# if filename == "overall":
|
69 |
+
# tab_name = '🏅 JudgerBench Main Leaderboard'
|
70 |
|
71 |
+
with gr.Tab(tab_name.upper(), elem_id=f'tab_{cur_id}', id=cur_id):
|
72 |
|
73 |
# gr.Markdown(LEADERBOARD_MD['MAIN'])
|
74 |
# _, check_box = build_l1_df(MAIN_FIELDS)
|
|
|
128 |
|
129 |
table_styler = (
|
130 |
table.style.apply(cell_styler, axis=None)
|
131 |
+
.format(precision=1)
|
132 |
)
|
133 |
else:
|
134 |
+
table_styler = table.style.format(precision=1)
|
135 |
|
136 |
# with gr.Row():
|
137 |
# model_size = gr.CheckboxGroup(
|
|
|
291 |
|
292 |
parser = argparse.ArgumentParser()
|
293 |
parser.add_argument("--host", type=str, default="0.0.0.0")
|
294 |
+
parser.add_argument("--port", type=int, default="7860")
|
295 |
parser.add_argument(
|
296 |
"--share",
|
297 |
action="store_true",
|
data/CJ-1-32B.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Models,Alignbench,ArenaHard,Fofo_en,Fofo_cn,Wildbench,Average
|
2 |
+
qwen2.5-72b-instruct-turbomind,70,84.28,64,70,39.71333333,65.59866667
|
3 |
+
qwen2.5-32b-instruct-turbomind,68.8,78.02,57,64,23.25,58.214
|
4 |
+
qwen2.5-14b-instruct-turbomind,68,71.25,53,56,22.71666667,54.19333333
|
5 |
+
qwen2.5-7b-instruct-turbomind,61.8,57.36,48,46,16.77333333,45.98666667
|
6 |
+
llama-3_1-70b-instruct-turbomind,54.6,60.71,55,44,12.50333333,45.36266667
|
7 |
+
internlm2_5-20b-chat-turbomind,63,31.62,41,53,5.703333333,38.86466667
|
8 |
+
llama-3_1-8b-instruct-turbomind,47.7,33.88,41,34,-3.286666667,30.65866667
|
9 |
+
internlm2_5-7b-chat-turbomind,57.5,17.16,38,45,-13.33,28.866
|
10 |
+
qwen2.5-3b-instruct-turbomind,55.6,31.44,31,30,-8.953333333,27.81733333
|
11 |
+
qwen2.5-1.5b-instruct-turbomind,46.8,11.37,13,14,-54.71,6.092
|
data/CJ-1-7B.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Models,Alignbench,ArenaHard,Fofo_en,Fofo_cn,Wildbench,Average
|
2 |
+
qwen2.5-72b-instruct-turbomind,67.2,81.83,50,58,35.74333333,58.55466667
|
3 |
+
qwen2.5-32b-instruct-turbomind,65.4,71.63,45,55,19.06,51.218
|
4 |
+
qwen2.5-14b-instruct-turbomind,64.8,71.71,44,48,19.62666667,49.62733333
|
5 |
+
qwen2.5-7b-instruct-turbomind,60.8,60.93,40,46,18.57333333,45.26066667
|
6 |
+
llama-3_1-70b-instruct-turbomind,53.2,59.24,45,38,9.636666667,41.01533333
|
7 |
+
internlm2_5-20b-chat-turbomind,60.5,33.58,41,49,5.103333333,37.83666667
|
8 |
+
llama-3_1-8b-instruct-turbomind,47.2,38.76,36,31,-1.173333333,30.35733333
|
9 |
+
qwen2.5-3b-instruct-turbomind,55.2,38.12,30,30,-3.023333333,30.05933333
|
10 |
+
internlm2_5-7b-chat-turbomind,57.2,17.9,34,43,-12.95333333,27.82933333
|
11 |
+
qwen2.5-1.5b-instruct-turbomind,47,14.58,16,14,-46.41,9.034
|
data/detail_a_cn.csv
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
|
2 |
-
CJ-1-14B,0.69,0.61,0.51,0.55,0.71,0.68,0.6,0.58,0.61,0.65,0.619,Judge
|
3 |
-
GPT-4o-0806,0.77,0.56,0.51,0.53,0.67,0.66,0.63,0.58,0.62,0.58,0.611,API
|
4 |
-
CJ-1-32B,0.69,0.58,0.53,0.52,0.71,0.53,0.6,0.61,0.61,0.69,0.607,Judge
|
5 |
-
Skywork-llama3.1-8B,0.62,0.58,0.58,0.59,0.63,0.58,0.6,0.61,0.6,0.61,0.6,Judge
|
6 |
-
Qwen2.5-72B-Chat,0.65,0.47,0.49,0.47,0.71,0.6,0.57,0.58,0.69,0.6,0.583,General
|
7 |
-
CJ-1-7B,0.62,0.54,0.41,0.58,0.7,0.6,0.59,0.56,0.59,0.6,0.579,Judge
|
8 |
-
Qwen2-72B-Chat,0.62,0.54,0.34,0.55,0.68,0.63,0.58,0.58,0.62,0.64,0.578,General
|
9 |
-
Selftaught-llama3.1-70B,0.62,0.56,0.55,0.48,0.67,0.55,0.57,0.57,0.51,0.61,0.569,Judge
|
10 |
-
Qwen2.5-7B-Chat,0.46,0.58,0.36,0.45,0.7,0.53,0.52,0.53,0.52,0.64,0.529,General
|
11 |
-
CJ-1-1.5B,0.54,0.58,0.38,0.38,0.62,0.63,0.54,0.52,0.55,0.54,0.528,Judge
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/detail_a_en.csv
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
|
2 |
-
GPT-4o-0806,0.82,0.53,0.62,0.61,0.83,0.67,0.67,0.73,0.64,0.55,0.667,API
|
3 |
-
Skywork-llama3.1-8B,0.69,0.61,0.54,0.62,0.63,0.64,0.6,0.69,0.74,0.53,0.629,Judge
|
4 |
-
Qwen2.5-72B-Chat,0.68,0.57,0.57,0.47,0.78,0.64,0.58,0.75,0.61,0.52,0.617,General
|
5 |
-
CJ-1-32B,0.66,0.57,0.56,0.59,0.78,0.58,0.55,0.75,0.6,0.49,0.613,Judge
|
6 |
-
CJ-1-14B,0.66,0.51,0.57,0.54,0.72,0.61,0.56,0.74,0.61,0.47,0.599,Judge
|
7 |
-
Qwen2-72B-Chat,0.63,0.59,0.54,0.49,0.62,0.64,0.6,0.74,0.51,0.52,0.588,General
|
8 |
-
CJ-1-7B,0.56,0.56,0.51,0.47,0.68,0.58,0.58,0.75,0.58,0.43,0.57,Judge
|
9 |
-
Qwen2.5-7B-Chat,0.54,0.59,0.59,0.46,0.69,0.43,0.61,0.65,0.58,0.52,0.566,General
|
10 |
-
CJ-1-1.5B,0.42,0.56,0.56,0.43,0.66,0.47,0.55,0.78,0.64,0.44,0.551,Judge
|
11 |
-
Selftaught-llama3.1-70B,0.47,0.45,0.47,0.37,0.45,0.43,0.36,0.58,0.48,0.36,0.442,Judge
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/detail_b_acc.csv
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
|
2 |
-
CJ-1-32B,0.857,0.806,0.596,0.621,0.72,Judge
|
3 |
-
CJ-1-14B,0.839,0.787,0.566,0.602,0.699,Judge
|
4 |
-
CJ-1-7B,0.816,0.783,0.564,0.586,0.687,Judge
|
5 |
-
Qwen2.5-72B-Chat,0.878,0.677,0.599,0.57,0.681,General
|
6 |
-
CJ-1-1.5B,0.822,0.712,0.55,0.43,0.629,Judge
|
7 |
-
Qwen2-72B-Chat,0.867,0.692,0.564,0.376,0.625,General
|
8 |
-
Selftaught-llama3.1-70B,0.755,0.627,0.538,0.472,0.598,Judge
|
9 |
-
Qwen2.5-7B-Chat,0.777,0.67,0.47,0.444,0.59,General
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/detail_b_corr.csv
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
|
2 |
-
CJ-1-32B,0.973,0.951,0.954,0.975,0.963,Judge
|
3 |
-
CJ-1-14B,0.966,0.956,0.965,0.951,0.959,Judge
|
4 |
-
CJ-1-7B,0.956,0.936,0.97,0.932,0.948,Judge
|
5 |
-
Qwen2.5-72B-Chat,0.964,0.916,0.958,0.912,0.937,General
|
6 |
-
Qwen2-72B-Chat,0.937,0.889,0.976,0.936,0.935,General
|
7 |
-
CJ-1-1.5B,0.928,0.851,0.981,0.858,0.905,Judge
|
8 |
-
Qwen2.5-7B-Chat,0.916,0.681,0.967,0.931,0.874,General
|
9 |
-
Selftaught-llama3.1-70B,0.918,0.667,0.95,0.942,0.869,Judge
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/overall.csv
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
Models,JDB-A EN,JDB-A CN,JDB-B Acc,JDB-B Corr,JudgerBench,Class
|
2 |
-
GPT-4o-0806,0.664,0.608,1,1,0.818,API
|
3 |
-
CJ-1-32B,0.614,0.612,0.72,0.963,0.727,Judge
|
4 |
-
CJ-1-14B,0.599,0.615,0.699,0.959,0.718,Judge
|
5 |
-
Qwen2.5-72B-Chat,0.615,0.59,0.681,0.937,0.706,General
|
6 |
-
CJ-1-7B,0.57,0.583,0.687,0.948,0.697,Judge
|
7 |
-
Qwen2-72B-Chat,0.588,0.584,0.625,0.935,0.683,General
|
8 |
-
CJ-1-1.5B,0.553,0.527,0.629,0.905,0.654,Judge
|
9 |
-
Qwen2.5-7B-Chat,0.567,0.535,0.59,0.874,0.641,General
|
10 |
-
Selftaught-llama3.1-70B,0.443,0.57,0.598,0.869,0.62,Judge
|
11 |
-
Skywork-llama3.1-8B,0.63,0.605,-,-,-,Judge
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
judgerbench/meta_data.py
CHANGED
@@ -6,13 +6,10 @@ import os
|
|
6 |
REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
|
7 |
DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
|
8 |
|
9 |
-
LEADERBOARD_FILE_MAPPING =
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
detail_b_acc="detail_b_acc.csv",
|
14 |
-
detail_b_corr="detail_b_corr.csv",
|
15 |
-
)
|
16 |
|
17 |
STYLE_CLASS_MAPPING = {
|
18 |
"API": '#82e0aa',
|
@@ -33,11 +30,13 @@ CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
|
|
33 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
34 |
|
35 |
# CONSTANTS-TEXT
|
36 |
-
LEADERBORAD_INTRODUCTION = """#
|
37 |
|
38 |
-
### Welcome to the
|
39 |
|
40 |
This leaderboard was last updated: {}.
|
|
|
|
|
41 |
"""
|
42 |
|
43 |
# CONSTANTS-FIELDS
|
|
|
6 |
REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
|
7 |
DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
|
8 |
|
9 |
+
LEADERBOARD_FILE_MAPPING = {
|
10 |
+
"cj-1-7b": "CJ-1-7B.csv",
|
11 |
+
"cj-1-32b": "CJ-1-32B.csv",
|
12 |
+
}
|
|
|
|
|
|
|
13 |
|
14 |
STYLE_CLASS_MAPPING = {
|
15 |
"API": '#82e0aa',
|
|
|
30 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
31 |
|
32 |
# CONSTANTS-TEXT
|
33 |
+
LEADERBORAD_INTRODUCTION = """# CompassJudger Subjective Evaluation Leaderboard
|
34 |
|
35 |
+
### Welcome to the CompassJudger Subjective Evaluation Leaderboard!
|
36 |
|
37 |
This leaderboard was last updated: {}.
|
38 |
+
|
39 |
+
*All results from the corresponding datasets have been normalized to percentages.
|
40 |
"""
|
41 |
|
42 |
# CONSTANTS-FIELDS
|
start_gradio_web_server.sh
CHANGED
@@ -2,7 +2,7 @@ conda activate judgerbench
|
|
2 |
|
3 |
# GRADIO_HOSTNAME=0.0.0.0
|
4 |
GRADIO_HOSTNAME=$(hostname)
|
5 |
-
GRADIO_PORT=
|
6 |
|
7 |
# Set tmp and logs folders for gradio
|
8 |
export TMPDIR="tmp"
|
|
|
2 |
|
3 |
# GRADIO_HOSTNAME=0.0.0.0
|
4 |
GRADIO_HOSTNAME=$(hostname)
|
5 |
+
GRADIO_PORT=7862
|
6 |
|
7 |
# Set tmp and logs folders for gradio
|
8 |
export TMPDIR="tmp"
|