linjunyao committed on
Commit
80c5be6
1 Parent(s): b7d59b5

updated dataset; changed tab and cell formats

Browse files
app.py CHANGED
@@ -65,10 +65,10 @@ with gr.Blocks() as demo:
65
  for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
66
 
67
  tab_name = filename
68
- if filename == "overall":
69
- tab_name = '🏅 JudgerBench Main Leaderboard'
70
 
71
- with gr.Tab(tab_name, elem_id=f'tab_{cur_id}', id=cur_id):
72
 
73
  # gr.Markdown(LEADERBOARD_MD['MAIN'])
74
  # _, check_box = build_l1_df(MAIN_FIELDS)
@@ -128,10 +128,10 @@ with gr.Blocks() as demo:
128
 
129
  table_styler = (
130
  table.style.apply(cell_styler, axis=None)
131
- .format(precision=3)
132
  )
133
  else:
134
- table_styler = table.style.format(prevision=3)
135
 
136
  # with gr.Row():
137
  # model_size = gr.CheckboxGroup(
@@ -291,7 +291,7 @@ if __name__ == '__main__':
291
 
292
  parser = argparse.ArgumentParser()
293
  parser.add_argument("--host", type=str, default="0.0.0.0")
294
- parser.add_argument("--port", type=int)
295
  parser.add_argument(
296
  "--share",
297
  action="store_true",
 
65
  for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
66
 
67
  tab_name = filename
68
+ # if filename == "overall":
69
+ # tab_name = '🏅 JudgerBench Main Leaderboard'
70
 
71
+ with gr.Tab(tab_name.upper(), elem_id=f'tab_{cur_id}', id=cur_id):
72
 
73
  # gr.Markdown(LEADERBOARD_MD['MAIN'])
74
  # _, check_box = build_l1_df(MAIN_FIELDS)
 
128
 
129
  table_styler = (
130
  table.style.apply(cell_styler, axis=None)
131
+ .format(precision=1)
132
  )
133
  else:
134
+ table_styler = table.style.format(precision=1)
135
 
136
  # with gr.Row():
137
  # model_size = gr.CheckboxGroup(
 
291
 
292
  parser = argparse.ArgumentParser()
293
  parser.add_argument("--host", type=str, default="0.0.0.0")
294
+ parser.add_argument("--port", type=int, default="7860")
295
  parser.add_argument(
296
  "--share",
297
  action="store_true",
data/CJ-1-32B.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Models,Alignbench,ArenaHard,Fofo_en,Fofo_cn,Wildbench,Average
2
+ qwen2.5-72b-instruct-turbomind,70,84.28,64,70,39.71333333,65.59866667
3
+ qwen2.5-32b-instruct-turbomind,68.8,78.02,57,64,23.25,58.214
4
+ qwen2.5-14b-instruct-turbomind,68,71.25,53,56,22.71666667,54.19333333
5
+ qwen2.5-7b-instruct-turbomind,61.8,57.36,48,46,16.77333333,45.98666667
6
+ llama-3_1-70b-instruct-turbomind,54.6,60.71,55,44,12.50333333,45.36266667
7
+ internlm2_5-20b-chat-turbomind,63,31.62,41,53,5.703333333,38.86466667
8
+ llama-3_1-8b-instruct-turbomind,47.7,33.88,41,34,-3.286666667,30.65866667
9
+ internlm2_5-7b-chat-turbomind,57.5,17.16,38,45,-13.33,28.866
10
+ qwen2.5-3b-instruct-turbomind,55.6,31.44,31,30,-8.953333333,27.81733333
11
+ qwen2.5-1.5b-instruct-turbomind,46.8,11.37,13,14,-54.71,6.092
data/CJ-1-7B.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Models,Alignbench,ArenaHard,Fofo_en,Fofo_cn,Wildbench,Average
2
+ qwen2.5-72b-instruct-turbomind,67.2,81.83,50,58,35.74333333,58.55466667
3
+ qwen2.5-32b-instruct-turbomind,65.4,71.63,45,55,19.06,51.218
4
+ qwen2.5-14b-instruct-turbomind,64.8,71.71,44,48,19.62666667,49.62733333
5
+ qwen2.5-7b-instruct-turbomind,60.8,60.93,40,46,18.57333333,45.26066667
6
+ llama-3_1-70b-instruct-turbomind,53.2,59.24,45,38,9.636666667,41.01533333
7
+ internlm2_5-20b-chat-turbomind,60.5,33.58,41,49,5.103333333,37.83666667
8
+ llama-3_1-8b-instruct-turbomind,47.2,38.76,36,31,-1.173333333,30.35733333
9
+ qwen2.5-3b-instruct-turbomind,55.2,38.12,30,30,-3.023333333,30.05933333
10
+ internlm2_5-7b-chat-turbomind,57.2,17.9,34,43,-12.95333333,27.82933333
11
+ qwen2.5-1.5b-instruct-turbomind,47,14.58,16,14,-46.41,9.034
data/detail_a_cn.csv DELETED
@@ -1,11 +0,0 @@
1
- Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
2
- CJ-1-14B,0.69,0.61,0.51,0.55,0.71,0.68,0.6,0.58,0.61,0.65,0.619,Judge
3
- GPT-4o-0806,0.77,0.56,0.51,0.53,0.67,0.66,0.63,0.58,0.62,0.58,0.611,API
4
- CJ-1-32B,0.69,0.58,0.53,0.52,0.71,0.53,0.6,0.61,0.61,0.69,0.607,Judge
5
- Skywork-llama3.1-8B,0.62,0.58,0.58,0.59,0.63,0.58,0.6,0.61,0.6,0.61,0.6,Judge
6
- Qwen2.5-72B-Chat,0.65,0.47,0.49,0.47,0.71,0.6,0.57,0.58,0.69,0.6,0.583,General
7
- CJ-1-7B,0.62,0.54,0.41,0.58,0.7,0.6,0.59,0.56,0.59,0.6,0.579,Judge
8
- Qwen2-72B-Chat,0.62,0.54,0.34,0.55,0.68,0.63,0.58,0.58,0.62,0.64,0.578,General
9
- Selftaught-llama3.1-70B,0.62,0.56,0.55,0.48,0.67,0.55,0.57,0.57,0.51,0.61,0.569,Judge
10
- Qwen2.5-7B-Chat,0.46,0.58,0.36,0.45,0.7,0.53,0.52,0.53,0.52,0.64,0.529,General
11
- CJ-1-1.5B,0.54,0.58,0.38,0.38,0.62,0.63,0.54,0.52,0.55,0.54,0.528,Judge
 
 
 
 
 
 
 
 
 
 
 
 
data/detail_a_en.csv DELETED
@@ -1,11 +0,0 @@
1
- Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
2
- GPT-4o-0806,0.82,0.53,0.62,0.61,0.83,0.67,0.67,0.73,0.64,0.55,0.667,API
3
- Skywork-llama3.1-8B,0.69,0.61,0.54,0.62,0.63,0.64,0.6,0.69,0.74,0.53,0.629,Judge
4
- Qwen2.5-72B-Chat,0.68,0.57,0.57,0.47,0.78,0.64,0.58,0.75,0.61,0.52,0.617,General
5
- CJ-1-32B,0.66,0.57,0.56,0.59,0.78,0.58,0.55,0.75,0.6,0.49,0.613,Judge
6
- CJ-1-14B,0.66,0.51,0.57,0.54,0.72,0.61,0.56,0.74,0.61,0.47,0.599,Judge
7
- Qwen2-72B-Chat,0.63,0.59,0.54,0.49,0.62,0.64,0.6,0.74,0.51,0.52,0.588,General
8
- CJ-1-7B,0.56,0.56,0.51,0.47,0.68,0.58,0.58,0.75,0.58,0.43,0.57,Judge
9
- Qwen2.5-7B-Chat,0.54,0.59,0.59,0.46,0.69,0.43,0.61,0.65,0.58,0.52,0.566,General
10
- CJ-1-1.5B,0.42,0.56,0.56,0.43,0.66,0.47,0.55,0.78,0.64,0.44,0.551,Judge
11
- Selftaught-llama3.1-70B,0.47,0.45,0.47,0.37,0.45,0.43,0.36,0.58,0.48,0.36,0.442,Judge
 
 
 
 
 
 
 
 
 
 
 
 
data/detail_b_acc.csv DELETED
@@ -1,9 +0,0 @@
1
- Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
2
- CJ-1-32B,0.857,0.806,0.596,0.621,0.72,Judge
3
- CJ-1-14B,0.839,0.787,0.566,0.602,0.699,Judge
4
- CJ-1-7B,0.816,0.783,0.564,0.586,0.687,Judge
5
- Qwen2.5-72B-Chat,0.878,0.677,0.599,0.57,0.681,General
6
- CJ-1-1.5B,0.822,0.712,0.55,0.43,0.629,Judge
7
- Qwen2-72B-Chat,0.867,0.692,0.564,0.376,0.625,General
8
- Selftaught-llama3.1-70B,0.755,0.627,0.538,0.472,0.598,Judge
9
- Qwen2.5-7B-Chat,0.777,0.67,0.47,0.444,0.59,General
 
 
 
 
 
 
 
 
 
 
data/detail_b_corr.csv DELETED
@@ -1,9 +0,0 @@
1
- Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
2
- CJ-1-32B,0.973,0.951,0.954,0.975,0.963,Judge
3
- CJ-1-14B,0.966,0.956,0.965,0.951,0.959,Judge
4
- CJ-1-7B,0.956,0.936,0.97,0.932,0.948,Judge
5
- Qwen2.5-72B-Chat,0.964,0.916,0.958,0.912,0.937,General
6
- Qwen2-72B-Chat,0.937,0.889,0.976,0.936,0.935,General
7
- CJ-1-1.5B,0.928,0.851,0.981,0.858,0.905,Judge
8
- Qwen2.5-7B-Chat,0.916,0.681,0.967,0.931,0.874,General
9
- Selftaught-llama3.1-70B,0.918,0.667,0.95,0.942,0.869,Judge
 
 
 
 
 
 
 
 
 
 
data/overall.csv DELETED
@@ -1,11 +0,0 @@
1
- Models,JDB-A EN,JDB-A CN,JDB-B Acc,JDB-B Corr,JudgerBench,Class
2
- GPT-4o-0806,0.664,0.608,1,1,0.818,API
3
- CJ-1-32B,0.614,0.612,0.72,0.963,0.727,Judge
4
- CJ-1-14B,0.599,0.615,0.699,0.959,0.718,Judge
5
- Qwen2.5-72B-Chat,0.615,0.59,0.681,0.937,0.706,General
6
- CJ-1-7B,0.57,0.583,0.687,0.948,0.697,Judge
7
- Qwen2-72B-Chat,0.588,0.584,0.625,0.935,0.683,General
8
- CJ-1-1.5B,0.553,0.527,0.629,0.905,0.654,Judge
9
- Qwen2.5-7B-Chat,0.567,0.535,0.59,0.874,0.641,General
10
- Selftaught-llama3.1-70B,0.443,0.57,0.598,0.869,0.62,Judge
11
- Skywork-llama3.1-8B,0.63,0.605,-,-,-,Judge
 
 
 
 
 
 
 
 
 
 
 
 
judgerbench/meta_data.py CHANGED
@@ -6,13 +6,10 @@ import os
6
  REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
7
  DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
8
 
9
- LEADERBOARD_FILE_MAPPING = dict(
10
- overall="overall.csv",
11
- detail_a_cn="detail_a_cn.csv",
12
- detail_a_en="detail_a_en.csv",
13
- detail_b_acc="detail_b_acc.csv",
14
- detail_b_corr="detail_b_corr.csv",
15
- )
16
 
17
  STYLE_CLASS_MAPPING = {
18
  "API": '#82e0aa',
@@ -33,11 +30,13 @@ CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
33
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
34
 
35
  # CONSTANTS-TEXT
36
- LEADERBORAD_INTRODUCTION = """# JudgerBench Leaderboard
37
 
38
- ### Welcome to the JudgerBench Leaderboard!
39
 
40
  This leaderboard was last updated: {}.
 
 
41
  """
42
 
43
  # CONSTANTS-FIELDS
 
6
  REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
7
  DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
8
 
9
+ LEADERBOARD_FILE_MAPPING = {
10
+ "cj-1-7b": "CJ-1-7B.csv",
11
+ "cj-1-32b": "CJ-1-32B.csv",
12
+ }
 
 
 
13
 
14
  STYLE_CLASS_MAPPING = {
15
  "API": '#82e0aa',
 
30
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
31
 
32
  # CONSTANTS-TEXT
33
+ LEADERBORAD_INTRODUCTION = """# CompassJudger Subjective Evaluation Leaderboard
34
 
35
+ ### Welcome to the CompassJudger Subjective Evaluation Leaderboard!
36
 
37
  This leaderboard was last updated: {}.
38
+
39
+ *All results from the corresponding datasets have been normalized to percentages.
40
  """
41
 
42
  # CONSTANTS-FIELDS
start_gradio_web_server.sh CHANGED
@@ -2,7 +2,7 @@ conda activate judgerbench
2
 
3
  # GRADIO_HOSTNAME=0.0.0.0
4
  GRADIO_HOSTNAME=$(hostname)
5
- GRADIO_PORT=7861
6
 
7
  # Set tmp and logs folders for gradio
8
  export TMPDIR="tmp"
 
2
 
3
  # GRADIO_HOSTNAME=0.0.0.0
4
  GRADIO_HOSTNAME=$(hostname)
5
+ GRADIO_PORT=7862
6
 
7
  # Set tmp and logs folders for gradio
8
  export TMPDIR="tmp"