linjunyao committed on
Commit 0bb476f
1 Parent(s): 77c1fdd

added leaderboard data; added Class coloring

.gitignore ADDED
@@ -0,0 +1,50 @@
+/**/*_bkup.*
+# Python
+__pycache__
+.ipynb_checkpoints/
+*.pyc
+*.egg-info
+dist
+.venv
+
+# Log
+*.log
+*.log.*
+logs/controller/*
+logs/conversation/*
+logs/gradio_web_server/*
+logs/gradio_web_server_multi/*
+!logs/**/.keep
+*.json
+!playground/deepspeed_config_s2.json
+!playground/deepspeed_config_s3.json
+
+# Editor
+.idea
+*.swp
+
+# Other
+.DS_Store
+wandb
+output
+checkpoints_flant5_3b
+
+# Data
+*.pkl
+tests/state_of_the_union.txt
+
+# Build
+build
+
+# Gradio Temp
+tmp
+
+# API KEYS
+set_api_keys.sh
+
+# data
+
+
+!**/.keep
+
+archive/*
app.py CHANGED
@@ -1,158 +1,282 @@
 import abc

 import gradio as gr

-from gen_table import *
-from meta_data import *

 with gr.Blocks() as demo:
-    struct = load_results()
-    timestamp = struct['time']
-    EVAL_TIME = format_timestamp(timestamp)
-    results = struct['results']
-    N_MODEL = len(results)
-    N_DATA = len(results['LLaVA-v1.5-7B']) - 1
-    DATASETS = list(results['LLaVA-v1.5-7B'])
-    DATASETS.remove('META')
-    print(DATASETS)
-
-    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
-    structs = [abc.abstractproperty() for _ in range(N_DATA)]

     with gr.Tabs(elem_classes='tab-buttons') as tabs:
-        with gr.TabItem('🏅 OpenVLM Main Leaderboard', elem_id='main', id=0):
-            gr.Markdown(LEADERBOARD_MD['MAIN'])
-            _, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
-            table = generate_table(results, DEFAULT_BENCH)
-            table['Rank'] = list(range(1, len(table) + 1))
-
-            type_map = check_box['type_map']
-            type_map['Rank'] = 'number'
-
-            checkbox_group = gr.CheckboxGroup(
-                choices=check_box['all'],
-                value=check_box['required'],
-                label='Evaluation Dimension',
-                interactive=True,
-            )
-
-            headers = ['Rank'] + check_box['essential'] + checkbox_group.value
-            with gr.Row():
-                model_size = gr.CheckboxGroup(
-                    choices=MODEL_SIZE,
-                    value=MODEL_SIZE,
-                    label='Model Size',
-                    interactive=True
-                )
-                model_type = gr.CheckboxGroup(
-                    choices=MODEL_TYPE,
-                    value=MODEL_TYPE,
-                    label='Model Type',
-                    interactive=True
                 )
-            data_component = gr.components.DataFrame(
-                value=table[headers],
-                type='pandas',
-                datatype=[type_map[x] for x in headers],
-                interactive=False,
-                visible=True)
-
-            def filter_df(fields, model_size, model_type):
-                filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
-                headers = ['Rank'] + check_box['essential'] + fields
-
-                new_fields = [field for field in fields if field not in filter_list]
-                df = generate_table(results, new_fields)
-
-                df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
-                df = df[df['flag']]
-                df.pop('flag')
-                if len(df):
-                    df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
-                    df = df[df['flag']]
-                    df.pop('flag')
-                df['Rank'] = list(range(1, len(df) + 1))
-
-                comp = gr.components.DataFrame(
-                    value=df[headers],
                     type='pandas',
                     datatype=[type_map[x] for x in headers],
                     interactive=False,
-                    visible=True)
-                return comp
-
-            for cbox in [checkbox_group, model_size, model_type]:
-                cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)
-
-        with gr.TabItem('🔍 About', elem_id='about', id=1):
-            gr.Markdown(urlopen(VLMEVALKIT_README).read().decode())
-
-        for i, dataset in enumerate(DATASETS):
-            with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
-                if dataset in LEADERBOARD_MD:
-                    gr.Markdown(LEADERBOARD_MD[dataset])
-
-                s = structs[i]
-                s.table, s.check_box = BUILD_L2_DF(results, dataset)
-                s.type_map = s.check_box['type_map']
-                s.type_map['Rank'] = 'number'
-
-                s.checkbox_group = gr.CheckboxGroup(
-                    choices=s.check_box['all'],
-                    value=s.check_box['required'],
-                    label=f'{dataset} CheckBoxes',
-                    interactive=True,
                 )
-                s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
-                s.table['Rank'] = list(range(1, len(s.table) + 1))
-
-                with gr.Row():
-                    s.model_size = gr.CheckboxGroup(
-                        choices=MODEL_SIZE,
-                        value=MODEL_SIZE,
-                        label='Model Size',
-                        interactive=True
-                    )
-                    s.model_type = gr.CheckboxGroup(
-                        choices=MODEL_TYPE,
-                        value=MODEL_TYPE,
-                        label='Model Type',
-                        interactive=True
-                    )
-                s.data_component = gr.components.DataFrame(
-                    value=s.table[s.headers],
-                    type='pandas',
-                    datatype=[s.type_map[x] for x in s.headers],
-                    interactive=False,
-                    visible=True)
-                s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
-
-                def filter_df_l2(dataset_name, fields, model_size, model_type):
-                    s = structs[DATASETS.index(dataset_name)]
-                    headers = ['Rank'] + s.check_box['essential'] + fields
-                    df = cp.deepcopy(s.table)
-                    df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
-                    df = df[df['flag']]
-                    df.pop('flag')
-                    if len(df):
-                        df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
-                        df = df[df['flag']]
-                        df.pop('flag')
                     df['Rank'] = list(range(1, len(df) + 1))

-                    comp = gr.components.DataFrame(
                         value=df[headers],
                         type='pandas',
-                        datatype=[s.type_map[x] for x in headers],
                         interactive=False,
-                        visible=True)
                     return comp

-                for cbox in [s.checkbox_group, s.model_size, s.model_type]:
-                    cbox.change(
-                        fn=filter_df_l2,
-                        inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
-                        outputs=s.data_component)

     with gr.Row():
         with gr.Accordion('Citation', open=False):
@@ -161,5 +285,73 @@ with gr.Blocks() as demo:
                 label=CITATION_BUTTON_LABEL,
                 elem_id='citation-button')

 if __name__ == '__main__':
-    demo.launch(server_name='0.0.0.0')
 import abc

 import gradio as gr
+from loguru import logger
+import pandas as pd
+from collections import defaultdict
+
+from judgerbench.preprocess.gen_table import (
+    format_timestamp,
+    generate_table,
+    build_l1_df,
+    # build_l2_df,
+)
+from judgerbench.meta_data import (
+    LEADERBORAD_INTRODUCTION,
+    LEADERBOARD_MD,
+    LEADERBOARD_FILE_MAPPING,
+    MAIN_FIELDS,
+    DEFAULT_BENCH,
+    STYLE_CLASS_MAPPING,
+    CITATION_BUTTON_TEXT,
+    CITATION_BUTTON_LABEL,
+)
+
+
+def refresh_dataframe(required_fields):
+    df = generate_table(MAIN_FIELDS)
+
+    comp = gr.DataFrame(
+        value=df,
+        type='pandas',
+        interactive=False,
+        visible=True
+    )
+
+    return comp


 with gr.Blocks() as demo:
+    # struct = load_results()
+    # timestamp = struct['time']
+
+    # EVAL_TIME = format_timestamp(timestamp)
+    EVAL_TIME = '20241015'
+
+    # results = struct['results']
+    # N_MODEL = len(results)
+    # N_DATA = len(results['LLaVA-v1.5-7B']) - 1
+
+    N_MODEL = 10
+    N_DATA = 100
+
+    # DATASETS = list(results['LLaVA-v1.5-7B'])
+    # DATASETS.remove('META')
+    # print(DATASETS)
+
+    gr.Markdown(LEADERBORAD_INTRODUCTION.format(
+        # N_MODEL,
+        # N_DATA,
+        EVAL_TIME
+    ))
+    # structs = [abc.abstractproperty() for _ in range(N_DATA)]

     with gr.Tabs(elem_classes='tab-buttons') as tabs:
+        for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
+
+            tab_name = filename
+            if filename == "overall":
+                tab_name = '🏅 JudgerBench Main Leaderboard'
+
+            with gr.Tab(tab_name, elem_id=f'tab_{cur_id}', id=cur_id):
+
+                # gr.Markdown(LEADERBOARD_MD['MAIN'])
+                # _, check_box = build_l1_df(MAIN_FIELDS)
+                table = generate_table(filename=filename)
+
+                # type_map = check_box['type_map']
+                type_map = defaultdict(lambda: 'number')
+                type_map['Models'] = 'str'
+                type_map['Class'] = 'str'
+                type_map['Rank'] = 'number'
+
+                # required_fields = gr.State(
+                #     check_box['essential']
+                #     # + ["Average"]
+                # )
+
+                # checkbox_group = gr.CheckboxGroup(
+                #     choices=[item for item in check_box['all'] if item not in required_fields.value],
+                #     value=[item for item in check_box['default'] if item not in required_fields.value],
+                #     label='Evaluation Metrics',
+                #     interactive=True,
+                # )
+
+                # headers = (
+                #     ['Rank'] +
+                #     required_fields.value +
+                #     [item for item in check_box['all'] if item not in required_fields.value]
+                #     # checkbox_group.value
+                # )
+
+                table['Rank'] = list(range(1, len(table) + 1))
+
+                # Rearrange columns
+                if "Class" in table.columns:
+                    starting_columns = ["Rank", "Models", "Class"]
+                else:
+                    starting_columns = ["Rank", "Models"]
+
+                table = table[starting_columns + [col for col in table.columns if col not in starting_columns]]
+
+                headers = (
+                    # ['Rank'] +
+                    list(table.columns)
                 )
+
+                if "Class" in table.columns:
+                    def cell_styler(v):
+                        df = v.copy()
+
+                        class_var = df[['Class']].copy()
+
+                        df.loc[:, :] = ''
+                        df[['Class']] = class_var.map(lambda x: f"background-color: {STYLE_CLASS_MAPPING[x]}")
+                        logger.info(df['Class'])
+
+                        return df
+
+                    table_styler = (
+                        table.style.apply(cell_styler, axis=None)
+                        .format(precision=3)
+                    )
+                else:
+                    table_styler = table.style.format(precision=3)
+
+                # with gr.Row():
+                #     model_size = gr.CheckboxGroup(
+                #         choices=MODEL_SIZE,
+                #         value=MODEL_SIZE,
+                #         label='Model Size',
+                #         interactive=True
+                #     )
+                #     model_type = gr.CheckboxGroup(
+                #         choices=MODEL_TYPE,
+                #         value=MODEL_TYPE,
+                #         label='Model Type',
+                #         interactive=True
+                #     )
+                data_component = gr.DataFrame(
+                    value=table_styler,
                     type='pandas',
                     datatype=[type_map[x] for x in headers],
                     interactive=False,
+                    visible=True
                 )
+
+                def filter_df(
+                    required_fields,
+                    fields,
+                    # model_size,
+                    # model_type
+                ):
+                    # filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
+                    headers = ['Rank'] + required_fields + fields
+
+                    # new_fields = [field for field in fields if field not in filter_list]
+                    df = generate_table(fields)
+                    logger.info(f"{df.columns=}")
+
+                    # df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+                    # df = df[df['flag']]
+                    # df.pop('flag')
+
+                    # if len(df):
+                    #     df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+                    #     df = df[df['flag']]
+                    #     df.pop('flag')
+
                     df['Rank'] = list(range(1, len(df) + 1))

+                    comp = gr.DataFrame(
                         value=df[headers],
                         type='pandas',
+                        datatype=[type_map[x] for x in headers],
                         interactive=False,
+                        visible=True
+                    )
+
                     return comp

+                # for cbox in [
+                #     # checkbox_group,
+                #     # model_size,
+                #     # model_type
+                # ]:
+                #     cbox.change(
+                #         fn=refresh_dataframe,
+                #         inputs=[required_fields],
+                #         outputs=data_component
+                #     ).then(
+                #         fn=filter_df,
+                #         inputs=[
+                #             required_fields,
+                #             checkbox_group,
+                #             # model_size,
+                #             # model_type
+                #         ],
+                #         outputs=data_component
+                #     )
+
+        # with gr.Tab('🔍 About', elem_id='about', id=1):
+        #     gr.Markdown(urlopen(VLMEVALKIT_README).read().decode())
+
+        # for i, dataset in enumerate(DATASETS):
+        #     with gr.Tab(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
+        #         if dataset in LEADERBOARD_MD:
+        #             gr.Markdown(LEADERBOARD_MD[dataset])
+
+        #         s = structs[i]
+        #         s.table, s.check_box = build_l2_df(results, dataset)
+        #         s.type_map = s.check_box['type_map']
+        #         s.type_map['Rank'] = 'number'
+
+        #         s.checkbox_group = gr.CheckboxGroup(
+        #             choices=s.check_box['all'],
+        #             value=s.check_box['required'],
+        #             label=f'{dataset} CheckBoxes',
+        #             interactive=True,
+        #         )
+        #         s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
+        #         s.table['Rank'] = list(range(1, len(s.table) + 1))
+
+        #         with gr.Row():
+        #             s.model_size = gr.CheckboxGroup(
+        #                 choices=MODEL_SIZE,
+        #                 value=MODEL_SIZE,
+        #                 label='Model Size',
+        #                 interactive=True
+        #             )
+        #             s.model_type = gr.CheckboxGroup(
+        #                 choices=MODEL_TYPE,
+        #                 value=MODEL_TYPE,
+        #                 label='Model Type',
+        #                 interactive=True
+        #             )
+        #         s.data_component = gr.components.DataFrame(
+        #             value=s.table[s.headers],
+        #             type='pandas',
+        #             datatype=[s.type_map[x] for x in s.headers],
+        #             interactive=False,
+        #             visible=True)
+        #         s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
+
+        #         def filter_df_l2(dataset_name, fields, model_size, model_type):
+        #             s = structs[DATASETS.index(dataset_name)]
+        #             headers = ['Rank'] + s.check_box['essential'] + fields
+        #             df = cp.deepcopy(s.table)
+        #             df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+        #             df = df[df['flag']]
+        #             df.pop('flag')
+        #             if len(df):
+        #                 df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+        #                 df = df[df['flag']]
+        #                 df.pop('flag')
+        #             df['Rank'] = list(range(1, len(df) + 1))
+
+        #             comp = gr.components.DataFrame(
+        #                 value=df[headers],
+        #                 type='pandas',
+        #                 datatype=[s.type_map[x] for x in headers],
+        #                 interactive=False,
+        #                 visible=True)
+        #             return comp
+
+        #         for cbox in [s.checkbox_group, s.model_size, s.model_type]:
+        #             cbox.change(
+        #                 fn=filter_df_l2,
+        #                 inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
+        #                 outputs=s.data_component)

     with gr.Row():
         with gr.Accordion('Citation', open=False):
                 label=CITATION_BUTTON_LABEL,
                 elem_id='citation-button')

+
 if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--port", type=int)
+    parser.add_argument(
+        "--share",
+        action="store_true",
+        help="Whether to generate a public, shareable link",
+    )
+    parser.add_argument(
+        "--concurrency-count",
+        type=int,
+        default=10,
+        help="The concurrency count of the gradio queue",
+    )
+    parser.add_argument(
+        "--max-threads",
+        type=int,
+        default=200,
+        help="The maximum number of threads available to process non-async functions.",
+    )
+    # parser.add_argument(
+    #     "--gradio-auth-path",
+    #     type=str,
+    #     help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"',
+    #     default=None,
+    # )
+    parser.add_argument(
+        "--gradio-root-path",
+        type=str,
+        help="Sets the gradio root path, eg /abc/def. Useful when running behind a reverse-proxy or at a custom URL path prefix",
+    )
+    parser.add_argument(
+        "--ga-id",
+        type=str,
+        help="the Google Analytics ID",
+        default=None,
+    )
+    parser.add_argument(
+        "--use-remote-storage",
+        action="store_true",
+        default=False,
+        help="Uploads image files to google cloud storage if set to true",
+    )
+    args = parser.parse_args()
+    logger.info(f"args: {args}")
+
+    # Set authorization credentials
+    # auth = None
+    # if args.gradio_auth_path is not None:
+    #     auth = parse_gradio_auth_creds(args.gradio_auth_path)
+
+    demo.queue(
+        default_concurrency_limit=args.concurrency_count,
+        status_update_rate=10,
+        api_open=False,
+    ).launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=args.share,
+        max_threads=args.max_threads,
+        # auth=auth,
+        root_path=args.gradio_root_path,
+        # debug=True,
+        show_error=True,
+        allowed_paths=["../.."]
+    )
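Note on the Class coloring added above: it hinges on pandas' Styler.apply with axis=None, where the styling function receives the whole table and must return an equally shaped frame of CSS strings, and the resulting Styler is passed to gr.DataFrame as its value. A minimal, self-contained sketch of the same pattern follows; the toy rows and colours simply mirror STYLE_CLASS_MAPPING and are illustrative only, not part of the commit.

import pandas as pd

STYLE_CLASS_MAPPING = {"API": "#82e0aa", "Judge": "#f8c471", "General": "#85c1e9"}

table = pd.DataFrame({
    "Models": ["GPT-4o-0806", "CJ-1-32B", "Qwen2.5-72B-Chat"],
    "Class": ["API", "Judge", "General"],
    "Average": [0.818, 0.727, 0.706],
})

def cell_styler(v):
    # Return a frame of CSS strings with the same shape as the input;
    # only the 'Class' column gets a background colour.
    css = pd.DataFrame("", index=v.index, columns=v.columns)
    css["Class"] = v["Class"].map(lambda x: f"background-color: {STYLE_CLASS_MAPPING[x]}")
    return css

styler = table.style.apply(cell_styler, axis=None).format(precision=3)
print(styler.to_html())  # a Styler like this is what gr.DataFrame(value=...) receives in app.py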
commands.sh ADDED
@@ -0,0 +1,12 @@
+srun \
+    --partition=llmeval \
+    --quotatype=reserved \
+    --job-name=judgerbench_server \
+    --ntasks=1 \
+    --ntasks-per-node=1 \
+    --cpus-per-task=2 \
+    --kill-on-bad-exit=1 \
+    --pty bash
+
+
+bash -i start_gradio_web_server.sh
data/detail_a_cn.csv ADDED
@@ -0,0 +1,11 @@
+Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
+CJ-1-14B,0.69,0.61,0.51,0.55,0.71,0.68,0.6,0.58,0.61,0.65,0.619,Judge
+GPT-4o-0806,0.77,0.56,0.51,0.53,0.67,0.66,0.63,0.58,0.62,0.58,0.611,API
+CJ-1-32B,0.69,0.58,0.53,0.52,0.71,0.53,0.6,0.61,0.61,0.69,0.607,Judge
+Skywork-llama3.1-8B,0.62,0.58,0.58,0.59,0.63,0.58,0.6,0.61,0.6,0.61,0.6,Judge
+Qwen2.5-72B-Chat,0.65,0.47,0.49,0.47,0.71,0.6,0.57,0.58,0.69,0.6,0.583,General
+CJ-1-7B,0.62,0.54,0.41,0.58,0.7,0.6,0.59,0.56,0.59,0.6,0.579,Judge
+Qwen2-72B-Chat,0.62,0.54,0.34,0.55,0.68,0.63,0.58,0.58,0.62,0.64,0.578,General
+Selftaught-llama3.1-70B,0.62,0.56,0.55,0.48,0.67,0.55,0.57,0.57,0.51,0.61,0.569,Judge
+Qwen2.5-7B-Chat,0.46,0.58,0.36,0.45,0.7,0.53,0.52,0.53,0.52,0.64,0.529,General
+CJ-1-1.5B,0.54,0.58,0.38,0.38,0.62,0.63,0.54,0.52,0.55,0.54,0.528,Judge
data/detail_a_en.csv ADDED
@@ -0,0 +1,11 @@
+Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
+GPT-4o-0806,0.82,0.53,0.62,0.61,0.83,0.67,0.67,0.73,0.64,0.55,0.667,API
+Skywork-llama3.1-8B,0.69,0.61,0.54,0.62,0.63,0.64,0.6,0.69,0.74,0.53,0.629,Judge
+Qwen2.5-72B-Chat,0.68,0.57,0.57,0.47,0.78,0.64,0.58,0.75,0.61,0.52,0.617,General
+CJ-1-32B,0.66,0.57,0.56,0.59,0.78,0.58,0.55,0.75,0.6,0.49,0.613,Judge
+CJ-1-14B,0.66,0.51,0.57,0.54,0.72,0.61,0.56,0.74,0.61,0.47,0.599,Judge
+Qwen2-72B-Chat,0.63,0.59,0.54,0.49,0.62,0.64,0.6,0.74,0.51,0.52,0.588,General
+CJ-1-7B,0.56,0.56,0.51,0.47,0.68,0.58,0.58,0.75,0.58,0.43,0.57,Judge
+Qwen2.5-7B-Chat,0.54,0.59,0.59,0.46,0.69,0.43,0.61,0.65,0.58,0.52,0.566,General
+CJ-1-1.5B,0.42,0.56,0.56,0.43,0.66,0.47,0.55,0.78,0.64,0.44,0.551,Judge
+Selftaught-llama3.1-70B,0.47,0.45,0.47,0.37,0.45,0.43,0.36,0.58,0.48,0.36,0.442,Judge
data/detail_b_acc.csv ADDED
@@ -0,0 +1,9 @@
+Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
+CJ-1-32B,0.857,0.806,0.596,0.621,0.72,Judge
+CJ-1-14B,0.839,0.787,0.566,0.602,0.699,Judge
+CJ-1-7B,0.816,0.783,0.564,0.586,0.687,Judge
+Qwen2.5-72B-Chat,0.878,0.677,0.599,0.57,0.681,General
+CJ-1-1.5B,0.822,0.712,0.55,0.43,0.629,Judge
+Qwen2-72B-Chat,0.867,0.692,0.564,0.376,0.625,General
+Selftaught-llama3.1-70B,0.755,0.627,0.538,0.472,0.598,Judge
+Qwen2.5-7B-Chat,0.777,0.67,0.47,0.444,0.59,General
data/detail_b_corr.csv ADDED
@@ -0,0 +1,9 @@
+Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
+CJ-1-32B,0.973,0.951,0.954,0.975,0.963,Judge
+CJ-1-14B,0.966,0.956,0.965,0.951,0.959,Judge
+CJ-1-7B,0.956,0.936,0.97,0.932,0.948,Judge
+Qwen2.5-72B-Chat,0.964,0.916,0.958,0.912,0.937,General
+Qwen2-72B-Chat,0.937,0.889,0.976,0.936,0.935,General
+CJ-1-1.5B,0.928,0.851,0.981,0.858,0.905,Judge
+Qwen2.5-7B-Chat,0.916,0.681,0.967,0.931,0.874,General
+Selftaught-llama3.1-70B,0.918,0.667,0.95,0.942,0.869,Judge
data/overall.csv ADDED
@@ -0,0 +1,11 @@
+Models,JDB-A EN,JDB-A CN,JDB-B Acc,JDB-B Corr,JudgerBench,Class
+GPT-4o-0806,0.664,0.608,1,1,0.818,API
+CJ-1-32B,0.614,0.612,0.72,0.963,0.727,Judge
+CJ-1-14B,0.599,0.615,0.699,0.959,0.718,Judge
+Qwen2.5-72B-Chat,0.615,0.59,0.681,0.937,0.706,General
+CJ-1-7B,0.57,0.583,0.687,0.948,0.697,Judge
+Qwen2-72B-Chat,0.588,0.584,0.625,0.935,0.683,General
+CJ-1-1.5B,0.553,0.527,0.629,0.905,0.654,Judge
+Qwen2.5-7B-Chat,0.567,0.535,0.59,0.874,0.641,General
+Selftaught-llama3.1-70B,0.443,0.57,0.598,0.869,0.62,Judge
+Skywork-llama3.1-8B,0.63,0.605,-,-,-,Judge
judgerbench/__init__.py ADDED
File without changes
meta_data.py → judgerbench/meta_data.py RENAMED
@@ -1,3 +1,25 @@
 # CONSTANTS-URL
 URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
 VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
@@ -9,32 +31,59 @@ CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
     year={2023}
 }"""
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 # CONSTANTS-TEXT
-LEADERBORAD_INTRODUCTION = """# OpenVLM Leaderboard
-### Welcome to the OpenVLM Leaderboard! On this leaderboard we share the evaluation results of VLMs obtained by the OpenSource Framework:
-### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆
-### Currently, OpenVLM Leaderboard covers {} different VLMs (including GPT-4v, Gemini, QwenVLPlus, LLaVA, etc.) and {} different multi-modal benchmarks.

-This leaderboard was last updated: {}.

-OpenVLM Leaderboard only includes open-source VLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM and then we will help with the evaluation and updating the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, duanhaodong]@pjlab.org.cn.
 """
 # CONSTANTS-FIELDS
-META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
 MAIN_FIELDS = [
-    'MMBench_V11', 'MMStar', 'MME',
-    'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
-    'HallusionBench', 'SEEDBench_IMG', 'MMVet',
-    'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
-    'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
 ]
 DEFAULT_BENCH = [
-    'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
-    'HallusionBench', 'MMVet'
 ]
-MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
-MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
-MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']

 # The README file for each benchmark
 LEADERBOARD_MD = {}
+from enum import IntEnum
+from pathlib import Path
+import os
+
+# REPO paths
+REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
+DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
+
+LEADERBOARD_FILE_MAPPING = dict(
+    overall="overall.csv",
+    detail_a_cn="detail_a_cn.csv",
+    detail_a_en="detail_a_en.csv",
+    detail_b_acc="detail_b_acc.csv",
+    detail_b_corr="detail_b_corr.csv",
+)
+
+STYLE_CLASS_MAPPING = {
+    "API": '#82e0aa',
+    "Judge": '#f8c471',
+    "General": '#85c1e9',
+}
+
 # CONSTANTS-URL
 URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
 VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
     year={2023}
 }"""
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+
 # CONSTANTS-TEXT
+LEADERBORAD_INTRODUCTION = """# JudgerBench Leaderboard

+### Welcome to the JudgerBench Leaderboard!

+This leaderboard was last updated: {}.
 """
+
 # CONSTANTS-FIELDS
+# META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
+META_FIELDS = [
+    'Model',
+]
+# MAIN_FIELDS = [
+#     'MMBench_V11', 'MMStar', 'MME',
+#     'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+#     'HallusionBench', 'SEEDBench_IMG', 'MMVet',
+#     'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
+#     'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
+# ]
 MAIN_FIELDS = [
+    'Average',
+    'Accuracy',
+    'Accuracy_CN',
+    'Accuracy_EN',
+    'Corr',
 ]
+# DEFAULT_BENCH = [
+#     'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+#     'HallusionBench', 'MMVet'
+# ]
 DEFAULT_BENCH = [
+    'Average',
+    'Accuracy',
+    'Accuracy_CN',
+    'Accuracy_EN',
+    'Corr',
 ]
+
+FIELD_MAPPING = {
+    'model': 'Model',
+    'parameters': 'Param (B)',
+    'average': 'Average',
+    'accuracy': 'Accuracy',
+    'accuracy_cn': 'Accuracy_CN',
+    'accuracy_en': 'Accuracy_EN',
+    'corr': 'Corr',
+}
+
+# MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
+# MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
+# MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']

 # The README file for each benchmark
 LEADERBOARD_MD = {}
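One subtlety in the DATADIR constant added above: pathlib's `/` join keeps the repository root only for relative overrides, because an absolute right-hand side replaces the left-hand side entirely. A small sketch of that behaviour follows; the repository path is an assumed, illustrative location, not taken from the commit.

import os
from pathlib import Path

REPO_PATH = Path("/srv/judgerbench_leaderboard")          # assumed repo root, for illustration only
print(REPO_PATH / Path(os.getenv("DATADIR", "./data")))   # default: /srv/judgerbench_leaderboard/data
print(REPO_PATH / Path("alt_data"))                       # relative override stays inside the repo
print(REPO_PATH / Path("/mnt/shared/leaderboard_data"))   # absolute override wins outright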
judgerbench/preprocess/__init__.py ADDED
File without changes
gen_table.py → judgerbench/preprocess/gen_table.py RENAMED
@@ -6,9 +6,19 @@ from urllib.request import urlopen
 import gradio as gr
 import numpy as np
 import pandas as pd

-from meta_data import DEFAULT_BENCH, META_FIELDS, URL
-

 def listinstr(lst, s):
     assert isinstance(lst, list)
@@ -18,7 +28,7 @@ def listinstr(lst, s):
     return False


-def load_results():
     data = json.loads(urlopen(URL).read())
     return data

@@ -61,23 +71,35 @@ def model_type_flag(line, FIELDS):
     return False


-def BUILD_L1_DF(results, fields):
     check_box = {}
-    check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model']
     # revise there to set default dataset
-    check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
-    check_box['avg'] = ['Avg Score', 'Avg Rank']
-    check_box['all'] = check_box['avg'] + fields
     type_map = defaultdict(lambda: 'number')
-    type_map['Method'] = 'html'
-    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
     check_box['type_map'] = type_map

-    df = generate_table(results, fields)
     return df, check_box


-def BUILD_L2_DF(results, dataset):
     res = defaultdict(list)
     sub = [v for v in results.values() if dataset in v]
     assert len(sub)
@@ -137,7 +159,7 @@ def BUILD_L2_DF(results, dataset):
     return df, check_box


-def generate_table(results, fields):

     def get_mmbench_v11(item):
         assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
@@ -194,3 +216,60 @@ def generate_table(results, fields):
     missing = missing.iloc[::-1]
     df = pd.concat([valid, missing])
     return df
 import gradio as gr
 import numpy as np
 import pandas as pd
+from pathlib import Path
+from typing import Union, List, Dict
+from loguru import logger

+from judgerbench.meta_data import (
+    DATADIR,
+    LEADERBOARD_FILE_MAPPING,
+    DEFAULT_BENCH,
+    FIELD_MAPPING,
+    STYLE_CLASS_MAPPING,
+    META_FIELDS,
+    URL
+)

 def listinstr(lst, s):
     assert isinstance(lst, list)
     return False


+def load_results_from_url():
     data = json.loads(urlopen(URL).read())
     return data

     return False


+def build_l1_df(fields):
     check_box = {}
+    check_box['essential'] = [
+        # 'Method',
+        # 'Param (B)',
+        'Model',
+    ]
     # revise there to set default dataset
+    check_box['default'] = DEFAULT_BENCH
+
+    check_box['avg'] = ['Average']
+    check_box['accuracy'] = ['Accuracy_CN', 'Accuracy_EN', 'Accuracy',]
+    check_box['all'] = fields
+
     type_map = defaultdict(lambda: 'number')
+    # type_map['Method'] = 'html'
+    type_map['Model'] = 'str'
+    # type_map['Language Model'] = 'str'
+    # type_map['Vision Model'] = 'str'
+    # type_map['OpenSource'] = 'str'
+    # type_map['Verified'] = 'str'
+
     check_box['type_map'] = type_map

+    df = generate_table(fields)
     return df, check_box


+def build_l2_df(results, dataset):
     res = defaultdict(list)
     sub = [v for v in results.values() if dataset in v]
     assert len(sub)
     return df, check_box


+def generate_table1(results, fields):

     def get_mmbench_v11(item):
         assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
     missing = missing.iloc[::-1]
     df = pd.concat([valid, missing])
     return df
+
+
+def generate_table(
+    fields: List[str] = None,
+    filename: str = None,
+    path: Union[str, Path] = DATADIR / "overall.csv",
+):
+
+    if filename in LEADERBOARD_FILE_MAPPING:
+        path = DATADIR / LEADERBOARD_FILE_MAPPING[filename]
+
+    if filename is None and path is None:
+        raise ValueError("filename and path cannot both be None.")
+
+    REQUIRED_FILEDS = META_FIELDS + [
+        # 'Average'
+    ]
+
+    df = pd.read_csv(path)
+    # df_reshaped = (
+    #     df
+    #     .drop(columns=["dataset", "mode", "version"])
+    #     .melt(
+    #         id_vars=["metric"],
+    #         var_name="model",
+    #         value_name="value"
+    #     )
+    #     .pivot(index=["model"], columns=["metric"], values='value')
+    # )
+    # df_reshaped.columns.name = None
+    # df_reshaped.reset_index(inplace=True)
+    # df_reshaped.rename(columns=FIELD_MAPPING, inplace=True)
+
+    # if fields is not None:
+    #     for field in fields:
+    #         if field not in df_reshaped.columns:
+    #             raise ValueError(f"{field} is not a valid field in leaderboard table.")
+
+    # new_fields = [field for field in FIELD_MAPPING.values() if field in REQUIRED_FILEDS + fields]
+    # logger.info(f"{new_fields=}")
+
+    # df_reshaped = df_reshaped.loc[:, new_fields].copy()
+
+    # valid, missing = df_reshaped[~pd.isna(df_reshaped['Average'])], df_reshaped[pd.isna(df_reshaped['Average'])]
+    # valid = valid.sort_values('Average', ascending=False)
+
+    # if len(fields):
+    #     missing = missing.sort_values(
+    #         'Accuracy' if 'Accuracy' in fields else fields[0],
+    #         ascending=False,
+    #     )
+
+    # df_sorted = pd.concat([valid, missing])
+
+    df_sorted = df
+
+    return df_sorted
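A rough usage sketch for the new generate_table entry point, assuming the judgerbench package is importable and the data/ CSVs added in this commit sit where DATADIR expects them (illustrative only, not part of the commit):

from judgerbench.preprocess.gen_table import generate_table

overall = generate_table(filename="overall")        # resolves data/overall.csv via LEADERBOARD_FILE_MAPPING
detail = generate_table(filename="detail_b_acc")    # any key of LEADERBOARD_FILE_MAPPING works the same way
print(overall[["Models", "JudgerBench", "Class"]].head())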
judgerbench/preprocess/generate_table.py ADDED
@@ -0,0 +1,27 @@
+#%%
+import pandas as pd
+
+from judgerbench.meta_data import (
+    DATADIR
+)
+
+#%%
+df = pd.read_csv(DATADIR / "summary_20241007_221023.csv")
+df
+
+# %%
+df_reshaped = (
+    df
+    .drop(columns=["dataset", "mode", "version"])
+    .melt(
+        id_vars=["metric"],
+        var_name="model",
+        value_name="value"
+    )
+    .pivot(index=["model"], columns=["metric"], values='value')
+)
+df_reshaped.columns.name = None
+df_reshaped.reset_index(inplace=True)
+
+df_reshaped
+# %%
pyproject.toml ADDED
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools >= 64"]
+build-backend = "setuptools.build_meta"
requirements.txt CHANGED
@@ -1,3 +1,3 @@
-gradio==4.15.0
+gradio==5.1
 numpy>=1.23.4
 pandas>=1.5.3
setup.py ADDED
@@ -0,0 +1,53 @@
+"""Python setup.py for the judgerbench package"""
+import io
+import os
+import re
+from setuptools import find_packages, setup
+
+
+def read(*paths, **kwargs):
+    """Read the contents of a text file safely.
+    >>> read("project_name", "VERSION")
+    '0.1.0'
+    >>> read("README.md")
+    ...
+    """
+
+    content = ""
+    with io.open(
+        os.path.join(os.path.dirname(__file__), *paths),
+        encoding=kwargs.get("encoding", "utf8"),
+    ) as open_file:
+        content = open_file.read().strip()
+    return content
+
+
+def read_requirements(path):
+    return [
+        line.strip()
+        for line in read(path).split("\n")
+        if not line.startswith(('"', "#", "-", "git+"))
+    ]
+
+
+setup(
+    name="judgerbench",
+    version='v0.0.1',
+    description="Judger Bench",
+    url="https://huggingface.co/spaces/acylam/judgerbench_leaderboard",
+    long_description=read("README.md"),
+    # long_description_content_type="text/markdown",
+    author="linjunyao",
+    maintainer="linjunyao",
+    package_dir={"": "judgerbench"},
+    packages=find_packages(
+        where="judgerbench",
+        include=["judgerbench", "judgerbench/**/*"],
+        exclude=["tests", ".github"]
+    ),
+    install_requires=read_requirements("requirements.txt"),
+    # entry_points={
+    #     "console_scripts": ["project_name = project_name.__main__:main"]
+    # },
+    # extras_require={"test": read_requirements("requirements-test.txt")},
+)
start_gradio_web_server.sh ADDED
@@ -0,0 +1,23 @@
+conda activate judgerbench
+
+# GRADIO_HOSTNAME=0.0.0.0
+GRADIO_HOSTNAME=$(hostname)
+GRADIO_PORT=7861
+
+# Set tmp and logs folders for gradio
+export TMPDIR="tmp"
+export LOGDIR="logs"
+export GRADIO_SERVER_NAME="0.0.0.0"
+export no_proxy="$CONTROLLER_HOST,10.140.1.173,0.0.0.0,$(hostname)"
+
+# OpenAI proxy url
+# export OPENAI_PROXY_URL='http://10.1.20.57:23128'
+export OPENAI_PROXY_URL='http://closeai-proxy.pjlab.org.cn:23128'
+
+# Source api keys
+# source set_api_keys.sh
+
+python3 -m app \
+    --host $GRADIO_HOSTNAME \
+    --port $GRADIO_PORT \
+    --concurrency-count 50