Junming Yang committed on
Commit a570ac2
1 Parent(s): 826f617

[Leaderboard] Support leaderboard dynamic avg score calculation (#193)

* add VQA meta_data

* Support leaderboard dynamic avg score calculation

Files changed (3):
  1. app.py +3 -0
  2. gen_table.py +50 -33
  3. meta_data.py +2 -2
app.py CHANGED
```diff
@@ -52,7 +52,9 @@ with gr.Blocks() as demo:
         visible=True)
 
     def filter_df(fields, model_size, model_type):
+        filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
         headers = check_box['essential'] + fields
+        new_fields = [field for field in fields if field not in filter_list]
         df = cp.deepcopy(table)
         df['flag'] = [model_size_flag(x, model_size) for x in df['Parameters (B)']]
         df = df[df['flag']]
@@ -62,6 +64,7 @@ with gr.Blocks() as demo:
         df = df[df['flag']]
         df.pop('flag')
 
+        df = generate_table(results, new_fields, df)
        comp = gr.components.DataFrame(
            value=df[headers],
            type='pandas',
```
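
In short, `filter_df` now strips the aggregate and meta columns out of the user's checkbox selection and hands the remaining benchmark names to `generate_table`, so 'Avg Score' and 'Avg Rank' are recomputed over exactly the datasets currently on screen. A minimal sketch of that flow with a toy table; `toy_generate_table` is a stand-in for the real `generate_table` added in gen_table.py below, and all values are illustrative:

```python
# Minimal sketch of the new filter_df path: drop aggregate/meta names from
# the selection, then recompute the average over the remaining benchmarks.
import copy as cp
import pandas as pd

def toy_generate_table(fields, df):
    # Stand-in for gen_table.generate_table: average only selected benchmarks.
    df = df.copy()
    df['Avg Score'] = df[fields].mean(axis=1).round(1)
    return df.sort_values('Avg Score', ascending=False)

table = pd.DataFrame({
    'Method': ['model-a', 'model-b'],
    'MMStar': [50.0, 60.0],
    'MMVet': [70.0, 40.0],
    'Avg Score': [0.0, 0.0],
})

filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
fields = ['MMStar', 'MMVet', 'Avg Score']                 # user's checkbox state
new_fields = [f for f in fields if f not in filter_list]  # benchmarks only
print(toy_generate_table(new_fields, cp.deepcopy(table)))
# model-a: (50 + 70) / 2 = 60.0; model-b: (60 + 40) / 2 = 50.0
```
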
gen_table.py CHANGED
```diff
@@ -60,45 +60,22 @@ def model_type_flag(line, FIELDS):
 
 
 def BUILD_L1_DF(results, fields):
-    res = defaultdict(list)
-    for i, m in enumerate(results):
-        item = results[m]
-        meta = item['META']
-        for k in META_FIELDS:
-            if k == 'Parameters (B)':
-                param = meta['Parameters']
-                res[k].append(float(param.replace('B', '')) if param != '' else None)
-            elif k == 'Method':
-                name, url = meta['Method']
-                res[k].append(f'<a href="{url}">{name}</a>')
-            else:
-                res[k].append(meta[k])
-        scores, ranks = [], []
-        for d in fields:
-            key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
-            res[d].append(item[d][key_name])
-            if d == 'MME':
-                scores.append(item[d][key_name] / 28)
-            elif d == 'OCRBench':
-                scores.append(item[d][key_name] / 10)
-            else:
-                scores.append(item[d][key_name])
-            ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values()]))
-        res['Avg Score'].append(round(np.mean(scores), 1))
-        res['Avg Rank'].append(round(np.mean(ranks), 2))
-
-    df = pd.DataFrame(res)
-    df = df.sort_values('Avg Score')
-    df = df.iloc[::-1]
-
     check_box = {}
     check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
-    check_box['required'] = ['Avg Score', 'Avg Rank']
-    check_box['all'] = check_box['required'] + ['OpenSource', 'Verified'] + fields
+    # revise here to set the default datasets
+    default_dataset = ['MMBench_TEST_EN', 'MMStar', 'MME', 'MMMU_VAL', 'MathVista', 'OCRBench', 'MMVet']
+    check_box['required'] = ['Avg Score', 'Avg Rank'] + default_dataset
+    check_box['avg'] = ['Avg Score', 'Avg Rank']
+    check_box['all'] = check_box['avg'] + fields
     type_map = defaultdict(lambda: 'number')
     type_map['Method'] = 'html'
     type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
     check_box['type_map'] = type_map
+
+    res = generate_table(results, fields)
+    df = pd.DataFrame(res)
+    df = df.sort_values('Avg Score')
+    df = df.iloc[::-1]
     return df, check_box
 
 
```
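
`BUILD_L1_DF` no longer builds the table itself; it only assembles the `check_box` config and delegates to `generate_table`. The dict now distinguishes three roles: 'required' (columns ticked on page load, seeded with `default_dataset`), 'avg' (the two aggregate columns), and 'all' (every toggleable column). A hedged sketch of how such a dict can back a Gradio checkbox group; the label text and surrounding layout are assumptions, not taken from app.py:

```python
# Hedged sketch: wiring the check_box dict into a Gradio CheckboxGroup.
import gradio as gr

check_box = {
    'essential': ['Method', 'Parameters (B)', 'Language Model', 'Vision Model'],
    'avg': ['Avg Score', 'Avg Rank'],
    'required': ['Avg Score', 'Avg Rank', 'MMBench_TEST_EN', 'MMStar', 'MME'],
    'all': ['Avg Score', 'Avg Rank', 'MMBench_TEST_EN', 'MMStar', 'MME', 'MMVet'],
}

with gr.Blocks() as demo:
    fields = gr.CheckboxGroup(
        choices=check_box['all'],     # everything a user may toggle on
        value=check_box['required'],  # ticked by default on page load
        label='Evaluation Dimension',  # assumed label, not from app.py
    )
```
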
 
```diff
@@ -153,3 +130,43 @@ def BUILD_L2_DF(results, dataset):
     type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
     check_box['type_map'] = type_map
     return df, check_box
+
+
+def generate_table(results, fields, df=None):
+    res = defaultdict(list)
+    for i, m in enumerate(results):
+        item = results[m]
+        meta = item['META']
+        for k in META_FIELDS:
+            if k == 'Parameters (B)':
+                param = meta['Parameters']
+                res[k].append(float(param.replace('B', '')) if param != '' else None)
+            elif k == 'Method':
+                name, url = meta['Method']
+                res[k].append(f'<a href="{url}">{name}</a>')
+                res['name'].append(name)
+            else:
+                res[k].append(meta[k])
+        scores, ranks = [], []
+        for d in fields:
+            key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
+            res[d].append(item[d][key_name])
+            if d == 'MME':
+                scores.append(item[d][key_name] / 28)
+            elif d == 'OCRBench':
+                scores.append(item[d][key_name] / 10)
+            else:
+                scores.append(item[d][key_name])
+            ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values()]))
+        res['Avg Score'].append(round(np.mean(scores), 1))
+        res['Avg Rank'].append(round(np.mean(ranks), 2))
+    if df is None:
+        return res
+    else:
+        res = pd.DataFrame(res)
+        df.set_index('name', inplace=True)
+        res.set_index('name', inplace=True)
+        df.update(res)
+        df = df.sort_values('Avg Score')
+        df = df.iloc[::-1]
+        return df
```
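
Two details worth noting in `generate_table`: the divisors put every benchmark on a comparable 0-100 scale before averaging (MME's 'Overall' is scored out of 2800, OCRBench's 'Final Score' out of 1000), and `nth_large` supplies a 1-based rank within the current result set; when `df` is passed in (the filter path in app.py), the recomputed columns are merged back through the hidden 'name' index. A small worked sketch; the `nth_large` body below is an assumption about the repo's helper, not copied from it:

```python
# Worked sketch of the normalization and ranking inside generate_table.
# Assumed helper: nth_large returns the 1-based rank of `val` among `vals`
# (this body is a guess at the repo's implementation, not taken from it).
def nth_large(val, vals):
    return sum(1 for v in vals if v > val) + 1

mme, ocrbench, mmstar = 2100.0, 700.0, 55.2   # raw benchmark scores (made up)
scores = [mme / 28, ocrbench / 10, mmstar]    # -> [75.0, 70.0, 55.2], all 0-100
avg_score = round(sum(scores) / len(scores), 1)
print(avg_score)                              # 66.7
print(nth_large(55.2, [55.2, 60.1, 48.9]))    # 2: exactly one model scores higher
```
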
meta_data.py CHANGED
```diff
@@ -21,8 +21,8 @@ This leaderboard was last updated: {}.
 META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
 MAIN_FIELDS = [
     'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MMStar', 'MME',
-    'MMMU_VAL', 'MathVista', 'HallusionBench', 'AI2D',
-    'OCRBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
+    'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+    'HallusionBench', 'SEEDBench_IMG', 'MMVet', 'LLaVABench'
 ]
 MMBENCH_FIELDS = ['MMBench_TEST_EN', 'MMBench_DEV_EN', 'MMBench_TEST_CN', 'MMBench_DEV_CN', 'CCBench']
 MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
```