linjunyao committed
Commit 0bb476f
1 Parent(s): 77c1fdd
added leaderboard data; added Class coloring
Files changed:
- .gitignore +50 -0
- app.py +328 -136
- commands.sh +12 -0
- data/detail_a_cn.csv +11 -0
- data/detail_a_en.csv +11 -0
- data/detail_b_acc.csv +9 -0
- data/detail_b_corr.csv +9 -0
- data/overall.csv +11 -0
- judgerbench/__init__.py +0 -0
- meta_data.py → judgerbench/meta_data.py +66 -17
- judgerbench/preprocess/__init__.py +0 -0
- gen_table.py → judgerbench/preprocess/gen_table.py +92 -13
- judgerbench/preprocess/generate_table.py +27 -0
- pyproject.toml +3 -0
- requirements.txt +1 -1
- setup.py +53 -0
- start_gradio_web_server.sh +23 -0
.gitignore
ADDED
@@ -0,0 +1,50 @@
+/**/*_bkup.*
+# Python
+__pycache__
+.ipynb_checkpoints/
+*.pyc
+*.egg-info
+dist
+.venv
+
+# Log
+*.log
+*.log.*
+logs/controller/*
+logs/conversation/*
+logs/gradio_web_server/*
+logs/gradio_web_server_multi/*
+!logs/**/.keep
+*.json
+!playground/deepspeed_config_s2.json
+!playground/deepspeed_config_s3.json
+
+# Editor
+.idea
+*.swp
+
+# Other
+.DS_Store
+wandb
+output
+checkpoints_flant5_3b
+
+# Data
+*.pkl
+tests/state_of_the_union.txt
+
+# Build
+build
+
+# Gradio Temp
+tmp
+
+# API KEYS
+set_api_keys.sh
+
+# data
+
+
+!**/.keep
+
+archive/*
app.py
CHANGED
@@ -1,158 +1,282 @@
 import abc
 
 import gradio as gr
+from loguru import logger
+import pandas as pd
+from collections import defaultdict
+
+from judgerbench.preprocess.gen_table import (
+    format_timestamp,
+    generate_table,
+    build_l1_df,
+    # build_l2_df,
+)
+from judgerbench.meta_data import (
+    LEADERBORAD_INTRODUCTION,
+    LEADERBOARD_MD,
+    LEADERBOARD_FILE_MAPPING,
+    MAIN_FIELDS,
+    DEFAULT_BENCH,
+    STYLE_CLASS_MAPPING,
+    CITATION_BUTTON_TEXT,
+    CITATION_BUTTON_LABEL,
+)
+
+
+def refresh_dataframe(required_fields):
+    df = generate_table(MAIN_FIELDS)
+
+    comp = gr.DataFrame(
+        value=df,
+        type='pandas',
+        interactive=False,
+        visible=True
+    )
+
+    return comp
 
-from gen_table import *
-from meta_data import *
 
 with gr.Blocks() as demo:
-    struct = load_results()
-    timestamp = struct['time']
+    # struct = load_results()
+    # timestamp = struct['time']
+
+    # EVAL_TIME = format_timestamp(timestamp)
+    EVAL_TIME = '20241015'
+
+    # results = struct['results']
+    # N_MODEL = len(results)
+    # N_DATA = len(results['LLaVA-v1.5-7B']) - 1
+
+    N_MODEL = 10
+    N_DATA = 100
+
+    # DATASETS = list(results['LLaVA-v1.5-7B'])
+    # DATASETS.remove('META')
+    # print(DATASETS)
+
+    gr.Markdown(LEADERBORAD_INTRODUCTION.format(
+        # N_MODEL,
+        # N_DATA,
+        EVAL_TIME
+    ))
+    # structs = [abc.abstractproperty() for _ in range(N_DATA)]
 
     with gr.Tabs(elem_classes='tab-buttons') as tabs:
+        for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
+
+            tab_name = filename
+            if filename == "overall":
+                tab_name = '🏅 JudgerBench Main Leaderboard'
+
+            with gr.Tab(tab_name, elem_id=f'tab_{cur_id}', id=cur_id):
+
+                # gr.Markdown(LEADERBOARD_MD['MAIN'])
+                # _, check_box = build_l1_df(MAIN_FIELDS)
+                table = generate_table(filename=filename)
+
+                # type_map = check_box['type_map']
+                type_map = defaultdict(lambda: 'number')
+                type_map['Model'] = 'str'
+                type_map['Class'] = 'str'
+                type_map['Rank'] = 'number'
+
+                # required_fields = gr.State(
+                #     check_box['essential']
+                #     # + ["Average"]
+                # )
+
+                # checkbox_group = gr.CheckboxGroup(
+                #     choices=[item for item in check_box['all'] if item not in required_fields.value],
+                #     value=[item for item in check_box['default'] if item not in required_fields.value],
+                #     label='Evaluation Metrics',
+                #     interactive=True,
+                # )
+
+                # headers = (
+                #     ['Rank'] +
+                #     required_fields.value +
+                #     [item for item in check_box['all'] if item not in required_fields.value]
+                #     # checkbox_group.value
+                # )
+
+                table['Rank'] = list(range(1, len(table) + 1))
+
+                # Rearrange columns
+                if "Class" in table.columns:
+                    starting_columns = ["Rank", "Models", "Class"]
+                else:
+                    starting_columns = ["Rank", "Models"]
+
+                table = table[starting_columns + [ col for col in table.columns if col not in starting_columns ]]
+
+                headers = (
+                    # ['Rank'] +
+                    list(table.columns)
                 )
+
+                if "Class" in table.columns:
+                    def cell_styler(v):
+                        df = v.copy()
+
+                        class_var = df[['Class']].copy()
+
+                        df.loc[:, :] = ''
+                        df[['Class']] = class_var.map(lambda x: f"background-color: {STYLE_CLASS_MAPPING[x]}")
+                        logger.info(df['Class'])
+
+                        return df
+
+                    table_styler = (
+                        table.style.apply(cell_styler, axis=None)
+                        .format(precision=3)
+                    )
+                else:
+                    table_styler = table.style.format(precision=3)
+
+                # with gr.Row():
+                #     model_size = gr.CheckboxGroup(
+                #         choices=MODEL_SIZE,
+                #         value=MODEL_SIZE,
+                #         label='Model Size',
+                #         interactive=True
+                #     )
+                #     model_type = gr.CheckboxGroup(
+                #         choices=MODEL_TYPE,
+                #         value=MODEL_TYPE,
+                #         label='Model Type',
+                #         interactive=True
+                #     )
+                data_component = gr.DataFrame(
+                    value=table_styler,
                     type='pandas',
                     datatype=[type_map[x] for x in headers],
                     interactive=False,
-                    visible=True
-            return comp
-
-        for cbox in [checkbox_group, model_size, model_type]:
-            cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)
-
-        with gr.TabItem('🔍 About', elem_id='about', id=1):
-            gr.Markdown(urlopen(VLMEVALKIT_README).read().decode())
-
-        for i, dataset in enumerate(DATASETS):
-            with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
-                if dataset in LEADERBOARD_MD:
-                    gr.Markdown(LEADERBOARD_MD[dataset])
-
-                s = structs[i]
-                s.table, s.check_box = BUILD_L2_DF(results, dataset)
-                s.type_map = s.check_box['type_map']
-                s.type_map['Rank'] = 'number'
-
-                s.checkbox_group = gr.CheckboxGroup(
-                    choices=s.check_box['all'],
-                    value=s.check_box['required'],
-                    label=f'{dataset} CheckBoxes',
-                    interactive=True,
+                    visible=True
                 )
-
-                def filter_df_l2(dataset_name, fields, model_size, model_type):
-                    s = structs[DATASETS.index(dataset_name)]
-                    headers = ['Rank'] + s.check_box['essential'] + fields
-                    df = cp.deepcopy(s.table)
-                    df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
-                    df = df[df['flag']]
-                    df.pop('flag')
-                    if len(df):
-                        df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
-                        df = df[df['flag']]
-                        df.pop('flag')
+
+                def filter_df(
+                    required_fields,
+                    fields,
+                    # model_size,
+                    # model_type
+                ):
+                    # filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
+                    headers = ['Rank'] + required_fields + fields
+
+                    # new_fields = [field for field in fields if field not in filter_list]
+                    df = generate_table(fields)
+                    logger.info(f"{df.columns=}")
+
+                    # df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+                    # df = df[df['flag']]
+                    # df.pop('flag')
+
+                    # if len(df):
+                    #     df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+                    #     df = df[df['flag']]
+                    #     df.pop('flag')
+
                     df['Rank'] = list(range(1, len(df) + 1))
 
-                    comp = gr.
+                    comp = gr.DataFrame(
                         value=df[headers],
                         type='pandas',
-                        datatype=[
+                        datatype=[type_map[x] for x in headers],
                         interactive=False,
-                        visible=True
+                        visible=True
+                    )
+
                     return comp
 
-        for cbox in [
+                # for cbox in [
+                #     # checkbox_group,
+                #     # model_size,
+                #     # model_type
+                # ]:
+                #     cbox.change(
+                #         fn=refresh_dataframe,
+                #         inputs=[required_fields],
+                #         outputs=data_component
+                #     ).then(
+                #         fn=filter_df,
+                #         inputs=[
+                #             required_fields,
+                #             checkbox_group,
+                #             # model_size,
+                #             # model_type
+                #         ],
+                #         outputs=data_component
+                #     )
+
+        # with gr.Tab('🔍 About', elem_id='about', id=1):
+        #     gr.Markdown(urlopen(VLMEVALKIT_README).read().decode())
+
+        # for i, dataset in enumerate(DATASETS):
+        #     with gr.Tab(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
+        #         if dataset in LEADERBOARD_MD:
+        #             gr.Markdown(LEADERBOARD_MD[dataset])
+
+        #         s = structs[i]
+        #         s.table, s.check_box = build_l2_df(results, dataset)
+        #         s.type_map = s.check_box['type_map']
+        #         s.type_map['Rank'] = 'number'
+
+        #         s.checkbox_group = gr.CheckboxGroup(
+        #             choices=s.check_box['all'],
+        #             value=s.check_box['required'],
+        #             label=f'{dataset} CheckBoxes',
+        #             interactive=True,
+        #         )
+        #         s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
+        #         s.table['Rank'] = list(range(1, len(s.table) + 1))
+
+        #         with gr.Row():
+        #             s.model_size = gr.CheckboxGroup(
+        #                 choices=MODEL_SIZE,
+        #                 value=MODEL_SIZE,
+        #                 label='Model Size',
+        #                 interactive=True
+        #             )
+        #             s.model_type = gr.CheckboxGroup(
+        #                 choices=MODEL_TYPE,
+        #                 value=MODEL_TYPE,
+        #                 label='Model Type',
+        #                 interactive=True
+        #             )
+        #         s.data_component = gr.components.DataFrame(
+        #             value=s.table[s.headers],
+        #             type='pandas',
+        #             datatype=[s.type_map[x] for x in s.headers],
+        #             interactive=False,
+        #             visible=True)
+        #         s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
+
+        #         def filter_df_l2(dataset_name, fields, model_size, model_type):
+        #             s = structs[DATASETS.index(dataset_name)]
+        #             headers = ['Rank'] + s.check_box['essential'] + fields
+        #             df = cp.deepcopy(s.table)
+        #             df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+        #             df = df[df['flag']]
+        #             df.pop('flag')
+        #             if len(df):
+        #                 df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+        #                 df = df[df['flag']]
+        #                 df.pop('flag')
+        #             df['Rank'] = list(range(1, len(df) + 1))
+
+        #             comp = gr.components.DataFrame(
+        #                 value=df[headers],
+        #                 type='pandas',
+        #                 datatype=[s.type_map[x] for x in headers],
+        #                 interactive=False,
+        #                 visible=True)
+        #             return comp
+
+        #         for cbox in [s.checkbox_group, s.model_size, s.model_type]:
+        #             cbox.change(
+        #                 fn=filter_df_l2,
+        #                 inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
+        #                 outputs=s.data_component)
 
     with gr.Row():
        with gr.Accordion('Citation', open=False):
@@ -161,5 +285,73 @@ with gr.Blocks() as demo:
                 label=CITATION_BUTTON_LABEL,
                 elem_id='citation-button')
 
+
 if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--port", type=int)
+    parser.add_argument(
+        "--share",
+        action="store_true",
+        help="Whether to generate a public, shareable link",
+    )
+    parser.add_argument(
+        "--concurrency-count",
+        type=int,
+        default=10,
+        help="The concurrency count of the gradio queue",
+    )
+    parser.add_argument(
+        "--max-threads",
+        type=int,
+        default=200,
+        help="The maximum number of threads available to process non-async functions.",
+    )
+    # parser.add_argument(
+    #     "--gradio-auth-path",
+    #     type=str,
+    #     help='Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3"',
+    #     default=None,
+    # )
+    parser.add_argument(
+        "--gradio-root-path",
+        type=str,
+        help="Sets the gradio root path, eg /abc/def. Useful when running behind a reverse-proxy or at a custom URL path prefix",
+    )
+    parser.add_argument(
+        "--ga-id",
+        type=str,
+        help="the Google Analytics ID",
+        default=None,
+    )
+    parser.add_argument(
+        "--use-remote-storage",
+        action="store_true",
+        default=False,
+        help="Uploads image files to google cloud storage if set to true",
+    )
+    args = parser.parse_args()
+    logger.info(f"args: {args}")
+
+    # Set authorization credentials
+    # auth = None
+    # if args.gradio_auth_path is not None:
+    #     auth = parse_gradio_auth_creds(args.gradio_auth_path)
+
+    demo.queue(
+        default_concurrency_limit=args.concurrency_count,
+        status_update_rate=10,
+        api_open=False,
+    ).launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=args.share,
+        max_threads=args.max_threads,
+        # auth=auth,
+        root_path=args.gradio_root_path,
+        # debug=True,
+        show_error=True,
+        allowed_paths=["../.."]
+    )
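The `Class` coloring added above relies on pandas' `Styler.apply` with `axis=None`: the styling function receives the entire DataFrame and must return a same-shaped DataFrame of CSS strings, one per cell. A minimal standalone sketch of the same idea (the rows and the `palette` dict are illustrative stand-ins for the leaderboard CSVs and `STYLE_CLASS_MAPPING`):

    import pandas as pd

    # Hypothetical palette mirroring STYLE_CLASS_MAPPING in judgerbench/meta_data.py
    palette = {"API": "#82e0aa", "Judge": "#f8c471", "General": "#85c1e9"}

    df = pd.DataFrame({
        "Models": ["GPT-4o-0806", "CJ-1-32B"],  # toy rows, not the real data
        "Class": ["API", "Judge"],
        "Average": [0.818, 0.727],
    })

    def cell_styler(table: pd.DataFrame) -> pd.DataFrame:
        # Start from an all-empty CSS frame, then fill only the Class column.
        css = pd.DataFrame("", index=table.index, columns=table.columns)
        css["Class"] = table["Class"].map(lambda c: f"background-color: {palette[c]}")
        return css

    styler = df.style.apply(cell_styler, axis=None).format(precision=3)
    # app.py hands such a Styler to gr.DataFrame(value=..., type='pandas'),
    # which renders the per-cell background colors.

Returning a CSS frame instead of mutating the data keeps values and styling separate; `format(precision=3)` affects only the display, not the underlying floats.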
commands.sh
ADDED
@@ -0,0 +1,12 @@
+srun \
+    --partition=llmeval \
+    --quotatype=reserved \
+    --job-name=judgerbench_server \
+    --ntasks=1 \
+    --ntasks-per-node=1 \
+    --cpus-per-task=2 \
+    --kill-on-bad-exit=1 \
+    --pty bash
+
+
+bash -i start_gradio_web_server.sh
data/detail_a_cn.csv
ADDED
@@ -0,0 +1,11 @@
+Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
+CJ-1-14B,0.69,0.61,0.51,0.55,0.71,0.68,0.6,0.58,0.61,0.65,0.619,Judge
+GPT-4o-0806,0.77,0.56,0.51,0.53,0.67,0.66,0.63,0.58,0.62,0.58,0.611,API
+CJ-1-32B,0.69,0.58,0.53,0.52,0.71,0.53,0.6,0.61,0.61,0.69,0.607,Judge
+Skywork-llama3.1-8B,0.62,0.58,0.58,0.59,0.63,0.58,0.6,0.61,0.6,0.61,0.6,Judge
+Qwen2.5-72B-Chat,0.65,0.47,0.49,0.47,0.71,0.6,0.57,0.58,0.69,0.6,0.583,General
+CJ-1-7B,0.62,0.54,0.41,0.58,0.7,0.6,0.59,0.56,0.59,0.6,0.579,Judge
+Qwen2-72B-Chat,0.62,0.54,0.34,0.55,0.68,0.63,0.58,0.58,0.62,0.64,0.578,General
+Selftaught-llama3.1-70B,0.62,0.56,0.55,0.48,0.67,0.55,0.57,0.57,0.51,0.61,0.569,Judge
+Qwen2.5-7B-Chat,0.46,0.58,0.36,0.45,0.7,0.53,0.52,0.53,0.52,0.64,0.529,General
+CJ-1-1.5B,0.54,0.58,0.38,0.38,0.62,0.63,0.54,0.52,0.55,0.54,0.528,Judge
data/detail_a_en.csv
ADDED
@@ -0,0 +1,11 @@
+Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
+GPT-4o-0806,0.82,0.53,0.62,0.61,0.83,0.67,0.67,0.73,0.64,0.55,0.667,API
+Skywork-llama3.1-8B,0.69,0.61,0.54,0.62,0.63,0.64,0.6,0.69,0.74,0.53,0.629,Judge
+Qwen2.5-72B-Chat,0.68,0.57,0.57,0.47,0.78,0.64,0.58,0.75,0.61,0.52,0.617,General
+CJ-1-32B,0.66,0.57,0.56,0.59,0.78,0.58,0.55,0.75,0.6,0.49,0.613,Judge
+CJ-1-14B,0.66,0.51,0.57,0.54,0.72,0.61,0.56,0.74,0.61,0.47,0.599,Judge
+Qwen2-72B-Chat,0.63,0.59,0.54,0.49,0.62,0.64,0.6,0.74,0.51,0.52,0.588,General
+CJ-1-7B,0.56,0.56,0.51,0.47,0.68,0.58,0.58,0.75,0.58,0.43,0.57,Judge
+Qwen2.5-7B-Chat,0.54,0.59,0.59,0.46,0.69,0.43,0.61,0.65,0.58,0.52,0.566,General
+CJ-1-1.5B,0.42,0.56,0.56,0.43,0.66,0.47,0.55,0.78,0.64,0.44,0.551,Judge
+Selftaught-llama3.1-70B,0.47,0.45,0.47,0.37,0.45,0.43,0.36,0.58,0.48,0.36,0.442,Judge
data/detail_b_acc.csv
ADDED
@@ -0,0 +1,9 @@
+Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
+CJ-1-32B,0.857,0.806,0.596,0.621,0.72,Judge
+CJ-1-14B,0.839,0.787,0.566,0.602,0.699,Judge
+CJ-1-7B,0.816,0.783,0.564,0.586,0.687,Judge
+Qwen2.5-72B-Chat,0.878,0.677,0.599,0.57,0.681,General
+CJ-1-1.5B,0.822,0.712,0.55,0.43,0.629,Judge
+Qwen2-72B-Chat,0.867,0.692,0.564,0.376,0.625,General
+Selftaught-llama3.1-70B,0.755,0.627,0.538,0.472,0.598,Judge
+Qwen2.5-7B-Chat,0.777,0.67,0.47,0.444,0.59,General
data/detail_b_corr.csv
ADDED
@@ -0,0 +1,9 @@
+Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
+CJ-1-32B,0.973,0.951,0.954,0.975,0.963,Judge
+CJ-1-14B,0.966,0.956,0.965,0.951,0.959,Judge
+CJ-1-7B,0.956,0.936,0.97,0.932,0.948,Judge
+Qwen2.5-72B-Chat,0.964,0.916,0.958,0.912,0.937,General
+Qwen2-72B-Chat,0.937,0.889,0.976,0.936,0.935,General
+CJ-1-1.5B,0.928,0.851,0.981,0.858,0.905,Judge
+Qwen2.5-7B-Chat,0.916,0.681,0.967,0.931,0.874,General
+Selftaught-llama3.1-70B,0.918,0.667,0.95,0.942,0.869,Judge
data/overall.csv
ADDED
@@ -0,0 +1,11 @@
+Models,JDB-A EN,JDB-A CN,JDB-B Acc,JDB-B Corr,JudgerBench,Class
+GPT-4o-0806,0.664,0.608,1,1,0.818,API
+CJ-1-32B,0.614,0.612,0.72,0.963,0.727,Judge
+CJ-1-14B,0.599,0.615,0.699,0.959,0.718,Judge
+Qwen2.5-72B-Chat,0.615,0.59,0.681,0.937,0.706,General
+CJ-1-7B,0.57,0.583,0.687,0.948,0.697,Judge
+Qwen2-72B-Chat,0.588,0.584,0.625,0.935,0.683,General
+CJ-1-1.5B,0.553,0.527,0.629,0.905,0.654,Judge
+Qwen2.5-7B-Chat,0.567,0.535,0.59,0.874,0.641,General
+Selftaught-llama3.1-70B,0.443,0.57,0.598,0.869,0.62,Judge
+Skywork-llama3.1-8B,0.63,0.605,-,-,-,Judge
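One quirk of `overall.csv`: the Skywork-llama3.1-8B row marks the JDB-B columns it was not evaluated on with `-`. When reading the file directly rather than through `generate_table`, telling pandas to treat `-` as missing keeps the score columns numeric (a sketch, path assumed relative to the repo root):

    import pandas as pd

    df = pd.read_csv("data/overall.csv", na_values="-")
    print(df.dtypes)                      # score columns parse as float64
    print(df[df["JudgerBench"].isna()])   # the partially evaluated models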
judgerbench/__init__.py
ADDED
File without changes
meta_data.py → judgerbench/meta_data.py
RENAMED
@@ -1,3 +1,25 @@
+from enum import IntEnum
+from pathlib import Path
+import os
+
+# REPO paths
+REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
+DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
+
+LEADERBOARD_FILE_MAPPING = dict(
+    overall="overall.csv",
+    detail_a_cn="detail_a_cn.csv",
+    detail_a_en="detail_a_en.csv",
+    detail_b_acc="detail_b_acc.csv",
+    detail_b_corr="detail_b_corr.csv",
+)
+
+STYLE_CLASS_MAPPING = {
+    "API": '#82e0aa',
+    "Judge": '#f8c471',
+    "General": '#85c1e9',
+}
+
 # CONSTANTS-URL
 URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
 VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
@@ -9,32 +31,59 @@ CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
     year={2023}
 }"""
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+
 # CONSTANTS-TEXT
-LEADERBORAD_INTRODUCTION = """#
-### Welcome to the OpenVLM Leaderboard! On this leaderboard we share the evaluation results of VLMs obtained by the OpenSource Framework:
-### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆
-### Currently, OpenVLM Leaderboard covers {} different VLMs (including GPT-4v, Gemini, QwenVLPlus, LLaVA, etc.) and {} different multi-modal benchmarks.
+LEADERBORAD_INTRODUCTION = """# JudgerBench Leaderboard
 
+### Welcome to the JudgerBench Leaderboard!
 
+This leaderboard was last updated: {}.
 """
+
 # CONSTANTS-FIELDS
-META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
+# META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified']
+META_FIELDS = [
+    'Model',
+]
+# MAIN_FIELDS = [
+#     'MMBench_V11', 'MMStar', 'MME',
+#     'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+#     'HallusionBench', 'SEEDBench_IMG', 'MMVet',
+#     'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
+#     'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
+# ]
 MAIN_FIELDS = [
-    'MMBench_V11', 'MMStar', 'MME',
-    'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
-    'HallusionBench', 'SEEDBench_IMG', 'MMVet',
-    'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
-    'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
+    'Average',
+    'Accuracy',
+    'Accuracy_CN',
+    'Accuracy_EN',
+    'Corr',
 ]
+# DEFAULT_BENCH = [
+#     'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+#     'HallusionBench', 'MMVet'
+# ]
 DEFAULT_BENCH = [
-    'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
-    'HallusionBench', 'MMVet'
+    'Average',
+    'Accuracy',
+    'Accuracy_CN',
+    'Accuracy_EN',
+    'Corr',
 ]
-MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
-MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
-MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
+
+FIELD_MAPPING = {
+    'model': 'Model',
+    'parameters': 'Param (B)',
+    'average': 'Average',
+    'accuracy': 'Accuracy',
+    'accuracy_cn': 'Accuracy_CN',
+    'accuracy_en': 'Accuracy_EN',
+    'corr': 'Corr',
+}
+
+# MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
+# MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
+# MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
 
 # The README file for each benchmark
 LEADERBOARD_MD = {}
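A side effect of the move into the `judgerbench` package: `DATADIR` is now resolved relative to the repository root and can be redirected through a `DATADIR` environment variable. A hypothetical check of that path logic (the override has to be set before the module is first imported):

    import os

    os.environ["DATADIR"] = "data"  # illustrative; "./data" is already the default

    from judgerbench.meta_data import DATADIR, LEADERBOARD_FILE_MAPPING

    print(DATADIR / LEADERBOARD_FILE_MAPPING["overall"])  # <repo>/data/overall.csv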
judgerbench/preprocess/__init__.py
ADDED
File without changes
gen_table.py → judgerbench/preprocess/gen_table.py
RENAMED
@@ -6,9 +6,19 @@ from urllib.request import urlopen
 import gradio as gr
 import numpy as np
 import pandas as pd
+from pathlib import Path
+from typing import Union, List, Dict
+from loguru import logger
 
-from meta_data import
+from judgerbench.meta_data import (
+    DATADIR,
+    LEADERBOARD_FILE_MAPPING,
+    DEFAULT_BENCH,
+    FIELD_MAPPING,
+    STYLE_CLASS_MAPPING,
+    META_FIELDS,
+    URL
+)
 
 def listinstr(lst, s):
     assert isinstance(lst, list)
@@ -18,7 +28,7 @@ def listinstr(lst, s):
     return False
 
 
-def load_results():
+def load_results_from_url():
     data = json.loads(urlopen(URL).read())
     return data
 
@@ -61,23 +71,35 @@ def model_type_flag(line, FIELDS):
     return False
 
 
-def BUILD_L1_DF(
+def build_l1_df(fields):
     check_box = {}
-    check_box['essential'] = [
+    check_box['essential'] = [
+        # 'Method',
+        # 'Param (B)',
+        'Model',
+    ]
     # revise there to set default dataset
-    check_box['
-
-    check_box['
+    check_box['default'] = DEFAULT_BENCH
+
+    check_box['avg'] = ['Average']
+    check_box['accuracy'] = ['Accuracy_CN', 'Accuracy_EN', 'Accuracy',]
+    check_box['all'] = fields
+
     type_map = defaultdict(lambda: 'number')
-    type_map['Method'] = 'html'
-    type_map['
+    # type_map['Method'] = 'html'
+    type_map['Model'] = 'str'
+    # type_map['Language Model'] = 'str'
+    # type_map['Vision Model'] = 'str'
+    # type_map['OpenSource'] = 'str'
+    # type_map['Verified'] = 'str'
+
     check_box['type_map'] = type_map
 
-    df = generate_table(
+    df = generate_table(fields)
     return df, check_box
 
 
-def BUILD_L2_DF(results, dataset):
+def build_l2_df(results, dataset):
     res = defaultdict(list)
     sub = [v for v in results.values() if dataset in v]
     assert len(sub)
@@ -137,7 +159,7 @@ def BUILD_L2_DF(results, dataset):
     return df, check_box
 
 
-def generate_table(results, fields):
+def generate_table1(results, fields):
 
     def get_mmbench_v11(item):
         assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
@@ -194,3 +216,60 @@ def generate_table(results, fields):
     missing = missing.iloc[::-1]
     df = pd.concat([valid, missing])
     return df
+
+
+def generate_table(
+    fields: List[str] = None,
+    filename: str = None,
+    path: Union[str, Path] = DATADIR / "overall.csv",
+):
+
+    if filename in LEADERBOARD_FILE_MAPPING:
+        path = DATADIR / LEADERBOARD_FILE_MAPPING[filename]
+
+    if filename is None and path is None:
+        raise ValueError("filename and path cannot both be None.")
+
+    REQUIRED_FILEDS = META_FIELDS + [
+        # 'Average'
+    ]
+
+    df = pd.read_csv(path)
+    # df_reshaped = (
+    #     df
+    #     .drop(columns=["dataset", "mode", "version"])
+    #     .melt(
+    #         id_vars=["metric"],
+    #         var_name="model",
+    #         value_name="value"
+    #     )
+    #     .pivot(index=["model"], columns=["metric"], values='value')
+    # )
+    # df_reshaped.columns.name = None
+    # df_reshaped.reset_index(inplace=True)
+    # df_reshaped.rename(columns=FIELD_MAPPING, inplace=True)
+
+    # if fields is not None:
+    #     for field in fields:
+    #         if field not in df_reshaped.columns:
+    #             raise ValueError(f"{field} is not a valid field in leaderboard table.")
+
+    # new_fields = [field for field in FIELD_MAPPING.values() if field in REQUIRED_FILEDS + fields]
+    # logger.info(f"{new_fields=}")
+
+    # df_reshaped = df_reshaped.loc[:,new_fields].copy()
+
+    # valid, missing = df_reshaped[~pd.isna(df_reshaped['Average'])], df_reshaped[pd.isna(df_reshaped['Average'])]
+    # valid = valid.sort_values('Average', ascending=False)
+
+    # if len(fields):
+    #     missing = missing.sort_values(
+    #         'Accuracy' if 'Accuracy' in fields else fields[0],
+    #         ascending=False,
+    #     )
+
+    # df_sorted = pd.concat([valid, missing])
+
+    df_sorted = df
+
+    return df_sorted
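With the reshaping and sorting logic commented out, the new `generate_table` is effectively a thin CSV loader keyed by `LEADERBOARD_FILE_MAPPING`. A hypothetical call, assuming the package is importable (e.g. after `pip install -e .`) and the `data/` CSVs are in place:

    from judgerbench.preprocess.gen_table import generate_table

    df = generate_table(filename="detail_a_cn")  # resolves to DATADIR / "detail_a_cn.csv"
    print(df[["Models", "Average", "Class"]].head())

Note that when `filename` is omitted, the `path` default already points at `overall.csv`, so the `ValueError` branch can only fire if both arguments are explicitly passed as None.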
judgerbench/preprocess/generate_table.py
ADDED
@@ -0,0 +1,27 @@
+#%%
+import pandas as pd
+
+from judgerbench.meta_data import (
+    DATADIR
+)
+
+#%%
+df = pd.read_csv(DATADIR / "summary_20241007_221023.csv")
+df
+
+# %%
+df_reshaped = (
+    df
+    .drop(columns=["dataset", "mode", "version"])
+    .melt(
+        id_vars=["metric"],
+        var_name="model",
+        value_name="value"
+    )
+    .pivot(index=["model"], columns=["metric"], values='value')
+)
+df_reshaped.columns.name = None
+df_reshaped.reset_index(inplace=True)
+
+df_reshaped
+# %%
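The melt-then-pivot in this notebook-style script turns the evaluation summary (one row per metric, one column per model) into leaderboard shape (one row per model, one column per metric). A toy run with made-up numbers, since `summary_20241007_221023.csv` itself is not part of the commit:

    import pandas as pd

    # Hypothetical stand-in for the summary CSV layout
    df = pd.DataFrame({
        "dataset": ["judgerbench", "judgerbench"],
        "version": ["v1", "v1"],
        "mode": ["all", "all"],
        "metric": ["accuracy", "corr"],
        "CJ-1-32B": [0.72, 0.963],
        "GPT-4o-0806": [0.818, 1.0],
    })

    wide = (
        df.drop(columns=["dataset", "mode", "version"])
          .melt(id_vars=["metric"], var_name="model", value_name="value")
          .pivot(index=["model"], columns=["metric"], values="value")
    )
    wide.columns.name = None
    print(wide.reset_index())  # columns: model, accuracy, corr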
pyproject.toml
ADDED
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools >= 64"]
+build-backend = "setuptools.build_meta"
requirements.txt
CHANGED
@@ -1,3 +1,3 @@
-gradio==
+gradio==5.1
 numpy>=1.23.4
 pandas>=1.5.3
setup.py
ADDED
@@ -0,0 +1,53 @@
+"""Python setup.py for project_name package"""
+import io
+import os
+import re
+from setuptools import find_packages, setup
+
+
+def read(*paths, **kwargs):
+    """Read the contents of a text file safely.
+    >>> read("project_name", "VERSION")
+    '0.1.0'
+    >>> read("README.md")
+    ...
+    """
+
+    content = ""
+    with io.open(
+        os.path.join(os.path.dirname(__file__), *paths),
+        encoding=kwargs.get("encoding", "utf8"),
+    ) as open_file:
+        content = open_file.read().strip()
+    return content
+
+
+def read_requirements(path):
+    return [
+        line.strip()
+        for line in read(path).split("\n")
+        if not line.startswith(('"', "#", "-", "git+"))
+    ]
+
+
+setup(
+    name="judgerbench",
+    version='v0.0.1',
+    description="Judger Bench",
+    url="https://huggingface.co/spaces/acylam/judgerbench_leaderboard",
+    long_description=read("README.md"),
+    # long_description_content_type="text/markdown",
+    author="linjunyao",
+    maintainer="linjunyao",
+    package_dir={"": "judgerbench"},
+    packages=find_packages(
+        where="judgerbench",
+        include=["judgerbench", "judgerbench/**/*"],
+        exclude=["tests", ".github"]
+    ),
+    install_requires=read_requirements("requirements.txt"),
+    # entry_points={
+    #     "console_scripts": ["project_name = project_name.__main__:main"]
+    # },
+    # extras_require={"test": read_requirements("requirements-test.txt")},
+)
start_gradio_web_server.sh
ADDED
@@ -0,0 +1,23 @@
+conda activate judgerbench
+
+# GRADIO_HOSTNAME=0.0.0.0
+GRADIO_HOSTNAME=$(hostname)
+GRADIO_PORT=7861
+
+# Set tmp and logs folders for gradio
+export TMPDIR="tmp"
+export LOGDIR="logs"
+export GRADIO_SERVER_NAME="0.0.0.0"
+export no_proxy="$CONTROLLER_HOST,10.140.1.173,0.0.0.0,$(hostname)"
+
+# OpenAI proxy url
+# export OPENAI_PROXY_URL='http://10.1.20.57:23128'
+export OPENAI_PROXY_URL='http://closeai-proxy.pjlab.org.cn:23128'
+
+# Source api keys
+# source set_api_keys.sh
+
+python3 -m app \
+    --host $GRADIO_HOSTNAME \
+    --port $GRADIO_PORT \
+    --concurrency-count 50