Spaces:
Running
Running
LinB203
commited on
Commit
•
ebc5bbb
1
Parent(s):
2f57651
first
Browse files- README.md +6 -6
- app.py +243 -0
- constants.py +75 -0
- file/result.csv +2 -0
- file/sample_to_upload.csv +2 -0
- requirements.txt +70 -0
- src/__pycache__/utils_display.cpython-38.pyc +0 -0
- src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc +0 -0
- src/auto_leaderboard/model_metadata_type.py +30 -0
- src/utils_display.py +99 -0
README.md
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
---
|
2 |
-
title: Video
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
-
license:
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Video-Bench Leaderboard
|
3 |
+
emoji: 🏆
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.40.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
license: cc-by-4.0
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import pandas as pd
|
6 |
+
import json
|
7 |
+
import pdb
|
8 |
+
import tempfile
|
9 |
+
|
10 |
+
from constants import *
|
11 |
+
from src.auto_leaderboard.model_metadata_type import ModelType
|
12 |
+
|
13 |
+
global data_component, filter_component
|
14 |
+
|
15 |
+
|
16 |
+
def upload_file(files):
    """Return the ``name`` attribute of every uploaded file object.

    Args:
        files: Iterable of uploaded file objects, each exposing a ``name``
            attribute (presumably the temp-file path; confirm against the
            Gradio upload component), or ``None`` when nothing was uploaded.

    Returns:
        A list with one ``name`` per uploaded file; an empty list when
        ``files`` is ``None`` or empty.
    """
    # An upload widget with nothing selected hands over None; iterating it
    # would raise TypeError, so treat it as "no files".
    if not files:
        return []
    return [uploaded.name for uploaded in files]
|
19 |
+
|
20 |
+
def add_new_eval(
    input_file,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_type: str,
    model_link: str,
    LLM_type: str,
    LLM_name_textbox: str,
):
    """Add a new leaderboard row, or overwrite an existing one, from an uploaded CSV.

    The upload is expected to hold a header line followed by one line of
    comma-separated scores. The scores are written to the leaderboard CSV
    (``CSV_DIR``) after the three identifying columns (model type, model,
    language model).

    Args:
        input_file: Raw bytes of the uploaded CSV, or ``None`` if nothing
            was uploaded.
        model_name_textbox: Name to use for a first-time submission.
        revision_name_textbox: Name of an existing entry to overwrite; when
            empty, a new row is appended under ``model_name_textbox``.
        model_type: Value stored in the first column.
        model_link: Optional URL; when non-empty the model name is stored as
            a markdown link ``[name](url)``.
        LLM_type: Selected language model, or ``"Other"``.
        LLM_name_textbox: Language-model name used when ``LLM_type`` is
            ``"Other"``.

    Returns:
        ``0`` on success, otherwise a human-readable error string.
    """
    if input_file is None:
        return "Error! Empty file!"

    # splitlines() handles both "\n" and "\r\n" uploads; a plain split("\n")
    # would leave a trailing "\r" glued to the last score.
    lines = input_file.decode("utf-8").splitlines()
    if len(lines) < 2:
        return "Error! The uploaded file has no result row!"
    scores = [field.strip() for field in lines[1].split(",")]

    # Read everything as text so that writing string scores into a row never
    # clashes with an inferred numeric column dtype.
    csv_data = pd.read_csv(CSV_DIR, dtype=str)

    # The first three columns identify the model; every other column is a score.
    n_scores = csv_data.shape[1] - 3
    if len(scores) < n_scores:
        return f"Error! Expected {n_scores} scores but found {len(scores)}!"
    scores = scores[:n_scores]

    LLM_name = LLM_name_textbox if LLM_type == "Other" else LLM_type

    if revision_name_textbox == "":
        # First submission: append after the last existing row.
        row = csv_data.shape[0]
        model_name = model_name_textbox
    else:
        model_name = revision_name_textbox
        existing_names = [_leaderboard_display_name(cell) for cell in csv_data["Model"]]
        if revision_name_textbox in existing_names:
            row = existing_names.index(revision_name_textbox)
        else:
            row = csv_data.shape[0]

    if model_link != "":
        model_name = "[" + model_name + "](" + model_link + ")"

    csv_data.loc[row] = [model_type, model_name, LLM_name, *scores]
    csv_data.to_csv(CSV_DIR, index=False)
    return 0


def _leaderboard_display_name(cell):
    """Return the plain model name of a ``Model`` cell, unwrapping ``[name](url)``."""
    text = str(cell)
    if text.startswith("[") and "](" in text:
        return text[1:text.index("](")]
    # Entries stored without a link are already the plain name.
    return text
|
86 |
+
|
87 |
+
def get_baseline_df():
    """Load the leaderboard restricted to the currently checked columns.

    Reads ``CSV_DIR`` as text, orders the rows by "Avg. All" from highest to
    lowest, and keeps ``MODEL_INFO`` plus whatever ``checkbox_group.value``
    currently holds.

    Returns:
        A DataFrame of strings with the selected columns, best average first.
    """
    df = pd.read_csv(CSV_DIR, dtype=str)
    # Cells are strings, so compare them numerically; a plain string sort
    # would rank "9" above "10". Unparseable cells become NaN and sort last.
    df = df.sort_values(
        by="Avg. All",
        ascending=False,
        kind="stable",
        key=lambda column: pd.to_numeric(column, errors="coerce"),
    )
    present_columns = MODEL_INFO + checkbox_group.value
    return df[present_columns]
|
94 |
+
|
95 |
+
def get_all_df():
    """Load the full leaderboard, ordered by "Avg. All" from highest to lowest.

    Returns:
        A DataFrame holding every column of ``CSV_DIR`` as strings.
    """
    df = pd.read_csv(CSV_DIR, dtype=str)
    # Cells are strings, so compare them numerically; a plain string sort
    # would rank "9" above "10". Unparseable cells become NaN and sort last.
    return df.sort_values(
        by="Avg. All",
        ascending=False,
        kind="stable",
        key=lambda column: pd.to_numeric(column, errors="coerce"),
    )
|
99 |
+
|
100 |
+
block = gr.Blocks()
|
101 |
+
|
102 |
+
|
103 |
+
with block:
|
104 |
+
gr.Markdown(
|
105 |
+
LEADERBORAD_INTRODUCTION
|
106 |
+
)
|
107 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
108 |
+
with gr.TabItem("🏅 Video Benchmark", elem_id="video-benchmark-tab-table", id=0):
|
109 |
+
with gr.Row():
|
110 |
+
with gr.Accordion("Citation", open=False):
|
111 |
+
citation_button = gr.Textbox(
|
112 |
+
value=CITATION_BUTTON_TEXT,
|
113 |
+
label=CITATION_BUTTON_LABEL,
|
114 |
+
elem_id="citation-button",
|
115 |
+
).style(show_copy_button=True)
|
116 |
+
|
117 |
+
gr.Markdown(
|
118 |
+
TABLE_INTRODUCTION
|
119 |
+
)
|
120 |
+
|
121 |
+
# selection for column part:
|
122 |
+
checkbox_group = gr.CheckboxGroup(
|
123 |
+
choices=TASK_INFO_v2,
|
124 |
+
value=AVG_INFO,
|
125 |
+
label="Select options",
|
126 |
+
interactive=True,
|
127 |
+
)
|
128 |
+
|
129 |
+
# Create the dataframe component
|
130 |
+
data_component = gr.components.Dataframe(
|
131 |
+
value=get_baseline_df,
|
132 |
+
headers=COLUMN_NAMES,
|
133 |
+
type="pandas",
|
134 |
+
datatype=DATA_TITILE_TYPE,
|
135 |
+
interactive=False,
|
136 |
+
visible=True,
|
137 |
+
)
|
138 |
+
|
139 |
+
def on_checkbox_group_change(selected_columns):
    """Rebuild the leaderboard table for the currently checked task columns.

    Args:
        selected_columns: Column names currently checked in the checkbox group.

    Returns:
        The ``value`` of a freshly built Dataframe component holding the
        model-info columns plus the selected task columns.
    """
    # Re-order the selection to follow TASK_INFO_v2 rather than click order.
    selected_columns = [item for item in TASK_INFO_v2 if item in selected_columns]
    present_columns = MODEL_INFO + selected_columns
    updated_data = get_all_df()[present_columns]

    # Rank by the first selected task column. With every box unchecked there
    # is no such column (indexing it raised IndexError before), so the order
    # returned by get_all_df() is kept.
    if selected_columns:
        updated_data = updated_data.sort_values(
            by=selected_columns[0],
            ascending=False,
            # Cells are strings; compare them as numbers so "10" beats "9".
            key=lambda column: pd.to_numeric(column, errors="coerce"),
        )

    update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in present_columns]

    filter_component = gr.components.Dataframe(
        value=updated_data,
        headers=present_columns,
        type="pandas",
        datatype=update_datatype,
        interactive=False,
        visible=True,
    )

    return filter_component.value
|
159 |
+
|
160 |
+
# Bind the checkbox group to its change handler
|
161 |
+
checkbox_group.change(fn=on_checkbox_group_change, inputs=checkbox_group, outputs=data_component)
|
162 |
+
'''
|
163 |
+
# table 2
|
164 |
+
with gr.TabItem("📝 About", elem_id="seed-benchmark-tab-table", id=2):
|
165 |
+
gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
|
166 |
+
'''
|
167 |
+
# table 3
|
168 |
+
with gr.TabItem("🚀 Submit here! ", elem_id="seed-benchmark-tab-table", id=3):
|
169 |
+
gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
|
170 |
+
|
171 |
+
with gr.Row():
|
172 |
+
gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
|
173 |
+
|
174 |
+
with gr.Row():
|
175 |
+
gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")
|
176 |
+
|
177 |
+
with gr.Row():
|
178 |
+
with gr.Column():
|
179 |
+
model_name_textbox = gr.Textbox(
|
180 |
+
label="Model name", placeholder="LLaMA-7B"
|
181 |
+
)
|
182 |
+
revision_name_textbox = gr.Textbox(
|
183 |
+
label="Revision Model Name", placeholder="LLaMA-7B"
|
184 |
+
)
|
185 |
+
model_type = gr.Dropdown(
|
186 |
+
choices=[
|
187 |
+
"LLM",
|
188 |
+
"ImageLLM",
|
189 |
+
"VideoLLM",
|
190 |
+
"Other",
|
191 |
+
],
|
192 |
+
label="Model type",
|
193 |
+
multiselect=False,
|
194 |
+
value="ImageLLM",
|
195 |
+
interactive=True,
|
196 |
+
)
|
197 |
+
|
198 |
+
with gr.Column():
|
199 |
+
|
200 |
+
LLM_type = gr.Dropdown(
|
201 |
+
choices=["Vicuna-7B", "Flan-T5-XL", "LLaMA-7B", "Other"],
|
202 |
+
label="LLM type",
|
203 |
+
multiselect=False,
|
204 |
+
value="LLaMA-7B",
|
205 |
+
interactive=True,
|
206 |
+
)
|
207 |
+
LLM_name_textbox = gr.Textbox(
|
208 |
+
label="LLM model (for Other)",
|
209 |
+
placeholder="LLaMA-13B"
|
210 |
+
)
|
211 |
+
model_link = gr.Textbox(
|
212 |
+
label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
|
213 |
+
)
|
214 |
+
|
215 |
+
with gr.Column():
|
216 |
+
|
217 |
+
input_file = gr.File(label="Click to Upload a csv File", type='binary')
|
218 |
+
submit_button = gr.Button("Submit Eval")
|
219 |
+
|
220 |
+
submission_result = gr.Markdown()
|
221 |
+
submit_button.click(
|
222 |
+
add_new_eval,
|
223 |
+
inputs=[
|
224 |
+
input_file,
|
225 |
+
model_name_textbox,
|
226 |
+
revision_name_textbox,
|
227 |
+
model_type,
|
228 |
+
model_link,
|
229 |
+
LLM_type,
|
230 |
+
LLM_name_textbox,
|
231 |
+
],
|
232 |
+
# outputs = submission_result,
|
233 |
+
)
|
234 |
+
|
235 |
+
with gr.Row():
|
236 |
+
data_run = gr.Button("Refresh")
|
237 |
+
data_run.click(
|
238 |
+
get_baseline_df, outputs=data_component
|
239 |
+
)
|
240 |
+
|
241 |
+
# block.load(get_baseline_df, outputs=data_title)
|
242 |
+
|
243 |
+
block.launch()
|
constants.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This module stores the constants used by the leaderboard app.
|
2 |
+
# Identifying columns that are always shown for every model.
MODEL_INFO = ["Model Type", "Model", "Language Model"]

# Score columns, in the order they appear in the results CSV header.
TASK_INFO_v2 = [
    "Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making",
    "ActivityNet", "MSVD", "MSRVTT", "TGIF", "Youcook2", "Ucfcrime", "MOT",
    "TVQA", "MV", "NBA",
    "Driving-exam", "Driving-decision-making", "SQA3D",
]

# Aggregate score columns (used as the default checkbox selection).
AVG_INFO = ["Avg. All", "Avg. Video-Exclusive", "Avg. Prior-Knowledge QA", "Avg. Decision-Making"]

# One Gradio datatype per entry of COLUMN_NAMES. The first four columns
# (model info plus "Avg. All") keep the "markdown" type they already had;
# every remaining score column is "number". Deriving the lengths keeps this
# list aligned with COLUMN_NAMES: the previous hand-written list was three
# entries short, so looking up the last three task columns raised IndexError.
DATA_TITILE_TYPE = ["markdown"] * (len(MODEL_INFO) + 1) + ["number"] * (len(TASK_INFO_v2) - 1)

# Path of the CSV file that backs the leaderboard.
CSV_DIR = "./file/result.csv"

# Every column of the leaderboard, in display order.
COLUMN_NAMES = MODEL_INFO + TASK_INFO_v2
|
18 |
+
|
19 |
+
|
20 |
+
LEADERBORAD_INTRODUCTION = """# Video-Bench Leaderboard
|
21 |
+
|
22 |
+
Welcome to the leaderboard of the Video-Bench! 🏆
|
23 |
+
Video-Bench consists of 15K questions with human-like video for evaluating Video-LLMs, covering three-level and 13 evaluation dimensions including both the spatial and temporal understanding.
|
24 |
+
Please refer to [our paper](https://arxiv.org/abs/2311.16103) for more details.
|
25 |
+
"""
|
26 |
+
|
27 |
+
SUBMIT_INTRODUCTION = """# Submit Introduction
|
28 |
+
1. Obtain JSON file from our [github repository](https://github.com/AILab-CVC/SEED-Bench#leaderboard-submit) after evaluation. For example, you can obtain InstructBLIP's JSON file as results/results.json after running
|
29 |
+
```shell
|
30 |
+
python eval.py --model instruct_blip --anno_path SEED-Bench.json --output-dir results
|
31 |
+
```
|
32 |
+
2. If you want to update model performance by uploading new results, please ensure 'Model Name Revision' is the same as what's shown in the leaderboard. For example, if you want to modify InstructBLIP's performance, you need to fill in 'InstructBLIP' in 'Revision Model Name'.
|
33 |
+
3. Please provide the correct link of your model's repository for each submission.
|
34 |
+
4. For the evaluation dimension, you can choose "All/Image/Video", and the results of dimensions that are not evaluated will be set to zero.
|
35 |
+
5. After clicking 'Submit Eval', you can click 'Refresh' to obtain the latest result in the leaderboard.
|
36 |
+
|
37 |
+
## Submit Example
|
38 |
+
For example, if you want to upload InstructBLIP's result in the leaderboard, you need to:
|
39 |
+
1. Fill in 'InstructBLIP' in 'Model Name' if it is your first time to submit your result (You can leave 'Revision Model Name' blank).
|
40 |
+
2. Fill in 'InstructBLIP' in 'Revision Model Name' if you want to update your result (You can leave 'Model Name' blank).
|
41 |
+
2. Select 'ImageLLM' in 'Model Type'.
|
42 |
+
3. Fill in 'https://github.com/salesforce/LAVIS' in 'Model Link'.
|
43 |
+
4. Select 'Flan-T5-XL' in 'LLM Type'.
|
44 |
+
5. Select 'All' in 'Evaluation Dimension'.
|
45 |
+
6. Upload results.json.
|
46 |
+
7. Click the 'Submit Eval' button.
|
47 |
+
8. Click 'Refresh' to obtain the uploaded leaderboard.
|
48 |
+
"""
|
49 |
+
|
50 |
+
# Markdown shown above the leaderboard table.
TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
We use accuracy(%) as the primary evaluation metric for each task.
"""
|
53 |
+
|
54 |
+
LEADERBORAD_INFO = """
|
55 |
+
Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
|
56 |
+
In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
|
57 |
+
SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
|
58 |
+
We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
|
59 |
+
Multiple-choice questions with groundtruth options derived from human annotation enables an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
|
60 |
+
We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
|
61 |
+
By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
|
62 |
+
"""
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
67 |
+
# BibTeX entry offered for copying in the "Citation" box. The entry is closed
# by exactly one "}"; the stray second brace made the snippet invalid BibTeX.
CITATION_BUTTON_TEXT = r"""@misc{ning2023videobench,
      title={Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models},
      author={Munan Ning and Bin Zhu and Yujia Xie and Bin Lin and Jiaxi Cui and Lu Yuan and Dongdong Chen and Li Yuan},
      year={2023},
      eprint={2311.16103},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}"""
|
file/result.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Model Type,Model,Language Model,Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
|
2 |
+
LLM,[Flan-T5](https://huggingface.co/google/flan-t5-xl),Flan-T5-XL,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
|
file/sample_to_upload.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Avg. All,Avg. Video-Exclusive,Avg. Prior-Knowledge QA,Avg. Decision-Making,ActivityNet,MSVD,MSRVTT,TGIF,Youcook2,Ucfcrime,MOT,TVQA,MV,NBA,Driving-exam,Driving-decision-making,SQA3D
|
2 |
+
4,2,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1
|
requirements.txt
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.1.0
|
2 |
+
aiohttp==3.8.4
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==4.2.2
|
5 |
+
anyio==3.6.2
|
6 |
+
APScheduler==3.10.1
|
7 |
+
async-timeout==4.0.2
|
8 |
+
attrs==23.1.0
|
9 |
+
certifi==2022.12.7
|
10 |
+
charset-normalizer==3.1.0
|
11 |
+
click==8.1.3
|
12 |
+
contourpy==1.0.7
|
13 |
+
cycler==0.11.0
|
14 |
+
datasets==2.12.0
|
15 |
+
entrypoints==0.4
|
16 |
+
fastapi==0.95.1
|
17 |
+
ffmpy==0.3.0
|
18 |
+
filelock==3.11.0
|
19 |
+
fonttools==4.39.3
|
20 |
+
frozenlist==1.3.3
|
21 |
+
fsspec==2023.4.0
|
22 |
+
gradio==3.27.0
|
23 |
+
gradio_client==0.1.3
|
24 |
+
h11==0.14.0
|
25 |
+
httpcore==0.17.0
|
26 |
+
httpx==0.24.0
|
27 |
+
huggingface-hub==0.13.4
|
28 |
+
idna==3.4
|
29 |
+
Jinja2==3.1.2
|
30 |
+
jsonschema==4.17.3
|
31 |
+
kiwisolver==1.4.4
|
32 |
+
linkify-it-py==2.0.0
|
33 |
+
markdown-it-py==2.2.0
|
34 |
+
MarkupSafe==2.1.2
|
35 |
+
matplotlib==3.7.1
|
36 |
+
mdit-py-plugins==0.3.3
|
37 |
+
mdurl==0.1.2
|
38 |
+
multidict==6.0.4
|
39 |
+
numpy==1.24.2
|
40 |
+
orjson==3.8.10
|
41 |
+
packaging==23.1
|
42 |
+
pandas==2.0.0
|
43 |
+
Pillow==9.5.0
|
44 |
+
plotly==5.14.1
|
45 |
+
pyarrow==11.0.0
|
46 |
+
pydantic==1.10.7
|
47 |
+
pydub==0.25.1
|
48 |
+
pyparsing==3.0.9
|
49 |
+
pyrsistent==0.19.3
|
50 |
+
python-dateutil==2.8.2
|
51 |
+
python-multipart==0.0.6
|
52 |
+
pytz==2023.3
|
53 |
+
pytz-deprecation-shim==0.1.0.post0
|
54 |
+
PyYAML==6.0
|
55 |
+
requests==2.28.2
|
56 |
+
semantic-version==2.10.0
|
57 |
+
six==1.16.0
|
58 |
+
sniffio==1.3.0
|
59 |
+
starlette==0.26.1
|
60 |
+
toolz==0.12.0
|
61 |
+
tqdm==4.65.0
|
62 |
+
transformers==4.28.1
|
63 |
+
typing_extensions==4.5.0
|
64 |
+
tzdata==2023.3
|
65 |
+
tzlocal==4.3
|
66 |
+
uc-micro-py==1.0.1
|
67 |
+
urllib3==1.26.15
|
68 |
+
uvicorn==0.21.1
|
69 |
+
websockets==11.0.1
|
70 |
+
yarl==1.8.2
|
src/__pycache__/utils_display.cpython-38.pyc
ADDED
Binary file (4.22 kB). View file
|
|
src/auto_leaderboard/__pycache__/model_metadata_type.cpython-38.pyc
ADDED
Binary file (1.17 kB). View file
|
|
src/auto_leaderboard/model_metadata_type.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from enum import Enum
|
3 |
+
import glob
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from typing import Dict, List
|
7 |
+
|
8 |
+
from ..utils_display import AutoEvalColumn
|
9 |
+
|
10 |
+
@dataclass(frozen=True)
class ModelInfo:
    """Display metadata for one model category.

    Frozen so that instances are hashable and can be used as enum values
    without being modified after creation.
    """

    name: str
    symbol: str  # emoji


# Emoji shown for each model category, keyed by category name.
model_type_symbols = {
    "LLM": "🟢",
    "ImageLLM": "🔶",
    "VideoLLM": "⭕",
    "Other": "🟦",
}


class ModelType(Enum):
    """Model categories known to the leaderboard.

    Each member takes its emoji from ``model_type_symbols`` so the two
    tables cannot drift apart. Member names are kept as-is for backward
    compatibility.
    """

    PT = ModelInfo(name="LLM", symbol=model_type_symbols["LLM"])
    FT = ModelInfo(name="ImageLLM", symbol=model_type_symbols["ImageLLM"])
    IFT = ModelInfo(name="VideoLLM", symbol=model_type_symbols["VideoLLM"])
    RL = ModelInfo(name="Other", symbol=model_type_symbols["Other"])

    def to_str(self, separator=" "):
        """Return ``"<symbol><separator><name>"`` for this category."""
        return f"{self.value.symbol}{separator}{self.value.name}"
|
30 |
+
|
src/utils_display.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
|
3 |
+
# These classes are for user facing column names, to avoid having to change them
|
4 |
+
# all around the code when a modif is needed
|
5 |
+
@dataclass
|
6 |
+
class ColumnContent:
|
7 |
+
name: str
|
8 |
+
type: str
|
9 |
+
displayed_by_default: bool
|
10 |
+
hidden: bool = False
|
11 |
+
|
12 |
+
def fields(raw_class):
    """Collect the values of the attributes declared directly on ``raw_class``.

    Any attribute whose name starts with ``__`` or ends with ``__`` is
    skipped; the remaining values are returned in declaration order.
    """
    collected = []
    for attr_name, attr_value in vars(raw_class).items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
|
14 |
+
|
15 |
+
@dataclass(frozen=True)
|
16 |
+
class AutoEvalColumn: # Auto evals column
|
17 |
+
model_type_symbol = ColumnContent("T", "str", True)
|
18 |
+
model = ColumnContent("Model", "markdown", True)
|
19 |
+
average = ColumnContent("Average ⬆️", "number", True)
|
20 |
+
arc = ColumnContent("ARC", "number", True)
|
21 |
+
hellaswag = ColumnContent("HellaSwag", "number", True)
|
22 |
+
mmlu = ColumnContent("MMLU", "number", True)
|
23 |
+
truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
24 |
+
model_type = ColumnContent("Type", "str", False)
|
25 |
+
precision = ColumnContent("Precision", "str", False, True)
|
26 |
+
license = ColumnContent("Hub License", "str", False)
|
27 |
+
params = ColumnContent("#Params (B)", "number", False)
|
28 |
+
likes = ColumnContent("Hub ❤️", "number", False)
|
29 |
+
revision = ColumnContent("Model sha", "str", False, False)
|
30 |
+
dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
|
31 |
+
|
32 |
+
@dataclass(frozen=True)
|
33 |
+
class EloEvalColumn: # Elo evals column
|
34 |
+
model = ColumnContent("Model", "markdown", True)
|
35 |
+
gpt4 = ColumnContent("GPT-4 (all)", "number", True)
|
36 |
+
human_all = ColumnContent("Human (all)", "number", True)
|
37 |
+
human_instruct = ColumnContent("Human (instruct)", "number", True)
|
38 |
+
human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
|
39 |
+
|
40 |
+
|
41 |
+
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Column descriptors for the evaluation-queue table."""

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "bool", True)
    # The third argument is the boolean ``displayed_by_default``; the string
    # "Original" used to be passed here, which was only truthy by accident.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
|
49 |
+
|
50 |
+
# Hub ids that should link to the LLaMA announcement instead of a Hub page.
LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]


KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
MODEL_PAGE = "https://huggingface.co/models"
LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
# This used to be a second assignment to VICUNA_LINK, which silently
# replaced the lmsys link above; it now has its own name.
STABLE_VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"

# Short display names that have a dedicated link.
_SHORT_NAME_LINKS = {
    "dolly-12b": DOLLY_LINK,
    "vicuna-13b": VICUNA_LINK,
    "koala-13b": KOALA_LINK,
    "oasst-12b": OASST_LINK,
}


def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens ``link`` in a new tab, labelled ``model_name``."""
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    """Return an HTML link for ``model_name``.

    The link defaults to ``https://huggingface.co/<model_name>``. A few
    known ids are special-cased to point elsewhere and, for some, to be
    displayed under a shorter name.

    Args:
        model_name: Hub id (``org/name``) or a known short name.

    Returns:
        The HTML anchor produced by ``model_hyperlink``.
    """
    link = f"https://huggingface.co/{model_name}"

    if model_name in LLAMAS:
        link = LLAMA_LINK
        model_name = model_name.split("/")[1]
    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
        link = STABLE_VICUNA_LINK
        model_name = "stable-vicuna-13b"
    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
        link = ALPACA_LINK
        model_name = "alpaca-13b"

    # Applied after the renames above, so a shortened name can still match.
    link = _SHORT_NAME_LINKS.get(model_name, link)

    return model_hyperlink(link, model_name)
|
91 |
+
|
92 |
+
def styled_error(error):
    """Return ``error`` wrapped in a centered red HTML paragraph."""
    return _styled_paragraph("red", error)


def styled_warning(warn):
    """Return ``warn`` wrapped in a centered orange HTML paragraph."""
    return _styled_paragraph("orange", warn)


def styled_message(message):
    """Return ``message`` wrapped in a centered green HTML paragraph."""
    return _styled_paragraph("green", message)


def _styled_paragraph(color, text):
    """Build the shared ``<p>`` markup so the three variants stay in sync."""
    return f"<p style='color: {color}; font-size: 20px; text-align: center;'>{text}</p>"
|