cccjc committed
Commit 8553d06
1 Parent(s): 955908d

dump from mock space

.gitattributes CHANGED
@@ -25,6 +25,7 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -32,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,13 +1,228 @@
-auto_evals/
-venv/
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### Python ###
+# Byte-compiled / optimized / DLL files
 __pycache__/
-.env
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
 .ipynb_checkpoints
-*ipynb
-.vscode/
-
-eval-queue/
-eval-results/
-eval-queue-bk/
-eval-results-bk/
-logs/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
README.md CHANGED
@@ -1,45 +1,12 @@
 ---
-title: MEGA Bench
+title: MEGA-Bench
 emoji: 🥇
-colorFrom: green
+colorFrom: blue
 colorTo: indigo
 sdk: gradio
+sdk_version: 5.1.0
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description: The space for the leaderboard and information of MEGA-Bench
 ---
 
-# Start the configuration
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-
-# Code logic for more complex edits
-
-You'll find
-- the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -1,204 +1,79 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+from utils import get_leaderboard_data, SUPER_GROUPS, MODEL_GROUPS
+import os
+from constants import *
+
+# Get the directory of the current script
+current_dir = os.path.dirname(os.path.abspath(__file__))
+
+# Construct the path to the CSS file
+css_file = os.path.join(current_dir, "static", "css", "style.css")
+
+# Read the CSS file
+with open(css_file, "r") as f:
+    css = f.read()
+
+def update_leaderboard(selected_super_group, selected_model_group):
+    headers, data = get_leaderboard_data(selected_super_group, selected_model_group)
+    return gr.Dataframe(
+        value=data,
+        headers=headers,
+        datatype=["str"] + ["number"] * (len(headers) - 1),
     )
-except Exception:
-    restart_space()
-

-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
-        raise ValueError("Leaderboard DataFrame is empty or None.")
-    return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
-        ],
-        bool_checkboxgroup_label="Hide models",
-        interactive=False,
+with gr.Blocks(css=css) as block:
+    gr.Markdown(
+        LEADERBOARD_INTRODUCTION
     )
-
-
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
+        with gr.TabItem("📊 MEGA-Bench", elem_id="qa-tab-table1", id=1):
+            with gr.Row():
+                with gr.Accordion("Citation", open=False):
+                    citation_button = gr.Textbox(
+                        value=CITATION_BUTTON_TEXT,
+                        label=CITATION_BUTTON_LABEL,
+                        elem_id="citation-button",
+                        lines=10,
+                    )
+            gr.Markdown(
+                TABLE_INTRODUCTION
+            )

-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+                super_group_selector = gr.Radio(
+                    choices=list(SUPER_GROUPS.keys()),
+                    label="Select a dimension to display breakdown results",
+                    value=list(SUPER_GROUPS.keys())[0]
+                )
+                model_group_selector = gr.Radio(
+                    choices=list(MODEL_GROUPS.keys()),
+                    label="Select a model group",
+                    value="All"
+                )
+
+            initial_headers, initial_data = get_leaderboard_data(list(SUPER_GROUPS.keys())[0], "All")
+            data_component = gr.Dataframe(
+                value=initial_data,
+                headers=initial_headers,
+                datatype=["str"] + ["number"] * (len(initial_headers) - 1),
+                interactive=False,
+                elem_classes="custom-dataframe",
+            )
+            refresh_button = gr.Button("Refresh")
+            refresh_button.click(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
+            super_group_selector.change(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])
+            model_group_selector.change(fn=update_leaderboard, inputs=[super_group_selector, model_group_selector], outputs=[data_component])

+        with gr.TabItem("📝 Data Information", elem_id="qa-tab-table2", id=2):
+            gr.Markdown(DATA_INFO, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="submit-tab", id=3):
             with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
+                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )

-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
+if __name__ == "__main__":
+    block.launch(share=True)
+    #block.launch(server_name="127.0.0.1", server_port=7860)

-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
constants.py ADDED
@@ -0,0 +1,81 @@
+import os
+
+HF_TOKEN = os.environ.get("HF_TOKEN")
+
+LEADERBOARD_INTRODUCTION = """# MEGA-Bench Leaderboard
+
+## 🚀 Introduction
+
+[MEGA-Bench](https://tiger-ai-lab.github.io/MEGA-Bench/) is a comprehensive benchmark scaling multimodal evaluation to 500+ real-world tasks!
+
+We aim to provide cost-effective and accurate evaluation for multimodal models, covering a wide range of real-world tasks. You don't have to run models on dozens of benchmarks -- MEGA-Bench delivers a comprehensive performance report in a single benchmark.
+
+## 🧐 Highlights of MEGA-Bench
+
+- 505 diverse tasks evaluating multimodal models across 8 grand application types, 7 input visual formats, 6 output formats, and 10 general multimodal skills, covering single-image, multi-image, and video tasks
+- Moves beyond multiple-choice questions, offering diverse output formats like numbers, code, LaTeX, phrases, free-form responses, and more. We developed 45 customized metrics to accurately evaluate these diverse outputs
+- Focuses on task diversity rather than repetitive examples, ensuring cost-efficient evaluation
+- Provides fine-grained capability reports across application type, input/output formats, and required skills
+
+
+## 🔨 Systematic Annotation Process
+
+- Guided by an initial application-driven taxonomy tree
+- 16 expert annotators contributing to a 2-round process to develop 505 tasks
+- Utilizes advanced tools for task design, review, and quality control
+- Ensures high-quality data through continuous refinement and balanced task distribution
+
+
+## 📊🔍 Results & Takeaways from Evaluating Top Models
+
+- GPT-4o leads the benchmark, outperforming the second-best model, Claude 3.5, by 3.5%
+- Qwen2-VL stands out among open-source models, nearing flagship-level performance
+- Chain-of-Thought (CoT) improves proprietary models but has limited impact on open-source models
+- Efficiency models like Gemini 1.5 Flash perform well but struggle with UI and document tasks
+- Many open-source models face challenges in adhering to output format instructions
+
+## 🎯 Interactive Visualization
+
+Visit our [project page](https://tiger-ai-lab.github.io/MEGA-Bench/) to explore the interactive task taxonomy and radar maps, offering deep insights into model capabilities across multiple dimensions. Discover a comprehensive breakdown far beyond single-score evaluations.
+
+
+## 📚 More Information
+
+- Our evaluation pipeline will soon be available on our GitHub: https://github.com/TIGER-AI-Lab/MEGA-Bench.
+- Check full details of our paper at [https://arxiv.org/abs/2410.10563](https://arxiv.org/abs/2410.10563)
+- Hugging Face Datasets: [https://huggingface.co/datasets/TIGER-Lab/MEGA-Bench](https://huggingface.co/datasets/TIGER-Lab/MEGA-Bench)
+- GitHub: [https://github.com/TIGER-AI-Lab/MEGA-Bench](https://github.com/TIGER-AI-Lab/MEGA-Bench)
+
+"""
+
+TABLE_INTRODUCTION = """
+"""
+
+DATA_INFO = """
+### Data Sources
+The data sources of MEGA-Bench tasks fall into three main types:
+- **Purely Self-designed:** The task is designed entirely by the annotator, and the annotator sources the image or video resources from the Internet or even creates them using code or a simulator.
+- **Inspired and adapted from existing benchmarks:** The task is inspired by existing benchmarks or datasets. The annotator collects the raw image/video data from existing datasets but does not use the original annotation. The annotator redesigns/repurposes the data by writing concrete task descriptions and creating new questions and answers, or using scripts to re-process the data for the designed task.
+- **Directly converted from existing benchmarks:** The task is directly converted from existing benchmarks or datasets. The annotator randomly samples a subset from the existing benchmark, directly using its image/video and the annotation without redesign.
+
+In our annotation process, the first two task types are encouraged. The task reviewers strictly control the number of the third type and reject tasks if an annotator submits many tasks of the third type.
+
+Please refer to Table 17 of our [paper](https://arxiv.org/abs/2410.10563) for the detailed data source of all tasks in MEGA-Bench.
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite our paper and evaluation results below"
+CITATION_BUTTON_TEXT = r"""
+@article{chen2024mega-bench,
+  title={MEGA-Bench: Scaling Multimodal Evaluation to over 500 Real-World Tasks},
+  author={Chen, Jiacheng and Liang, Tianhao and Siu, Sherman and Wang, Zhengqing and Wang, Kai and Wang, Yubo and Ni, Yuansheng and Zhu, Wang and Jiang, Ziyan and Lyu, Bohan and Jiang, Dongfu and He, Xuan and Liu, Yuan and Hu, Hexiang and Yue, Xiang and Chen, Wenhu},
+  journal={arXiv preprint arXiv:2410.10563},
+  year={2024},
+}
+"""
+
+SUBMIT_INTRODUCTION = """# Submit on MEGA-Bench Leaderboard
+
+We will provide details on how to submit the results files once our evaluation pipeline is released on our [GitHub repository](https://github.com/TIGER-AI-Lab/MEGA-Bench).
+
+
+"""
requirements.txt CHANGED
@@ -1,16 +1,18 @@
-APScheduler
-black
-datasets
-gradio
-gradio[oauth]
-gradio_leaderboard==0.0.9
-gradio_client
-huggingface-hub>=0.18.0
-matplotlib
-numpy
-pandas
-python-dateutil
-tqdm
-transformers
+APScheduler==3.10.1
+black==23.11.0
+click==8.1.3
+datasets==2.14.5
+gradio==5.1.0
+gradio_client==1.4.0
+huggingface-hub==0.25.2
+matplotlib==3.7.1
+numpy==1.24.2
+pandas==2.0.0
+python-dateutil==2.8.2
+requests==2.28.2
+tqdm==4.65.0
+transformers==4.35.2
 tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate==0.24.1
 sentencepiece
static/css/style.css ADDED
@@ -0,0 +1,62 @@
+.custom-dataframe {
+    width: 100% !important;
+    overflow-x: auto !important;
+}
+
+.custom-dataframe table {
+    width: auto !important;
+    min-width: 100% !important;
+    font-size: 14px !important;
+}
+
+.custom-dataframe thead th {
+    padding: 4px 8px !important;
+    text-align: center !important;
+    vertical-align: middle !important;
+    white-space: nowrap !important;
+    overflow: visible !important;
+    font-size: 12px !important;
+    font-weight: bold !important;
+    line-height: 1.2 !important;
+}
+
+.custom-dataframe tbody td {
+    padding: 4px 8px !important;
+    text-align: right !important;
+    vertical-align: middle !important;
+    white-space: nowrap !important;
+    overflow: visible !important;
+    line-height: 1.2 !important;
+}
+
+.custom-dataframe tbody td:first-child {
+    text-align: left !important;
+}
+
+/* Adjust the sort indicator position */
+.custom-dataframe thead th::after {
+    font-size: 12px !important;
+    line-height: 1 !important;
+    margin-left: 4px !important;
+}
+
+/* Style for global result columns */
+.custom-dataframe thead th:nth-child(-n+4),
+.custom-dataframe tbody td:nth-child(-n+4) {
+    background-color: #f0f8ff !important; /* Light blue background */
+}
+
+/* Style for dimension-specific result columns */
+.custom-dataframe thead th:nth-child(n+5),
+.custom-dataframe tbody td:nth-child(n+5) {
+    background-color: #f0fff0 !important; /* Light green background */
+}
+
+/* Alternating row colors for better readability */
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(-n+4) {
+    background-color: #e6f3ff !important; /* Slightly darker light blue */
+}
+
+.custom-dataframe tbody tr:nth-child(even) td:nth-child(n+5) {
+    background-color: #e6ffe6 !important; /* Slightly darker light green */
+}
static/eval_results/all_model_keywords_stats.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/all_summary.json ADDED
@@ -0,0 +1,418 @@
+{
+    "GPT_4o": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.5187898818829914,
+            "micro_mean_score": 0.5127977300993917
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.5251654337401854,
+            "micro_mean_score": 0.522332974147119
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2448,
+            "macro_mean_score": 0.6478225794744895,
+            "micro_mean_score": 0.665391229578676
+        },
+        "overall_score": 0.5409529871515315
+    },
+    "Gemini_1.5_pro_002": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.46887846869580546,
+            "micro_mean_score": 0.46403536258864253
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.481393687771543,
+            "micro_mean_score": 0.4756661334397647
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2448,
+            "macro_mean_score": 0.5858190649927173,
+            "micro_mean_score": 0.6104901117798793
+        },
+        "overall_score": 0.4948345779089219
+    },
+    "Gemini_1.5_flash_002": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.4183865592515826,
+            "micro_mean_score": 0.41216971462683855
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.4183865592515826,
+            "micro_mean_score": 0.41216971462683855
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2168,
+            "macro_mean_score": 0.5691365176285039,
+            "micro_mean_score": 0.5987532244196045
+        },
+        "overall_score": 0.4377900192406913
+    },
+    "Claude_3.5": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.4863241841253708,
+            "micro_mean_score": 0.4798092874490549
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.5023557473841108,
+            "micro_mean_score": 0.4985442599850241
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2288,
+            "macro_mean_score": 0.6373907158949892,
+            "micro_mean_score": 0.6569647463456579
+        },
+        "overall_score": 0.519736485905313
+    },
+    "GPT_4o_mini": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.3974259652331149,
+            "micro_mean_score": 0.392578163407945
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.4070959243997505,
+            "micro_mean_score": 0.40376078514357017
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.586537827213665,
+            "micro_mean_score": 0.6133276010318144
+        },
+        "overall_score": 0.43019240694015537
+    },
+    "Qwen2_VL_72B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.4623988230573754,
+            "micro_mean_score": 0.4568583770401895
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.45284699372478177,
+            "micro_mean_score": 0.4487693487093462
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2448,
+            "macro_mean_score": 0.5639771804231668,
+            "micro_mean_score": 0.5835339638865004
+        },
+        "overall_score": 0.4754732650945565
+    },
+    "Qwen2_VL_7B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.34725455697890745,
+            "micro_mean_score": 0.34344091516995323
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.3284357723853296,
+            "micro_mean_score": 0.32443422147119677
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1170,
+            "num_total_samples": 2452,
+            "macro_mean_score": 0.43955105763038577,
+            "micro_mean_score": 0.45508547008546996
+        },
+        "overall_score": 0.35913430458751355
+    },
+    "llava_onevision_72B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.31960132549012704,
+            "micro_mean_score": 0.3173848563095166
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.29725827011768174,
+            "micro_mean_score": 0.2954433666362564
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.4599484231632498,
+            "micro_mean_score": 0.4850386930352536
+        },
+        "overall_score": 0.33766580340844976
+    },
+    "llava_onevision_7B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.2239290419841492,
+            "micro_mean_score": 0.22222171180488767
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.21347545703998197,
+            "micro_mean_score": 0.210586172002703
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2448,
+            "macro_mean_score": 0.33979975321921935,
+            "micro_mean_score": 0.36474634565778147
+        },
+        "overall_score": 0.23884309392529685
+    },
+    "InternVL2_76B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.34977582844066846,
+            "micro_mean_score": 0.3452353155814884
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.35539585884136143,
+            "micro_mean_score": 0.35043335903915124
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.5192997443033639,
+            "micro_mean_score": 0.5421324161650903
+        },
+        "overall_score": 0.37649239855429245
+    },
+    "InternVL2_8B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.25920867490737526,
+            "micro_mean_score": 0.2543416126895087
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.24055897165959364,
+            "micro_mean_score": 0.23784634936127952
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1165,
+            "num_total_samples": 2452,
+            "macro_mean_score": 0.3978571701460552,
+            "micro_mean_score": 0.4108583690987125
+        },
+        "overall_score": 0.2770545208291856
+    },
+    "MiniCPM_v2.6": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.22838207666977445,
+            "micro_mean_score": 0.22452805919103805
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.22901463640480854,
+            "micro_mean_score": 0.2250606411323753
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2448,
+            "macro_mean_score": 0.41728623355613875,
+            "micro_mean_score": 0.43452278589853827
+        },
+        "overall_score": 0.25324761425596987
+    },
+    "Phi-3.5-vision": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.23240864879023493,
+            "micro_mean_score": 0.22932978620408923
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.2295097914016776,
+            "micro_mean_score": 0.2266573336398296
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2428,
+            "macro_mean_score": 0.3947914647737769,
+            "micro_mean_score": 0.42459157351676696
+        },
+        "overall_score": 0.2533094072831661
+    },
+    "Pixtral_12B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.3186510310643637,
+            "micro_mean_score": 0.3151734861550665
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.3132232487306254,
+            "micro_mean_score": 0.30971424472967524
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.4566234428542061,
+            "micro_mean_score": 0.4870593293207223
+        },
+        "overall_score": 0.3364098563442444
+    },
+    "Llama_3_2_11B": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.10044261716549671,
+            "micro_mean_score": 0.09980638766828835
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.15984490401619783,
+            "micro_mean_score": 0.15794038158731832
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 1224,
+            "macro_mean_score": 0.3173342406187366,
+            "micro_mean_score": 0.3487962166809973
+        },
+        "overall_score": 0.1801158087274157
+    },
+    "Idefics3": {
+        "core_noncot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.11118980301103833,
+            "micro_mean_score": 0.11201785633274061
+        },
+        "core_cot": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "num_not_eval_samples": 0,
+            "num_total_samples": 6961,
+            "macro_mean_score": 0.08956972487602757,
+            "micro_mean_score": 0.08982225274252693
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "num_total_samples": 2448,
+            "macro_mean_score": 0.3210866162255635,
+            "micro_mean_score": 0.35649183147033553
+        },
+        "overall_score": 0.138206224513898
+    }
+}
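
The summary file does not state how `overall_score` is derived; the figures above are, however, consistent with a task-count-weighted mean of the better core run (CoT vs. non-CoT) and the open-ended run (440 core + 65 open tasks). A minimal sketch of that check, assuming the file is read from the same path `utils.py` uses:

```python
import json

# Assumption: overall_score ≈ weighted mean of max(core_cot, core_noncot) and the
# open-ended macro score, weighted by num_eval_tasks; this matches the figures
# above but is inferred from the data, not documented in the repository.
with open("static/eval_results/all_summary.json") as f:
    summary = json.load(f)

entry = summary["GPT_4o"]
core = max(entry["core_cot"]["macro_mean_score"], entry["core_noncot"]["macro_mean_score"])
open_score = entry["open"]["macro_mean_score"]
n_core = entry["core_cot"]["num_eval_tasks"]   # 440 core tasks
n_open = entry["open"]["num_eval_tasks"]       # 65 open-ended tasks

recomputed = (n_core * core + n_open * open_score) / (n_core + n_open)
print(recomputed, entry["overall_score"])  # both ≈ 0.5410
```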
utils.py ADDED
@@ -0,0 +1,148 @@
+import pandas as pd
+import gradio as gr
+import csv
+import json
+import os
+import shutil
+from huggingface_hub import Repository
+import numpy as np
+
+# Load the JSON data
+with open("./static/eval_results/all_model_keywords_stats.json", "r") as f:
+    MODEL_DATA = json.load(f)
+
+with open("./static/eval_results/all_summary.json", "r") as f:
+    SUMMARY_DATA = json.load(f)
+
+
+# Define model name mapping
+MODEL_NAME_MAP = {
+    "GPT_4o": "GPT-4o (0513)",
+    "Claude_3.5": "Claude-3.5-Sonnet",
+    "Gemini_1.5_pro_002": "Gemini-1.5-Pro-002",
+    "InternVL2_76B": "InternVL2-Llama3-76B",
+    "Qwen2_VL_72B": "Qwen2-VL-72B",
+    "llava_onevision_72B": "Llava-OneVision-72B",
+    "GPT_4o_mini": "GPT-4o mini",
+    "Gemini_1.5_flash_002": "Gemini-1.5-Flash-002",
+    "Pixtral_12B": "Pixtral 12B",
+    "Qwen2_VL_7B": "Qwen2-VL-7B",
+    "InternVL2_8B": "InternVL2-8B",
+    "llava_onevision_7B": "Llava-OneVision-7B",
+    "Llama_3_2_11B": "Llama-3.2-11B",
+    "Phi-3.5-vision": "Phi-3.5-Vision",
+    "MiniCPM_v2.6": "MiniCPM-V2.6",
+    "Idefics3": "Idefics3-8B-Llama3",
+}
+
+# Custom name mapping for dimensions and keywords
+DIMENSION_NAME_MAP = {
+    "skills": "Skills",
+    "input_format": "Input Format",
+    "output_format": "Output Format",
+    "input_num": "Visual Input Number",
+    "app": "Application"
+}
+
+KEYWORD_NAME_MAP = {
+    # Skills
+    "Object Recognition and Classification": "Object Recognition",
+    "Text Recognition (OCR)": "OCR",
+    "Language Understanding and Generation": "Language",
+    "Scene and Event Understanding": "Scene/Event",
+    "Mathematical and Logical Reasoning": "Math/Logic",
+    "Commonsense and Social Reasoning": "Commonsense",
+    "Ethical and Safety Reasoning": "Ethics/Safety",
+    "Domain-Specific Knowledge and Skills": "Domain-Specific",
+    "Spatial and Temporal Reasoning": "Spatial/Temporal",
+    "Planning and Decision Making": "Planning/Decision",
+    # Input Format
+    'User Interface Screenshots': "UI related",
+    'Text-Based Images and Documents': "Documents",
+    'Diagrams and Data Visualizations': "Infographics",
+    'Videos': "Videos",
+    'Artistic and Creative Content': "Arts/Creative",
+    'Photographs': "Photographs",
+    '3D Models and Aerial Imagery': "3D related",
+    # Application
+    'Information_Extraction': "Info Extraction",
+    'Planning': "Planning",
+    'Coding': "Coding",
+    'Perception': "Perception",
+    'Metrics': "Metrics",
+    'Science': "Science",
+    'Knowledge': "Knowledge",
+    'Mathematics': "Math",
+    # Output format
+    'contextual_formatted_text': "Contextual",
+    'structured_output': "Structured",
+    'exact_text': "Exact",
+    'numerical_data': "Numerical",
+    'open_ended_output': "Open-ended",
+    'multiple_choice': "MC",
+    # Visual input number
+    "6-8 images": "6-8 imgs",
+    "1-image": "1 img",
+    "2-3 images": "2-3 imgs",
+    "4-5 images": "4-5 imgs",
+    "9-image or more": "9+ imgs",
+    "video": "Video",
+}
+
+# Extract super groups (dimensions) and their keywords
+SUPER_GROUPS = {DIMENSION_NAME_MAP[dim]: [KEYWORD_NAME_MAP.get(k, k) for k in MODEL_DATA[next(iter(MODEL_DATA))][dim].keys()]
+                for dim in MODEL_DATA[next(iter(MODEL_DATA))]}
+
+SUBMISSION_NAME = "test_leaderboard_submission"
+SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/cccjc/", SUBMISSION_NAME)
+CSV_DIR = "./test_leaderboard_submission/results.csv"
+
+def get_original_dimension(mapped_dimension):
+    return next(k for k, v in DIMENSION_NAME_MAP.items() if v == mapped_dimension)
+
+def get_original_keyword(mapped_keyword):
+    return next((k for k, v in KEYWORD_NAME_MAP.items() if v == mapped_keyword), mapped_keyword)
+
+# Define model groups
+MODEL_GROUPS = {
+    "All": list(MODEL_DATA.keys()),
+    "Flagship Models": ['GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B'],
+    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
+    "Proprietary Flagship models": ['GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
+    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
+    "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B'],
+    "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3'],
+}
+
+def get_display_model_name(model_name):
+    return MODEL_NAME_MAP.get(model_name, model_name)
+
+def get_df(selected_super_group, selected_model_group):
+    original_dimension = get_original_dimension(selected_super_group)
+    data = []
+    for model in MODEL_GROUPS[selected_model_group]:
+        model_data = MODEL_DATA[model]
+        summary = SUMMARY_DATA[model]
+        core_score = max(summary["core_noncot"]["macro_mean_score"], summary["core_cot"]["macro_mean_score"])
+        row = {
+            "Models": get_display_model_name(model),  # Use the mapped name
+            "Overall": round(summary["overall_score"] * 100, 2),
+            "Core": round(core_score * 100, 2),
+            "Open-ended": round(summary["open"]["macro_mean_score"] * 100, 2)
+        }
+        for keyword in SUPER_GROUPS[selected_super_group]:
+            original_keyword = get_original_keyword(keyword)
+            if original_dimension in model_data and original_keyword in model_data[original_dimension]:
+                row[keyword] = round(model_data[original_dimension][original_keyword]["average_score"] * 100, 2)
+            else:
+                row[keyword] = None
+        data.append(row)
+
+    df = pd.DataFrame(data)
+    df = df.sort_values(by="Overall", ascending=False)
+    return df
+
+def get_leaderboard_data(selected_super_group, selected_model_group):
+    df = get_df(selected_super_group, selected_model_group)
+    headers = ["Models", "Overall", "Core", "Open-ended"] + SUPER_GROUPS[selected_super_group]
+    data = df[headers].values.tolist()
+    return headers, data
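
A minimal usage sketch of these helpers, mirroring the calls `app.py` makes and assuming the two JSON files under `static/eval_results/` are present in the working directory:

```python
from utils import get_leaderboard_data, SUPER_GROUPS

# Pick the first breakdown dimension (e.g. "Skills") and the "All" model group,
# exactly as app.py does when building the initial table.
dimension = list(SUPER_GROUPS.keys())[0]
headers, rows = get_leaderboard_data(dimension, "All")

print(headers[:4])  # ['Models', 'Overall', 'Core', 'Open-ended']
print(rows[0])      # top-ranked model and its scores, sorted by "Overall"
```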