Clémentine committed on
Commit
2a5f9fb
1 Parent(s): e3aaf53

refacto part 1

Browse files
app.py CHANGED
@@ -1,14 +1,24 @@
1
  import json
2
  import os
3
- from datetime import datetime, timezone
4
 
5
  import gradio as gr
6
  import pandas as pd
7
  from apscheduler.schedulers.background import BackgroundScheduler
8
- from huggingface_hub import HfApi, snapshot_download
9
-
10
- from src.assets.css_html_js import custom_css, get_window_url_params
11
- from src.assets.text_content import (
 
 
 
 
 
 
 
 
 
 
 
12
  CITATION_BUTTON_LABEL,
13
  CITATION_BUTTON_TEXT,
14
  EVALUATION_QUEUE_TEXT,
@@ -16,102 +26,44 @@ from src.assets.text_content import (
16
  LLM_BENCHMARKS_TEXT,
17
  TITLE,
18
  )
19
- from src.plots.plot_results import (
20
  create_metric_plot_obj,
21
  create_scores_df,
22
  create_plot_df,
23
  join_model_info_with_results,
24
  HUMAN_BASELINES,
25
  )
26
- from src.get_model_info.apply_metadata_to_df import DO_NOT_SUBMIT_MODELS, ModelType
27
- from src.get_model_info.get_metadata_from_hub import get_model_size
28
- from src.filters import check_model_card
29
- from src.get_model_info.utils import (
30
- AutoEvalColumn,
31
- EvalQueueColumn,
32
- fields,
33
- styled_error,
34
- styled_message,
35
- styled_warning,
36
- )
37
- from src.manage_collections import update_collections
38
- from src.load_from_hub import get_all_requested_models, get_evaluation_queue_df, get_leaderboard_df
39
- from src.filters import is_model_on_hub, user_submission_permission
40
-
41
- pd.set_option("display.precision", 1)
42
-
43
- # clone / pull the lmeh eval data
44
- H4_TOKEN = os.environ.get("H4_TOKEN", None)
45
-
46
- QUEUE_REPO = "open-llm-leaderboard/requests"
47
- RESULTS_REPO = "open-llm-leaderboard/results"
48
-
49
- PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
50
- PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
51
-
52
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
53
-
54
- EVAL_REQUESTS_PATH = "eval-queue"
55
- EVAL_RESULTS_PATH = "eval-results"
56
-
57
- EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
58
- EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
59
-
60
- api = HfApi(token=H4_TOKEN)
61
 
62
 
63
  def restart_space():
64
- api.restart_space(repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN)
65
-
66
-
67
- # Rate limit variables
68
- RATE_LIMIT_PERIOD = 7
69
- RATE_LIMIT_QUOTA = 5
70
-
71
- # Column selection
72
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
73
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
74
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
75
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
76
-
77
- if not IS_PUBLIC:
78
- COLS.insert(2, AutoEvalColumn.precision.name)
79
- TYPES.insert(2, AutoEvalColumn.precision.type)
80
-
81
- EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
82
- EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
83
-
84
- BENCHMARK_COLS = [
85
- c.name
86
- for c in [
87
- AutoEvalColumn.arc,
88
- AutoEvalColumn.hellaswag,
89
- AutoEvalColumn.mmlu,
90
- AutoEvalColumn.truthfulqa,
91
- AutoEvalColumn.winogrande,
92
- AutoEvalColumn.gsm8k,
93
- AutoEvalColumn.drop
94
- ]
95
- ]
96
 
97
  try:
98
- snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 
 
99
  except Exception:
100
  restart_space()
101
  try:
102
- snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 
 
103
  except Exception:
104
  restart_space()
105
 
106
- requested_models, users_to_submission_dates = get_all_requested_models(EVAL_REQUESTS_PATH)
107
 
108
  original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
109
- update_collections(original_df.copy())
110
  leaderboard_df = original_df.copy()
111
 
112
- models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
113
- #plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
114
- to_be_dumped = f"models = {repr(models)}\n"
115
 
116
  (
117
  finished_eval_queue_df,
@@ -120,115 +72,6 @@ to_be_dumped = f"models = {repr(models)}\n"
120
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
121
 
122
 
123
- ## INTERACTION FUNCTIONS
124
- def add_new_eval(
125
- model: str,
126
- base_model: str,
127
- revision: str,
128
- precision: str,
129
- private: bool,
130
- weight_type: str,
131
- model_type: str,
132
- ):
133
- precision = precision.split(" ")[0]
134
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
135
-
136
- if model_type is None or model_type == "":
137
- return styled_error("Please select a model type.")
138
-
139
- # Is the user rate limited?
140
- user_can_submit, error_msg = user_submission_permission(model, users_to_submission_dates, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA)
141
- if not user_can_submit:
142
- return styled_error(error_msg)
143
-
144
- # Did the model authors forbid its submission to the leaderboard?
145
- if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
146
- return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
147
-
148
- # Does the model actually exist?
149
- if revision == "":
150
- revision = "main"
151
-
152
- if weight_type in ["Delta", "Adapter"]:
153
- base_model_on_hub, error = is_model_on_hub(base_model, revision, H4_TOKEN)
154
- if not base_model_on_hub:
155
- return styled_error(f'Base model "{base_model}" {error}')
156
-
157
- if not weight_type == "Adapter":
158
- model_on_hub, error = is_model_on_hub(model, revision)
159
- if not model_on_hub:
160
- return styled_error(f'Model "{model}" {error}')
161
-
162
- try:
163
- model_info = api.model_info(repo_id=model, revision=revision)
164
- except Exception:
165
- return styled_error("Could not get your model information. Please fill it up properly.")
166
-
167
- model_size = get_model_size(model_info=model_info , precision= precision)
168
-
169
- # Were the model card and license filled?
170
- try:
171
- license = model_info.cardData["license"]
172
- except Exception:
173
- return styled_error("Please select a license for your model")
174
-
175
- modelcard_OK, error_msg = check_model_card(model)
176
- if not modelcard_OK:
177
- return styled_error(error_msg)
178
-
179
- # Seems good, creating the eval
180
- print("Adding new eval")
181
-
182
- eval_entry = {
183
- "model": model,
184
- "base_model": base_model,
185
- "revision": revision,
186
- "private": private,
187
- "precision": precision,
188
- "weight_type": weight_type,
189
- "status": "PENDING",
190
- "submitted_time": current_time,
191
- "model_type": model_type,
192
- "likes": model_info.likes,
193
- "params": model_size,
194
- "license": license,
195
- }
196
-
197
- user_name = ""
198
- model_path = model
199
- if "/" in model:
200
- user_name = model.split("/")[0]
201
- model_path = model.split("/")[1]
202
-
203
- print("Creating eval file")
204
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
205
- os.makedirs(OUT_DIR, exist_ok=True)
206
- out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
207
-
208
- # Check for duplicate submission
209
- if f"{model}_{revision}_{precision}" in requested_models:
210
- return styled_warning("This model has been already submitted.")
211
-
212
- with open(out_path, "w") as f:
213
- f.write(json.dumps(eval_entry))
214
-
215
- print("Uploading eval file")
216
- api.upload_file(
217
- path_or_fileobj=out_path,
218
- path_in_repo=out_path.split("eval-queue/")[1],
219
- repo_id=QUEUE_REPO,
220
- repo_type="dataset",
221
- commit_message=f"Add {model} to eval queue",
222
- )
223
-
224
- # Remove the local file
225
- os.remove(out_path)
226
-
227
- return styled_message(
228
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
229
- )
230
-
231
-
232
  # Basics
233
  def change_tab(query_param: str):
234
  query_param = query_param.replace("'", '"')
@@ -272,18 +115,6 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
272
  return filtered_df
273
 
274
 
275
- NUMERIC_INTERVALS = {
276
- "?": pd.Interval(-1, 0, closed="right"),
277
- "~1.5": pd.Interval(0, 2, closed="right"),
278
- "~3": pd.Interval(2, 4, closed="right"),
279
- "~7": pd.Interval(4, 9, closed="right"),
280
- "~13": pd.Interval(9, 20, closed="right"),
281
- "~35": pd.Interval(20, 45, closed="right"),
282
- "~60": pd.Interval(45, 70, closed="right"),
283
- "70+": pd.Interval(70, 10000, closed="right"),
284
- }
285
-
286
-
287
  def filter_queries(query: str, filtered_df: pd.DataFrame):
288
  """Added by Abishek"""
289
  final_df = []
@@ -311,7 +142,7 @@ def filter_models(
311
  if show_deleted:
312
  filtered_df = df
313
  else: # Show only still on the hub models
314
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True]
315
 
316
  type_emoji = [t[0] for t in type_query]
317
  filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
@@ -342,54 +173,22 @@ with demo:
342
  )
343
  with gr.Row():
344
  shown_columns = gr.CheckboxGroup(
345
- choices=[
346
- c
347
- for c in COLS
348
- if c
349
- not in [
350
- AutoEvalColumn.dummy.name,
351
- AutoEvalColumn.model.name,
352
- AutoEvalColumn.model_type_symbol.name,
353
- AutoEvalColumn.still_on_hub.name,
354
- ]
355
- ],
356
- value=[
357
- c
358
- for c in COLS_LITE
359
- if c
360
- not in [
361
- AutoEvalColumn.dummy.name,
362
- AutoEvalColumn.model.name,
363
- AutoEvalColumn.model_type_symbol.name,
364
- AutoEvalColumn.still_on_hub.name,
365
- ]
366
- ],
367
  label="Select columns to show",
368
  elem_id="column-select",
369
  interactive=True,
370
  )
371
  with gr.Row():
372
  deleted_models_visibility = gr.Checkbox(
373
- value=True, label="Show gated/private/deleted models", interactive=True
374
  )
375
  with gr.Column(min_width=320):
376
  with gr.Box(elem_id="box-filter"):
377
  filter_columns_type = gr.CheckboxGroup(
378
  label="Model types",
379
- choices=[
380
- ModelType.PT.to_str(),
381
- ModelType.FT.to_str(),
382
- ModelType.IFT.to_str(),
383
- ModelType.RL.to_str(),
384
- ModelType.Unknown.to_str(),
385
- ],
386
- value=[
387
- ModelType.PT.to_str(),
388
- ModelType.FT.to_str(),
389
- ModelType.IFT.to_str(),
390
- ModelType.RL.to_str(),
391
- ModelType.Unknown.to_str(),
392
- ],
393
  interactive=True,
394
  elem_id="filter-columns-type",
395
  )
@@ -410,16 +209,11 @@ with demo:
410
 
411
  leaderboard_table = gr.components.Dataframe(
412
  value=leaderboard_df[
413
- [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
414
  + shown_columns.value
415
  + [AutoEvalColumn.dummy.name]
416
  ],
417
- headers=[
418
- AutoEvalColumn.model_type_symbol.name,
419
- AutoEvalColumn.model.name,
420
- ]
421
- + shown_columns.value
422
- + [AutoEvalColumn.dummy.name],
423
  datatype=TYPES,
424
  max_rows=None,
425
  elem_id="leaderboard-table",
@@ -429,7 +223,7 @@ with demo:
429
 
430
  # Dummy leaderboard for handling the case when the user uses backspace key
431
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
432
- value=original_df,
433
  headers=COLS,
434
  datatype=TYPES,
435
  max_rows=None,
@@ -519,7 +313,8 @@ with demo:
519
  queue=True,
520
  )
521
 
522
- # with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
 
523
  # with gr.Row():
524
  # with gr.Column():
525
  # chart = create_metric_plot_obj(
@@ -589,12 +384,7 @@ with demo:
589
  revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
590
  private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
591
  model_type = gr.Dropdown(
592
- choices=[
593
- ModelType.PT.to_str(" : "),
594
- ModelType.FT.to_str(" : "),
595
- ModelType.IFT.to_str(" : "),
596
- ModelType.RL.to_str(" : "),
597
- ],
598
  label="Model type",
599
  multiselect=False,
600
  value=None,
 
1
  import json
2
  import os
 
3
 
4
  import gradio as gr
5
  import pandas as pd
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
+ from huggingface_hub import snapshot_download
8
+
9
+ from src.display.utils import (
10
+ COLS,
11
+ TYPES,
12
+ BENCHMARK_COLS,
13
+ EVAL_COLS,
14
+ EVAL_TYPES,
15
+ AutoEvalColumn,
16
+ ModelType,
17
+ NUMERIC_INTERVALS,
18
+ fields,
19
+ )
20
+ from src.display.css_html_js import custom_css, get_window_url_params
21
+ from src.display.about import (
22
  CITATION_BUTTON_LABEL,
23
  CITATION_BUTTON_TEXT,
24
  EVALUATION_QUEUE_TEXT,
 
26
  LLM_BENCHMARKS_TEXT,
27
  TITLE,
28
  )
29
+ from src.tools.plots import (
30
  create_metric_plot_obj,
31
  create_scores_df,
32
  create_plot_df,
33
  join_model_info_with_results,
34
  HUMAN_BASELINES,
35
  )
36
+ from src.tools.collections import update_collections
37
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
38
+ from src.envs import H4_TOKEN, QUEUE_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO, API, REPO_ID, IS_PUBLIC
39
+ from src.submission.submit import add_new_eval
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def restart_space():
43
+ API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
44
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  try:
47
+ snapshot_download(
48
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
49
+ )
50
  except Exception:
51
  restart_space()
52
  try:
53
+ snapshot_download(
54
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
55
+ )
56
  except Exception:
57
  restart_space()
58
 
 
59
 
60
  original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
61
+ #update_collections(original_df.copy())
62
  leaderboard_df = original_df.copy()
63
 
64
+ #models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
65
+ # plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
66
+ #to_be_dumped = f"models = {repr(models)}\n"
67
 
68
  (
69
  finished_eval_queue_df,
 
72
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  # Basics
76
  def change_tab(query_param: str):
77
  query_param = query_param.replace("'", '"')
 
115
  return filtered_df
116
 
117
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def filter_queries(query: str, filtered_df: pd.DataFrame):
119
  """Added by Abishek"""
120
  final_df = []
 
142
  if show_deleted:
143
  filtered_df = df
144
  else: # Show only still on the hub models
145
+ filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
146
 
147
  type_emoji = [t[0] for t in type_query]
148
  filtered_df = filtered_df[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
 
173
  )
174
  with gr.Row():
175
  shown_columns = gr.CheckboxGroup(
176
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.dummy],
177
+ value=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  label="Select columns to show",
179
  elem_id="column-select",
180
  interactive=True,
181
  )
182
  with gr.Row():
183
  deleted_models_visibility = gr.Checkbox(
184
+ value=False, label="Show gated/private/deleted models", interactive=True
185
  )
186
  with gr.Column(min_width=320):
187
  with gr.Box(elem_id="box-filter"):
188
  filter_columns_type = gr.CheckboxGroup(
189
  label="Model types",
190
+ choices=[t.to_str() for t in ModelType],
191
+ value=[t.to_str() for t in ModelType],
 
 
 
 
 
 
 
 
 
 
 
 
192
  interactive=True,
193
  elem_id="filter-columns-type",
194
  )
 
209
 
210
  leaderboard_table = gr.components.Dataframe(
211
  value=leaderboard_df[
212
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
213
  + shown_columns.value
214
  + [AutoEvalColumn.dummy.name]
215
  ],
216
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
 
 
 
 
 
217
  datatype=TYPES,
218
  max_rows=None,
219
  elem_id="leaderboard-table",
 
223
 
224
  # Dummy leaderboard for handling the case when the user uses backspace key
225
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
226
+ value=original_df[COLS],
227
  headers=COLS,
228
  datatype=TYPES,
229
  max_rows=None,
 
313
  queue=True,
314
  )
315
 
316
+ # with gr.TabItem("📈
317
+ # evolution through time", elem_id="llm-benchmark-tab-table", id=4):
318
  # with gr.Row():
319
  # with gr.Column():
320
  # chart = create_metric_plot_obj(
 
384
  revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
385
  private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
386
  model_type = gr.Dropdown(
387
+ choices=[t.to_str(" : ") for t in ModelType],
 
 
 
 
 
388
  label="Model type",
389
  multiselect=False,
390
  value=None,
model_info_cache.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:15ee9a3cdd3ffdfa4d46497b829fbb43ea5a66222a17d34dfef5ad1111a8eb18
3
- size 3789941
 
 
 
 
model_size_cache.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ace7167a258f711fa7ffeaadddc6ebef8ccb92651dce8b805228c2f18c988958
3
- size 75324
 
 
 
 
scripts/create_request_file.py CHANGED
@@ -10,10 +10,11 @@ import pprint
10
  EVAL_REQUESTS_PATH = "eval-queue"
11
  QUEUE_REPO = "open-llm-leaderboard/requests"
12
 
13
- precisions =("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
14
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
15
  weight_types = ("Original", "Delta", "Adapter")
16
 
 
17
  def get_model_size(model_info, precision: str):
18
  size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
19
  try:
@@ -24,12 +25,13 @@ def get_model_size(model_info, precision: str):
24
  model_size = size_match.group(0)
25
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
26
  except AttributeError:
27
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
28
 
29
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
30
  model_size = size_factor * model_size
31
  return model_size
32
 
 
33
  def main():
34
  api = HfApi()
35
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -49,7 +51,7 @@ def main():
49
  print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
50
  return 1
51
 
52
- model_size = get_model_size(model_info=model_info , precision=precision)
53
 
54
  try:
55
  license = model_info.cardData["license"]
@@ -98,7 +100,7 @@ def main():
98
  )
99
  else:
100
  click.echo("aborting...")
101
-
102
 
103
- if __name__ == '__main__':
104
- main()
 
 
10
  EVAL_REQUESTS_PATH = "eval-queue"
11
  QUEUE_REPO = "open-llm-leaderboard/requests"
12
 
13
+ precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
14
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
15
  weight_types = ("Original", "Delta", "Adapter")
16
 
17
+
18
  def get_model_size(model_info, precision: str):
19
  size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
20
  try:
 
25
  model_size = size_match.group(0)
26
  model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
27
  except AttributeError:
28
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
29
 
30
  size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
31
  model_size = size_factor * model_size
32
  return model_size
33
 
34
+
35
  def main():
36
  api = HfApi()
37
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
51
  print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
52
  return 1
53
 
54
+ model_size = get_model_size(model_info=model_info, precision=precision)
55
 
56
  try:
57
  license = model_info.cardData["license"]
 
100
  )
101
  else:
102
  click.echo("aborting...")
 
103
 
104
+
105
+ if __name__ == "__main__":
106
+ main()
src/assets/hardcoded_evals.py DELETED
@@ -1,43 +0,0 @@
1
- from src.get_model_info.utils import AutoEvalColumn, model_hyperlink
2
-
3
- gpt4_values = {
4
- AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
5
- AutoEvalColumn.revision.name: "tech report",
6
- AutoEvalColumn.precision.name: None,
7
- AutoEvalColumn.average.name: 84.3,
8
- AutoEvalColumn.arc.name: 96.3,
9
- AutoEvalColumn.hellaswag.name: 95.3,
10
- AutoEvalColumn.mmlu.name: 86.4,
11
- AutoEvalColumn.truthfulqa.name: 59.0,
12
- AutoEvalColumn.dummy.name: "GPT-4",
13
- AutoEvalColumn.model_type.name: "",
14
- }
15
-
16
- gpt35_values = {
17
- AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
18
- AutoEvalColumn.revision.name: "tech report",
19
- AutoEvalColumn.precision.name: None,
20
- AutoEvalColumn.average.name: 71.9,
21
- AutoEvalColumn.arc.name: 85.2,
22
- AutoEvalColumn.hellaswag.name: 85.5,
23
- AutoEvalColumn.mmlu.name: 70.0,
24
- AutoEvalColumn.truthfulqa.name: 47.0,
25
- AutoEvalColumn.dummy.name: "GPT-3.5",
26
- AutoEvalColumn.model_type.name: "",
27
- }
28
-
29
- baseline = {
30
- AutoEvalColumn.model.name: "<p>Baseline</p>",
31
- AutoEvalColumn.revision.name: "N/A",
32
- AutoEvalColumn.precision.name: None,
33
- AutoEvalColumn.average.name: 25.0,
34
- AutoEvalColumn.arc.name: 25.0,
35
- AutoEvalColumn.hellaswag.name: 25.0,
36
- AutoEvalColumn.mmlu.name: 25.0,
37
- AutoEvalColumn.truthfulqa.name: 25.0,
38
- AutoEvalColumn.winogrande.name: 50.0,
39
- AutoEvalColumn.gsm8k.name: 0.21,
40
- AutoEvalColumn.drop.name: 0.47,
41
- AutoEvalColumn.dummy.name: "baseline",
42
- AutoEvalColumn.model_type.name: "",
43
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/assets/scale-hf-logo.png DELETED

Git LFS Details

  • SHA256: 11a263a1abe4c7c9cf022cbe052dc567dcea164bdfbc111299aae3270e992934
  • Pointer size: 132 Bytes
  • Size of remote file: 1.88 MB
src/{assets/text_content.py → display/about.py} RENAMED
@@ -1,4 +1,4 @@
1
- from src.get_model_info.hardocded_metadata.types import ModelType
2
 
3
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
@@ -42,7 +42,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
42
  ## Details and logs
43
  You can find:
44
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
45
- - details on the input/outputs for the models in the `details` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/details
46
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
47
 
48
  ## Reproducibility
 
1
+ from src.display.utils import ModelType
2
 
3
  TITLE = """<h1 align="center" id="space-title">🤗 Open LLM Leaderboard</h1>"""
4
 
 
42
  ## Details and logs
43
  You can find:
44
  - detailed numerical results in the `results` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/results
45
+ - details on the input/outputs for the models in the `details` of each model, that you can access by clicking the 📄 emoji after the model name
46
  - community queries and running status in the `requests` Hugging Face dataset: https://huggingface.co/datasets/open-llm-leaderboard/requests
47
 
48
  ## Reproducibility
src/{assets → display}/css_html_js.py RENAMED
File without changes
src/{get_model_info/utils.py → display/formatting.py} RENAMED
@@ -1,68 +1,8 @@
1
  import os
2
- from dataclasses import dataclass
3
-
4
  from huggingface_hub import HfApi
5
 
6
  API = HfApi()
7
 
8
-
9
- # These classes are for user facing column names, to avoid having to change them
10
- # all around the code when a modif is needed
11
- @dataclass
12
- class ColumnContent:
13
- name: str
14
- type: str
15
- displayed_by_default: bool
16
- hidden: bool = False
17
-
18
-
19
- def fields(raw_class):
20
- return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
21
-
22
-
23
- @dataclass(frozen=True)
24
- class AutoEvalColumn: # Auto evals column
25
- model_type_symbol = ColumnContent("T", "str", True)
26
- model = ColumnContent("Model", "markdown", True)
27
- average = ColumnContent("Average ⬆️", "number", True)
28
- arc = ColumnContent("ARC", "number", True)
29
- hellaswag = ColumnContent("HellaSwag", "number", True)
30
- mmlu = ColumnContent("MMLU", "number", True)
31
- truthfulqa = ColumnContent("TruthfulQA", "number", True)
32
- winogrande = ColumnContent("Winogrande", "number", True)
33
- gsm8k = ColumnContent("GSM8K", "number", True)
34
- drop = ColumnContent("DROP", "number", True)
35
- model_type = ColumnContent("Type", "str", False)
36
- precision = ColumnContent("Precision", "str", False) # , True)
37
- license = ColumnContent("Hub License", "str", False)
38
- params = ColumnContent("#Params (B)", "number", False)
39
- likes = ColumnContent("Hub ❤️", "number", False)
40
- still_on_hub = ColumnContent("Available on the hub", "bool", False)
41
- revision = ColumnContent("Model sha", "str", False, False)
42
- dummy = ColumnContent(
43
- "model_name_for_query", "str", True
44
- ) # dummy col to implement search bar (hidden by custom CSS)
45
-
46
-
47
- @dataclass(frozen=True)
48
- class EloEvalColumn: # Elo evals column
49
- model = ColumnContent("Model", "markdown", True)
50
- gpt4 = ColumnContent("GPT-4 (all)", "number", True)
51
- human_all = ColumnContent("Human (all)", "number", True)
52
- human_instruct = ColumnContent("Human (instruct)", "number", True)
53
- human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
54
-
55
-
56
- @dataclass(frozen=True)
57
- class EvalQueueColumn: # Queue column
58
- model = ColumnContent("model", "markdown", True)
59
- revision = ColumnContent("revision", "str", True)
60
- private = ColumnContent("private", "bool", True)
61
- precision = ColumnContent("precision", "str", True)
62
- weight_type = ColumnContent("weight_type", "str", "Original")
63
- status = ColumnContent("status", "str", True)
64
-
65
-
66
  LLAMAS = [
67
  "huggingface/llama-7b",
68
  "huggingface/llama-13b",
@@ -70,7 +10,6 @@ LLAMAS = [
70
  "huggingface/llama-65b",
71
  ]
72
 
73
-
74
  KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
75
  VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
76
  OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
 
1
  import os
 
 
2
  from huggingface_hub import HfApi
3
 
4
  API = HfApi()
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  LLAMAS = [
7
  "huggingface/llama-7b",
8
  "huggingface/llama-13b",
 
10
  "huggingface/llama-65b",
11
  ]
12
 
 
13
  KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
14
  VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
15
  OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
src/display/utils.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import pandas as pd
3
+ from enum import Enum
4
+
5
+
6
+ # These classes are for user facing column names,
7
+ # to avoid having to change them all around the code
8
+ # when a modif is needed
9
+ @dataclass
10
+ class ColumnContent:
11
+ name: str
12
+ type: str
13
+ displayed_by_default: bool
14
+ hidden: bool = False
15
+ never_hidden: bool = False
16
+ dummy: bool = False
17
+
18
+
19
+ def fields(raw_class):
20
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
21
+
22
+
23
+ @dataclass(frozen=True)
24
+ class AutoEvalColumn: # Auto evals column
25
+ model_type_symbol = ColumnContent("T", "str", True, never_hidden=True)
26
+ model = ColumnContent("Model", "markdown", True, never_hidden=True)
27
+ average = ColumnContent("Average ⬆️", "number", True)
28
+ arc = ColumnContent("ARC", "number", True)
29
+ hellaswag = ColumnContent("HellaSwag", "number", True)
30
+ mmlu = ColumnContent("MMLU", "number", True)
31
+ truthfulqa = ColumnContent("TruthfulQA", "number", True)
32
+ winogrande = ColumnContent("Winogrande", "number", True)
33
+ gsm8k = ColumnContent("GSM8K", "number", True)
34
+ drop = ColumnContent("DROP", "number", True)
35
+ model_type = ColumnContent("Type", "str", False)
36
+ weight_type = ColumnContent("Weight type", "str", False, True)
37
+ precision = ColumnContent("Precision", "str", False) # , True)
38
+ license = ColumnContent("Hub License", "str", False)
39
+ params = ColumnContent("#Params (B)", "number", False)
40
+ likes = ColumnContent("Hub ❤️", "number", False)
41
+ still_on_hub = ColumnContent("Available on the hub", "bool", False)
42
+ revision = ColumnContent("Model sha", "str", False, False)
43
+ dummy = ColumnContent(
44
+ "model_name_for_query", "str", False, dummy=True
45
+ ) # dummy col to implement search bar (hidden by custom CSS)
46
+
47
+
48
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """User-facing columns of the evaluation-queue tables.

    Each attribute is a `ColumnContent(name, type, displayed_by_default)`
    describing one column; `fields(EvalQueueColumn)` collects them to build
    `EVAL_COLS` / `EVAL_TYPES`.
    """

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # The third positional argument is `displayed_by_default: bool`; the string
    # "Original" used to be passed here, which only behaved as "shown" because a
    # non-empty string is truthy.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
56
+
57
+
58
+ baseline_row = {
59
+ AutoEvalColumn.model.name: "<p>Baseline</p>",
60
+ AutoEvalColumn.revision.name: "N/A",
61
+ AutoEvalColumn.precision.name: None,
62
+ AutoEvalColumn.average.name: 25.0,
63
+ AutoEvalColumn.arc.name: 25.0,
64
+ AutoEvalColumn.hellaswag.name: 25.0,
65
+ AutoEvalColumn.mmlu.name: 25.0,
66
+ AutoEvalColumn.truthfulqa.name: 25.0,
67
+ AutoEvalColumn.winogrande.name: 50.0,
68
+ AutoEvalColumn.gsm8k.name: 0.21,
69
+ AutoEvalColumn.drop.name: 0.47,
70
+ AutoEvalColumn.dummy.name: "baseline",
71
+ AutoEvalColumn.model_type.name: "",
72
+ }
73
+
74
+
75
@dataclass
class ModelInfo:
    """Display metadata carried by each `ModelType` member."""

    name: str  # human-readable label, e.g. "pretrained"
    symbol: str  # emoji


class ModelType(Enum):
    """Closed set of model categories, each with a label and an emoji symbol."""

    PT = ModelInfo(name="pretrained", symbol="🟢")
    FT = ModelInfo(name="fine-tuned", symbol="🔶")
    IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
    RL = ModelInfo(name="RL-tuned", symbol="🟦")
    Unknown = ModelInfo(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return ``"<symbol><separator><name>"`` for display in the UI."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        """Map a free-form label (name and/or emoji) back to a `ModelType`.

        Matching is by substring, so both ``"pretrained"`` and a full display
        string such as ``"🟢 : pretrained"`` resolve to the same member.
        Anything unrecognised, including ``None`` or other non-string input,
        resolves to `ModelType.Unknown` instead of raising.
        """
        # A missing or null value cannot be searched with `in`; treat it as unknown.
        if not isinstance(type, str):
            return ModelType.Unknown
        if "fine-tuned" in type or "🔶" in type:
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "RL-tuned" in type or "🟦" in type:
            return ModelType.RL
        if "instruction-tuned" in type or "⭕" in type:
            return ModelType.IFT
        return ModelType.Unknown
102
+
103
+
104
+ @dataclass
105
+ class Task:
106
+ benchmark: str
107
+ metric: str
108
+ col_name: str
109
+
110
+
111
+ class Tasks(Enum):
112
+ arc = Task("arc:challenge", "acc_norm", AutoEvalColumn.arc.name)
113
+ hellaswag = Task("hellaswag", "acc_norm", AutoEvalColumn.hellaswag.name)
114
+ mmlu = Task("hendrycksTest", "acc", AutoEvalColumn.mmlu.name)
115
+ truthfulqa = Task("truthfulqa:mc", "mc2", AutoEvalColumn.truthfulqa.name)
116
+ winogrande = Task("winogrande", "acc", AutoEvalColumn.winogrande.name)
117
+ gsm8k = Task("gsm8k", "acc", AutoEvalColumn.gsm8k.name)
118
+ drop = Task("drop", "f1", AutoEvalColumn.drop.name)
119
+
120
+
121
+ # Column selection
122
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
123
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
124
+ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
125
+ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
126
+
127
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
128
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
129
+
130
# `fields(AutoEvalColumn)` yields ColumnContent objects, not strings, so the
# membership test has to run against the column *names*. Comparing a str with
# those objects never matches and would leave this list empty.
_auto_eval_col_names = {c.name for c in fields(AutoEvalColumn)}
BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.col_name in _auto_eval_col_names]
131
+
132
+ NUMERIC_INTERVALS = {
133
+ "?": pd.Interval(-1, 0, closed="right"),
134
+ "~1.5": pd.Interval(0, 2, closed="right"),
135
+ "~3": pd.Interval(2, 4, closed="right"),
136
+ "~7": pd.Interval(4, 9, closed="right"),
137
+ "~13": pd.Interval(9, 20, closed="right"),
138
+ "~35": pd.Interval(20, 45, closed="right"),
139
+ "~60": pd.Interval(45, 70, closed="right"),
140
+ "70+": pd.Interval(70, 10000, closed="right"),
141
+ }
src/envs.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
+ # clone / pull the lmeh eval data
5
+ H4_TOKEN = os.environ.get("H4_TOKEN", None)
6
+
7
+ REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
8
+ QUEUE_REPO = "open-llm-leaderboard/requests"
9
+ RESULTS_REPO = "open-llm-leaderboard/results"
10
+
11
+ PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
12
+ PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
13
+
14
+ IS_PUBLIC = os.environ.get("IS_PUBLIC", "true").strip().lower() not in {"", "0", "false", "no"}  # parse the flag explicitly: bool() of any non-empty string is True, so IS_PUBLIC="False" or "0" used to enable public mode; unset still defaults to True
15
+
16
+ EVAL_REQUESTS_PATH = "eval-queue"
17
+ EVAL_RESULTS_PATH = "eval-results"
18
+
19
+ EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
20
+ EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
21
+
22
+ PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
23
+
24
+ # Rate limit variables
25
+ RATE_LIMIT_PERIOD = 7
26
+ RATE_LIMIT_QUOTA = 5
27
+
28
+ API = HfApi(token=H4_TOKEN)
src/get_model_info/apply_metadata_to_df.py DELETED
@@ -1,95 +0,0 @@
1
- import glob
2
- import json
3
- import os
4
- from typing import List
5
-
6
- from huggingface_hub import HfApi
7
- from tqdm import tqdm
8
-
9
- from src.get_model_info.hardocded_metadata.flags import DO_NOT_SUBMIT_MODELS, FLAGGED_MODELS
10
- from src.get_model_info.hardocded_metadata.types import MODEL_TYPE_METADATA, ModelType, model_type_from_str
11
- from src.get_model_info.utils import AutoEvalColumn, model_hyperlink
12
-
13
- api = HfApi(token=os.environ.get("H4_TOKEN", None))
14
-
15
-
16
- def get_model_metadata(leaderboard_data: List[dict]):
17
- for model_data in tqdm(leaderboard_data):
18
- request_files = os.path.join(
19
- "eval-queue",
20
- model_data["model_name_for_query"] + "_eval_request_*" + ".json",
21
- )
22
- request_files = glob.glob(request_files)
23
-
24
- # Select correct request file (precision)
25
- request_file = ""
26
- if len(request_files) == 1:
27
- request_file = request_files[0]
28
- elif len(request_files) > 1:
29
- request_files = sorted(request_files, reverse=True)
30
- for tmp_request_file in request_files:
31
- with open(tmp_request_file, "r") as f:
32
- req_content = json.load(f)
33
- if (
34
- req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL"]
35
- and req_content["precision"] == model_data["Precision"].split(".")[-1]
36
- ):
37
- request_file = tmp_request_file
38
-
39
- try:
40
- with open(request_file, "r") as f:
41
- request = json.load(f)
42
- model_type = model_type_from_str(request.get("model_type", ""))
43
- model_data[AutoEvalColumn.model_type.name] = model_type.value.name
44
- model_data[AutoEvalColumn.model_type_symbol.name] = model_type.value.symbol # + ("🔺" if is_delta else "")
45
- model_data[AutoEvalColumn.license.name] = request.get("license", "?")
46
- model_data[AutoEvalColumn.likes.name] = request.get("likes", 0)
47
- model_data[AutoEvalColumn.params.name] = request.get("params", 0)
48
- except Exception:
49
- print(f"Could not find request file for {model_data['model_name_for_query']}")
50
-
51
- if model_data["model_name_for_query"] in MODEL_TYPE_METADATA:
52
- model_data[AutoEvalColumn.model_type.name] = MODEL_TYPE_METADATA[
53
- model_data["model_name_for_query"]
54
- ].value.name
55
- model_data[AutoEvalColumn.model_type_symbol.name] = MODEL_TYPE_METADATA[
56
- model_data["model_name_for_query"]
57
- ].value.symbol # + ("🔺" if is_delta else "")
58
- else:
59
- model_data[AutoEvalColumn.model_type.name] = ModelType.Unknown.value.name
60
- model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.Unknown.value.symbol
61
-
62
- # if we cannot find a request file, set license and likes to unknown
63
- model_data[AutoEvalColumn.license.name] = "?"
64
- model_data[AutoEvalColumn.likes.name] = 0
65
- model_data[AutoEvalColumn.params.name] = 0
66
-
67
-
68
- def flag_models(leaderboard_data: List[dict]):
69
- for model_data in leaderboard_data:
70
- if model_data["model_name_for_query"] in FLAGGED_MODELS:
71
- issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
72
- issue_link = model_hyperlink(
73
- FLAGGED_MODELS[model_data["model_name_for_query"]],
74
- f"See discussion #{issue_num}",
75
- )
76
- model_data[
77
- AutoEvalColumn.model.name
78
- ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
79
-
80
-
81
- def remove_forbidden_models(leaderboard_data: List[dict]):
82
- indices_to_remove = []
83
- for ix, model in enumerate(leaderboard_data):
84
- if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
85
- indices_to_remove.append(ix)
86
-
87
- for ix in reversed(indices_to_remove):
88
- leaderboard_data.pop(ix)
89
- return leaderboard_data
90
-
91
-
92
- def apply_metadata(leaderboard_data: List[dict]):
93
- leaderboard_data = remove_forbidden_models(leaderboard_data)
94
- get_model_metadata(leaderboard_data)
95
- flag_models(leaderboard_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/get_model_info/get_metadata_from_hub.py DELETED
@@ -1,19 +0,0 @@
1
- import re
2
- from huggingface_hub.hf_api import ModelInfo
3
-
4
-
5
- def get_model_size(model_info: ModelInfo, precision: str):
6
- size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
7
- try:
8
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
9
- except AttributeError:
10
- try:
11
- size_match = re.search(size_pattern, model_info.modelId.lower())
12
- model_size = size_match.group(0)
13
- model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
14
- except AttributeError:
15
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
16
-
17
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
18
- model_size = size_factor * model_size
19
- return model_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/get_model_info/hardocded_metadata/types.py DELETED
@@ -1,555 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
- from typing import Dict
4
-
5
-
6
- @dataclass
7
- class ModelInfo:
8
- name: str
9
- symbol: str # emoji
10
-
11
-
12
- class ModelType(Enum):
13
- PT = ModelInfo(name="pretrained", symbol="🟢")
14
- FT = ModelInfo(name="fine-tuned", symbol="🔶")
15
- IFT = ModelInfo(name="instruction-tuned", symbol="⭕")
16
- RL = ModelInfo(name="RL-tuned", symbol="🟦")
17
- Unknown = ModelInfo(name="", symbol="?")
18
-
19
- def to_str(self, separator=" "):
20
- return f"{self.value.symbol}{separator}{self.value.name}"
21
-
22
-
23
- MODEL_TYPE_METADATA: Dict[str, ModelType] = {
24
- "tiiuae/falcon-180B": ModelType.PT,
25
- "tiiuae/falcon-180B-chat": ModelType.RL,
26
- "microsoft/phi-1_5": ModelType.PT,
27
- "Qwen/Qwen-7B": ModelType.PT,
28
- "Qwen/Qwen-7B-Chat": ModelType.RL,
29
- "notstoic/PygmalionCoT-7b": ModelType.IFT,
30
- "aisquared/dlite-v1-355m": ModelType.IFT,
31
- "aisquared/dlite-v1-1_5b": ModelType.IFT,
32
- "aisquared/dlite-v1-774m": ModelType.IFT,
33
- "aisquared/dlite-v1-124m": ModelType.IFT,
34
- "aisquared/chopt-2_7b": ModelType.IFT,
35
- "aisquared/dlite-v2-124m": ModelType.IFT,
36
- "aisquared/dlite-v2-774m": ModelType.IFT,
37
- "aisquared/dlite-v2-1_5b": ModelType.IFT,
38
- "aisquared/chopt-1_3b": ModelType.IFT,
39
- "aisquared/dlite-v2-355m": ModelType.IFT,
40
- "augtoma/qCammel-13": ModelType.IFT,
41
- "Aspik101/Llama-2-7b-hf-instruct-pl-lora_unload": ModelType.IFT,
42
- "Aspik101/vicuna-7b-v1.3-instruct-pl-lora_unload": ModelType.IFT,
43
- "TheBloke/alpaca-lora-65B-HF": ModelType.FT,
44
- "TheBloke/tulu-7B-fp16": ModelType.IFT,
45
- "TheBloke/guanaco-7B-HF": ModelType.FT,
46
- "TheBloke/koala-7B-HF": ModelType.FT,
47
- "TheBloke/wizardLM-7B-HF": ModelType.IFT,
48
- "TheBloke/airoboros-13B-HF": ModelType.IFT,
49
- "TheBloke/koala-13B-HF": ModelType.FT,
50
- "TheBloke/Wizard-Vicuna-7B-Uncensored-HF": ModelType.FT,
51
- "TheBloke/dromedary-65b-lora-HF": ModelType.IFT,
52
- "TheBloke/wizardLM-13B-1.0-fp16": ModelType.IFT,
53
- "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16": ModelType.FT,
54
- "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16": ModelType.FT,
55
- "TheBloke/wizard-vicuna-13B-HF": ModelType.IFT,
56
- "TheBloke/UltraLM-13B-fp16": ModelType.IFT,
57
- "TheBloke/OpenAssistant-FT-7-Llama-30B-HF": ModelType.FT,
58
- "TheBloke/vicuna-13B-1.1-HF": ModelType.IFT,
59
- "TheBloke/guanaco-13B-HF": ModelType.FT,
60
- "TheBloke/guanaco-65B-HF": ModelType.FT,
61
- "TheBloke/airoboros-7b-gpt4-fp16": ModelType.IFT,
62
- "TheBloke/llama-30b-supercot-SuperHOT-8K-fp16": ModelType.IFT,
63
- "TheBloke/Llama-2-13B-fp16": ModelType.PT,
64
- "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16": ModelType.FT,
65
- "TheBloke/landmark-attention-llama7b-fp16": ModelType.IFT,
66
- "TheBloke/Planner-7B-fp16": ModelType.IFT,
67
- "TheBloke/Wizard-Vicuna-13B-Uncensored-HF": ModelType.FT,
68
- "TheBloke/gpt4-alpaca-lora-13B-HF": ModelType.IFT,
69
- "TheBloke/gpt4-x-vicuna-13B-HF": ModelType.IFT,
70
- "TheBloke/gpt4-alpaca-lora_mlp-65B-HF": ModelType.IFT,
71
- "TheBloke/tulu-13B-fp16": ModelType.IFT,
72
- "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16": ModelType.IFT,
73
- "TheBloke/Llama-2-70B-fp16": ModelType.IFT,
74
- "TheBloke/WizardLM-30B-fp16": ModelType.IFT,
75
- "TheBloke/robin-13B-v2-fp16": ModelType.FT,
76
- "TheBloke/robin-33B-v2-fp16": ModelType.FT,
77
- "TheBloke/Vicuna-13B-CoT-fp16": ModelType.IFT,
78
- "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16": ModelType.IFT,
79
- "TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16": ModelType.FT,
80
- "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16": ModelType.IFT,
81
- "TheBloke/GPlatty-30B-SuperHOT-8K-fp16": ModelType.FT,
82
- "TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16": ModelType.IFT,
83
- "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16": ModelType.IFT,
84
- "jphme/orca_mini_v2_ger_7b": ModelType.IFT,
85
- "Ejafa/vicuna_7B_vanilla_1.1": ModelType.FT,
86
- "kevinpro/Vicuna-13B-CoT": ModelType.IFT,
87
- "AlekseyKorshuk/pygmalion-6b-vicuna-chatml": ModelType.FT,
88
- "AlekseyKorshuk/chatml-pyg-v1": ModelType.FT,
89
- "concedo/Vicuzard-30B-Uncensored": ModelType.FT,
90
- "concedo/OPT-19M-ChatSalad": ModelType.FT,
91
- "concedo/Pythia-70M-ChatSalad": ModelType.FT,
92
- "digitous/13B-HyperMantis": ModelType.IFT,
93
- "digitous/Adventien-GPTJ": ModelType.FT,
94
- "digitous/Alpacino13b": ModelType.IFT,
95
- "digitous/GPT-R": ModelType.IFT,
96
- "digitous/Javelin-R": ModelType.IFT,
97
- "digitous/Javalion-GPTJ": ModelType.IFT,
98
- "digitous/Javalion-R": ModelType.IFT,
99
- "digitous/Skegma-GPTJ": ModelType.FT,
100
- "digitous/Alpacino30b": ModelType.IFT,
101
- "digitous/Janin-GPTJ": ModelType.FT,
102
- "digitous/Janin-R": ModelType.FT,
103
- "digitous/Javelin-GPTJ": ModelType.FT,
104
- "SaylorTwift/gpt2_test": ModelType.PT,
105
- "anton-l/gpt-j-tiny-random": ModelType.FT,
106
- "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca": ModelType.FT,
107
- "Lazycuber/pyg-instruct-wizardlm": ModelType.FT,
108
- "Lazycuber/Janemalion-6B": ModelType.FT,
109
- "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1": ModelType.FT,
110
- "IDEA-CCNL/Ziya-LLaMA-13B-v1": ModelType.IFT,
111
- "dsvv-cair/alpaca-cleaned-llama-30b-bf16": ModelType.FT,
112
- "gpt2-medium": ModelType.PT,
113
- "camel-ai/CAMEL-13B-Combined-Data": ModelType.IFT,
114
- "camel-ai/CAMEL-13B-Role-Playing-Data": ModelType.FT,
115
- "camel-ai/CAMEL-33B-Combined-Data": ModelType.IFT,
116
- "PygmalionAI/pygmalion-6b": ModelType.FT,
117
- "PygmalionAI/metharme-1.3b": ModelType.IFT,
118
- "PygmalionAI/pygmalion-1.3b": ModelType.FT,
119
- "PygmalionAI/pygmalion-350m": ModelType.FT,
120
- "PygmalionAI/pygmalion-2.7b": ModelType.FT,
121
- "medalpaca/medalpaca-7b": ModelType.FT,
122
- "lilloukas/Platypus-30B": ModelType.IFT,
123
- "lilloukas/GPlatty-30B": ModelType.FT,
124
- "mncai/chatdoctor": ModelType.FT,
125
- "chaoyi-wu/MedLLaMA_13B": ModelType.FT,
126
- "LoupGarou/WizardCoder-Guanaco-15B-V1.0": ModelType.IFT,
127
- "LoupGarou/WizardCoder-Guanaco-15B-V1.1": ModelType.FT,
128
- "hakurei/instruct-12b": ModelType.IFT,
129
- "hakurei/lotus-12B": ModelType.FT,
130
- "shibing624/chinese-llama-plus-13b-hf": ModelType.IFT,
131
- "shibing624/chinese-alpaca-plus-7b-hf": ModelType.IFT,
132
- "shibing624/chinese-alpaca-plus-13b-hf": ModelType.IFT,
133
- "mosaicml/mpt-7b-instruct": ModelType.IFT,
134
- "mosaicml/mpt-30b-chat": ModelType.IFT,
135
- "mosaicml/mpt-7b-storywriter": ModelType.FT,
136
- "mosaicml/mpt-30b-instruct": ModelType.IFT,
137
- "mosaicml/mpt-7b-chat": ModelType.IFT,
138
- "mosaicml/mpt-30b": ModelType.PT,
139
- "Corianas/111m": ModelType.IFT,
140
- "Corianas/Quokka_1.3b": ModelType.IFT,
141
- "Corianas/256_5epoch": ModelType.FT,
142
- "Corianas/Quokka_256m": ModelType.IFT,
143
- "Corianas/Quokka_590m": ModelType.IFT,
144
- "Corianas/gpt-j-6B-Dolly": ModelType.FT,
145
- "Corianas/Quokka_2.7b": ModelType.IFT,
146
- "cyberagent/open-calm-7b": ModelType.FT,
147
- "Aspik101/Nous-Hermes-13b-pl-lora_unload": ModelType.IFT,
148
- "THUDM/chatglm2-6b": ModelType.IFT,
149
- "MetaIX/GPT4-X-Alpasta-30b": ModelType.IFT,
150
- "NYTK/PULI-GPTrio": ModelType.PT,
151
- "EleutherAI/pythia-1.3b": ModelType.PT,
152
- "EleutherAI/pythia-2.8b-deduped": ModelType.PT,
153
- "EleutherAI/gpt-neo-125m": ModelType.PT,
154
- "EleutherAI/pythia-160m": ModelType.PT,
155
- "EleutherAI/gpt-neo-2.7B": ModelType.PT,
156
- "EleutherAI/pythia-1b-deduped": ModelType.PT,
157
- "EleutherAI/pythia-6.7b": ModelType.PT,
158
- "EleutherAI/pythia-70m-deduped": ModelType.PT,
159
- "EleutherAI/gpt-neox-20b": ModelType.PT,
160
- "EleutherAI/pythia-1.4b-deduped": ModelType.PT,
161
- "EleutherAI/pythia-2.7b": ModelType.PT,
162
- "EleutherAI/pythia-6.9b-deduped": ModelType.PT,
163
- "EleutherAI/pythia-70m": ModelType.PT,
164
- "EleutherAI/gpt-j-6b": ModelType.PT,
165
- "EleutherAI/pythia-12b-deduped": ModelType.PT,
166
- "EleutherAI/gpt-neo-1.3B": ModelType.PT,
167
- "EleutherAI/pythia-410m-deduped": ModelType.PT,
168
- "EleutherAI/pythia-160m-deduped": ModelType.PT,
169
- "EleutherAI/polyglot-ko-12.8b": ModelType.PT,
170
- "EleutherAI/pythia-12b": ModelType.PT,
171
- "roneneldan/TinyStories-33M": ModelType.PT,
172
- "roneneldan/TinyStories-28M": ModelType.PT,
173
- "roneneldan/TinyStories-1M": ModelType.PT,
174
- "roneneldan/TinyStories-8M": ModelType.PT,
175
- "roneneldan/TinyStories-3M": ModelType.PT,
176
- "jerryjalapeno/nart-100k-7b": ModelType.FT,
177
- "lmsys/vicuna-13b-v1.3": ModelType.IFT,
178
- "lmsys/vicuna-7b-v1.3": ModelType.IFT,
179
- "lmsys/vicuna-13b-v1.1": ModelType.IFT,
180
- "lmsys/vicuna-13b-delta-v1.1": ModelType.IFT,
181
- "lmsys/vicuna-7b-delta-v1.1": ModelType.IFT,
182
- "abhiramtirumala/DialoGPT-sarcastic-medium": ModelType.FT,
183
- "haonan-li/bactrian-x-llama-13b-merged": ModelType.IFT,
184
- "Gryphe/MythoLogic-13b": ModelType.IFT,
185
- "Gryphe/MythoBoros-13b": ModelType.IFT,
186
- "pillowtalks-ai/delta13b": ModelType.FT,
187
- "wannaphong/openthaigpt-0.1.0-beta-full-model_for_open_llm_leaderboard": ModelType.FT,
188
- "bigscience/bloom-7b1": ModelType.PT,
189
- "bigcode/tiny_starcoder_py": ModelType.PT,
190
- "bigcode/starcoderplus": ModelType.FT,
191
- "bigcode/gpt_bigcode-santacoder": ModelType.PT,
192
- "bigcode/starcoder": ModelType.PT,
193
- "Open-Orca/OpenOrca-Preview1-13B": ModelType.IFT,
194
- "microsoft/DialoGPT-large": ModelType.FT,
195
- "microsoft/DialoGPT-small": ModelType.FT,
196
- "microsoft/DialoGPT-medium": ModelType.FT,
197
- "microsoft/CodeGPT-small-py": ModelType.FT,
198
- "Tincando/fiction_story_generator": ModelType.FT,
199
- "Pirr/pythia-13b-deduped-green_devil": ModelType.FT,
200
- "Aeala/GPT4-x-AlpacaDente2-30b": ModelType.FT,
201
- "Aeala/GPT4-x-AlpacaDente-30b": ModelType.FT,
202
- "Aeala/GPT4-x-Alpasta-13b": ModelType.FT,
203
- "Aeala/VicUnlocked-alpaca-30b": ModelType.IFT,
204
- "Tap-M/Luna-AI-Llama2-Uncensored": ModelType.FT,
205
- "illuin/test-custom-llama": ModelType.FT,
206
- "dvruette/oasst-llama-13b-2-epochs": ModelType.FT,
207
- "dvruette/oasst-gpt-neox-20b-1000-steps": ModelType.FT,
208
- "dvruette/llama-13b-pretrained-dropout": ModelType.PT,
209
- "dvruette/llama-13b-pretrained": ModelType.PT,
210
- "dvruette/llama-13b-pretrained-sft-epoch-1": ModelType.FT,
211
- "dvruette/llama-13b-pretrained-sft-do2": ModelType.FT,
212
- "dvruette/oasst-gpt-neox-20b-3000-steps": ModelType.FT,
213
- "dvruette/oasst-pythia-12b-pretrained-sft": ModelType.FT,
214
- "dvruette/oasst-pythia-6.9b-4000-steps": ModelType.FT,
215
- "dvruette/gpt-neox-20b-full-precision": ModelType.FT,
216
- "dvruette/oasst-llama-13b-1000-steps": ModelType.FT,
217
- "openlm-research/open_llama_7b_700bt_preview": ModelType.PT,
218
- "openlm-research/open_llama_7b": ModelType.PT,
219
- "openlm-research/open_llama_7b_v2": ModelType.PT,
220
- "openlm-research/open_llama_3b": ModelType.PT,
221
- "openlm-research/open_llama_13b": ModelType.PT,
222
- "openlm-research/open_llama_3b_v2": ModelType.PT,
223
- "PocketDoc/Dans-PileOfSets-Mk1-llama-13b-merged": ModelType.IFT,
224
- "GeorgiaTechResearchInstitute/galpaca-30b": ModelType.IFT,
225
- "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct": ModelType.IFT,
226
- "databricks/dolly-v2-7b": ModelType.IFT,
227
- "databricks/dolly-v2-3b": ModelType.IFT,
228
- "databricks/dolly-v2-12b": ModelType.IFT,
229
- "Rachneet/gpt2-xl-alpaca": ModelType.FT,
230
- "Locutusque/gpt2-conversational-or-qa": ModelType.FT,
231
- "psyche/kogpt": ModelType.FT,
232
- "NbAiLab/nb-gpt-j-6B-alpaca": ModelType.IFT,
233
- "Mikael110/llama-2-7b-guanaco-fp16": ModelType.FT,
234
- "Mikael110/llama-2-13b-guanaco-fp16": ModelType.FT,
235
- "Fredithefish/CrimsonPajama": ModelType.IFT,
236
- "Fredithefish/RedPajama-INCITE-Chat-3B-ShareGPT-11K": ModelType.FT,
237
- "Fredithefish/ScarletPajama-3B-HF": ModelType.FT,
238
- "Fredithefish/RedPajama-INCITE-Chat-3B-Instruction-Tuning-with-GPT-4": ModelType.IFT,
239
- "acrastt/RedPajama-INCITE-Chat-Instruct-3B-V1": ModelType.IFT,
240
- "eachadea/vicuna-13b-1.1": ModelType.FT,
241
- "eachadea/vicuna-7b-1.1": ModelType.FT,
242
- "eachadea/vicuna-13b": ModelType.FT,
243
- "openaccess-ai-collective/wizard-mega-13b": ModelType.IFT,
244
- "openaccess-ai-collective/manticore-13b": ModelType.IFT,
245
- "openaccess-ai-collective/manticore-30b-chat-pyg-alpha": ModelType.IFT,
246
- "openaccess-ai-collective/minotaur-13b": ModelType.IFT,
247
- "openaccess-ai-collective/minotaur-13b-fixed": ModelType.IFT,
248
- "openaccess-ai-collective/hippogriff-30b-chat": ModelType.IFT,
249
- "openaccess-ai-collective/manticore-13b-chat-pyg": ModelType.IFT,
250
- "pythainlp/wangchanglm-7.5B-sft-enth": ModelType.IFT,
251
- "pythainlp/wangchanglm-7.5B-sft-en-sharded": ModelType.IFT,
252
- "euclaise/gpt-neox-122m-minipile-digits": ModelType.FT,
253
- "stabilityai/StableBeluga1-Delta": ModelType.IFT,
254
- "stabilityai/stablelm-tuned-alpha-7b": ModelType.IFT,
255
- "stabilityai/StableBeluga2": ModelType.IFT,
256
- "stabilityai/StableBeluga-13B": ModelType.IFT,
257
- "stabilityai/StableBeluga-7B": ModelType.IFT,
258
- "stabilityai/stablelm-base-alpha-7b": ModelType.PT,
259
- "stabilityai/stablelm-base-alpha-3b": ModelType.PT,
260
- "stabilityai/stablelm-tuned-alpha-3b": ModelType.IFT,
261
- "alibidaran/medical_transcription_generator": ModelType.FT,
262
- "CalderaAI/30B-Lazarus": ModelType.IFT,
263
- "CalderaAI/13B-BlueMethod": ModelType.IFT,
264
- "CalderaAI/13B-Ouroboros": ModelType.IFT,
265
- "KoboldAI/OPT-13B-Erebus": ModelType.FT,
266
- "KoboldAI/GPT-J-6B-Janeway": ModelType.FT,
267
- "KoboldAI/GPT-J-6B-Shinen": ModelType.FT,
268
- "KoboldAI/fairseq-dense-2.7B": ModelType.PT,
269
- "KoboldAI/OPT-6B-nerys-v2": ModelType.FT,
270
- "KoboldAI/GPT-NeoX-20B-Skein": ModelType.FT,
271
- "KoboldAI/PPO_Pygway-6b-Mix": ModelType.FT,
272
- "KoboldAI/fairseq-dense-6.7B": ModelType.PT,
273
- "KoboldAI/fairseq-dense-125M": ModelType.PT,
274
- "KoboldAI/OPT-13B-Nerybus-Mix": ModelType.FT,
275
- "KoboldAI/OPT-2.7B-Erebus": ModelType.FT,
276
- "KoboldAI/OPT-350M-Nerys-v2": ModelType.FT,
277
- "KoboldAI/OPT-2.7B-Nerys-v2": ModelType.FT,
278
- "KoboldAI/OPT-2.7B-Nerybus-Mix": ModelType.FT,
279
- "KoboldAI/OPT-13B-Nerys-v2": ModelType.FT,
280
- "KoboldAI/GPT-NeoX-20B-Erebus": ModelType.FT,
281
- "KoboldAI/OPT-6.7B-Erebus": ModelType.FT,
282
- "KoboldAI/fairseq-dense-355M": ModelType.PT,
283
- "KoboldAI/OPT-6.7B-Nerybus-Mix": ModelType.FT,
284
- "KoboldAI/GPT-J-6B-Adventure": ModelType.FT,
285
- "KoboldAI/OPT-350M-Erebus": ModelType.FT,
286
- "KoboldAI/GPT-J-6B-Skein": ModelType.FT,
287
- "KoboldAI/OPT-30B-Erebus": ModelType.FT,
288
- "klosax/pythia-160m-deduped-step92k-193bt": ModelType.PT,
289
- "klosax/open_llama_3b_350bt_preview": ModelType.PT,
290
- "klosax/openllama-3b-350bt": ModelType.PT,
291
- "klosax/pythia-70m-deduped-step44k-92bt": ModelType.PT,
292
- "klosax/open_llama_13b_600bt_preview": ModelType.PT,
293
- "klosax/open_llama_7b_400bt_preview": ModelType.PT,
294
- "kfkas/Llama-2-ko-7b-Chat": ModelType.IFT,
295
- "WeOpenML/Alpaca-7B-v1": ModelType.IFT,
296
- "WeOpenML/PandaLM-Alpaca-7B-v1": ModelType.IFT,
297
- "TFLai/gpt2-turkish-uncased": ModelType.FT,
298
- "ehartford/WizardLM-13B-Uncensored": ModelType.IFT,
299
- "ehartford/dolphin-llama-13b": ModelType.IFT,
300
- "ehartford/Wizard-Vicuna-30B-Uncensored": ModelType.FT,
301
- "ehartford/WizardLM-30B-Uncensored": ModelType.IFT,
302
- "ehartford/Wizard-Vicuna-13B-Uncensored": ModelType.FT,
303
- "ehartford/WizardLM-7B-Uncensored": ModelType.IFT,
304
- "ehartford/based-30b": ModelType.FT,
305
- "ehartford/Wizard-Vicuna-7B-Uncensored": ModelType.FT,
306
- "wahaha1987/llama_7b_sharegpt94k_fastchat": ModelType.FT,
307
- "wahaha1987/llama_13b_sharegpt94k_fastchat": ModelType.FT,
308
- "OpenAssistant/oasst-sft-1-pythia-12b": ModelType.FT,
309
- "OpenAssistant/stablelm-7b-sft-v7-epoch-3": ModelType.IFT,
310
- "OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5": ModelType.FT,
311
- "OpenAssistant/pythia-12b-sft-v8-2.5k-steps": ModelType.IFT,
312
- "OpenAssistant/pythia-12b-sft-v8-7k-steps": ModelType.IFT,
313
- "OpenAssistant/pythia-12b-pre-v8-12.5k-steps": ModelType.IFT,
314
- "OpenAssistant/llama2-13b-orca-8k-3319": ModelType.IFT,
315
- "junelee/wizard-vicuna-13b": ModelType.FT,
316
- "BreadAi/gpt-YA-1-1_160M": ModelType.PT,
317
- "BreadAi/MuseCan": ModelType.PT,
318
- "BreadAi/MusePy-1-2": ModelType.PT,
319
- "BreadAi/DiscordPy": ModelType.PT,
320
- "BreadAi/PM_modelV2": ModelType.PT,
321
- "BreadAi/gpt-Youtube": ModelType.PT,
322
- "BreadAi/StoryPy": ModelType.FT,
323
- "julianweng/Llama-2-7b-chat-orcah": ModelType.FT,
324
- "AGI-inc/lora_moe_7b_baseline": ModelType.FT,
325
- "AGI-inc/lora_moe_7b": ModelType.FT,
326
- "togethercomputer/GPT-NeoXT-Chat-Base-20B": ModelType.IFT,
327
- "togethercomputer/RedPajama-INCITE-Chat-7B-v0.1": ModelType.IFT,
328
- "togethercomputer/RedPajama-INCITE-Instruct-7B-v0.1": ModelType.IFT,
329
- "togethercomputer/RedPajama-INCITE-7B-Base": ModelType.PT,
330
- "togethercomputer/RedPajama-INCITE-7B-Instruct": ModelType.IFT,
331
- "togethercomputer/RedPajama-INCITE-Base-3B-v1": ModelType.PT,
332
- "togethercomputer/Pythia-Chat-Base-7B": ModelType.IFT,
333
- "togethercomputer/RedPajama-INCITE-Base-7B-v0.1": ModelType.PT,
334
- "togethercomputer/GPT-JT-6B-v1": ModelType.IFT,
335
- "togethercomputer/GPT-JT-6B-v0": ModelType.IFT,
336
- "togethercomputer/RedPajama-INCITE-Chat-3B-v1": ModelType.IFT,
337
- "togethercomputer/RedPajama-INCITE-7B-Chat": ModelType.IFT,
338
- "togethercomputer/RedPajama-INCITE-Instruct-3B-v1": ModelType.IFT,
339
- "Writer/camel-5b-hf": ModelType.IFT,
340
- "Writer/palmyra-base": ModelType.PT,
341
- "MBZUAI/LaMini-GPT-1.5B": ModelType.IFT,
342
- "MBZUAI/lamini-cerebras-111m": ModelType.IFT,
343
- "MBZUAI/lamini-neo-1.3b": ModelType.IFT,
344
- "MBZUAI/lamini-cerebras-1.3b": ModelType.IFT,
345
- "MBZUAI/lamini-cerebras-256m": ModelType.IFT,
346
- "MBZUAI/LaMini-GPT-124M": ModelType.IFT,
347
- "MBZUAI/lamini-neo-125m": ModelType.IFT,
348
- "TehVenom/DiffMerge-DollyGPT-Pygmalion": ModelType.FT,
349
- "TehVenom/PPO_Shygmalion-6b": ModelType.FT,
350
- "TehVenom/Dolly_Shygmalion-6b-Dev_V8P2": ModelType.FT,
351
- "TehVenom/Pygmalion_AlpacaLora-7b": ModelType.FT,
352
- "TehVenom/PPO_Pygway-V8p4_Dev-6b": ModelType.FT,
353
- "TehVenom/Dolly_Malion-6b": ModelType.FT,
354
- "TehVenom/PPO_Shygmalion-V8p4_Dev-6b": ModelType.FT,
355
- "TehVenom/ChanMalion": ModelType.FT,
356
- "TehVenom/GPT-J-Pyg_PPO-6B": ModelType.IFT,
357
- "TehVenom/Pygmalion-13b-Merged": ModelType.FT,
358
- "TehVenom/Metharme-13b-Merged": ModelType.IFT,
359
- "TehVenom/Dolly_Shygmalion-6b": ModelType.FT,
360
- "TehVenom/GPT-J-Pyg_PPO-6B-Dev-V8p4": ModelType.IFT,
361
- "georgesung/llama2_7b_chat_uncensored": ModelType.FT,
362
- "vicgalle/gpt2-alpaca": ModelType.IFT,
363
- "vicgalle/alpaca-7b": ModelType.FT,
364
- "vicgalle/gpt2-alpaca-gpt4": ModelType.IFT,
365
- "facebook/opt-350m": ModelType.PT,
366
- "facebook/opt-125m": ModelType.PT,
367
- "facebook/xglm-4.5B": ModelType.PT,
368
- "facebook/opt-2.7b": ModelType.PT,
369
- "facebook/opt-6.7b": ModelType.PT,
370
- "facebook/galactica-30b": ModelType.PT,
371
- "facebook/opt-13b": ModelType.PT,
372
- "facebook/opt-66b": ModelType.PT,
373
- "facebook/xglm-7.5B": ModelType.PT,
374
- "facebook/xglm-564M": ModelType.PT,
375
- "facebook/opt-30b": ModelType.PT,
376
- "golaxy/gogpt-7b": ModelType.FT,
377
- "golaxy/gogpt2-7b": ModelType.FT,
378
- "golaxy/gogpt-7b-bloom": ModelType.FT,
379
- "golaxy/gogpt-3b-bloom": ModelType.FT,
380
- "psmathur/orca_mini_v2_7b": ModelType.IFT,
381
- "psmathur/orca_mini_7b": ModelType.IFT,
382
- "psmathur/orca_mini_3b": ModelType.IFT,
383
- "psmathur/orca_mini_v2_13b": ModelType.IFT,
384
- "gpt2-xl": ModelType.PT,
385
- "lxe/Cerebras-GPT-2.7B-Alpaca-SP": ModelType.FT,
386
- "Monero/Manticore-13b-Chat-Pyg-Guanaco": ModelType.FT,
387
- "Monero/WizardLM-Uncensored-SuperCOT-StoryTelling-30b": ModelType.IFT,
388
- "Monero/WizardLM-13b-OpenAssistant-Uncensored": ModelType.IFT,
389
- "Monero/WizardLM-30B-Uncensored-Guanaco-SuperCOT-30b": ModelType.IFT,
390
- "jzjiao/opt-1.3b-rlhf": ModelType.FT,
391
- "HuggingFaceH4/starchat-beta": ModelType.IFT,
392
- "KnutJaegersberg/gpt-2-xl-EvolInstruct": ModelType.IFT,
393
- "KnutJaegersberg/megatron-GPT-2-345m-EvolInstruct": ModelType.IFT,
394
- "KnutJaegersberg/galactica-orca-wizardlm-1.3b": ModelType.IFT,
395
- "openchat/openchat_8192": ModelType.IFT,
396
- "openchat/openchat_v2": ModelType.IFT,
397
- "openchat/openchat_v2_w": ModelType.IFT,
398
- "ausboss/llama-13b-supercot": ModelType.IFT,
399
- "ausboss/llama-30b-supercot": ModelType.IFT,
400
- "Neko-Institute-of-Science/metharme-7b": ModelType.IFT,
401
- "Neko-Institute-of-Science/pygmalion-7b": ModelType.FT,
402
- "SebastianSchramm/Cerebras-GPT-111M-instruction": ModelType.IFT,
403
- "victor123/WizardLM-13B-1.0": ModelType.IFT,
404
- "OpenBuddy/openbuddy-openllama-13b-v7-fp16": ModelType.FT,
405
- "OpenBuddy/openbuddy-llama2-13b-v8.1-fp16": ModelType.FT,
406
- "OpenBuddyEA/openbuddy-llama-30b-v7.1-bf16": ModelType.FT,
407
- "baichuan-inc/Baichuan-7B": ModelType.PT,
408
- "tiiuae/falcon-40b-instruct": ModelType.IFT,
409
- "tiiuae/falcon-40b": ModelType.PT,
410
- "tiiuae/falcon-7b": ModelType.PT,
411
- "YeungNLP/firefly-llama-13b": ModelType.FT,
412
- "YeungNLP/firefly-llama-13b-v1.2": ModelType.FT,
413
- "YeungNLP/firefly-llama2-13b": ModelType.FT,
414
- "YeungNLP/firefly-ziya-13b": ModelType.FT,
415
- "shaohang/Sparse0.5_OPT-1.3": ModelType.FT,
416
- "xzuyn/Alpacino-SuperCOT-13B": ModelType.IFT,
417
- "xzuyn/MedicWizard-7B": ModelType.FT,
418
- "xDAN-AI/xDAN_13b_l2_lora": ModelType.FT,
419
- "beomi/KoAlpaca-Polyglot-5.8B": ModelType.FT,
420
- "beomi/llama-2-ko-7b": ModelType.IFT,
421
- "Salesforce/codegen-6B-multi": ModelType.PT,
422
- "Salesforce/codegen-16B-nl": ModelType.PT,
423
- "Salesforce/codegen-6B-nl": ModelType.PT,
424
- "ai-forever/rugpt3large_based_on_gpt2": ModelType.FT,
425
- "gpt2-large": ModelType.PT,
426
- "frank098/orca_mini_3b_juniper": ModelType.FT,
427
- "frank098/WizardLM_13B_juniper": ModelType.FT,
428
- "FPHam/Free_Sydney_13b_HF": ModelType.FT,
429
- "huggingface/llama-13b": ModelType.PT,
430
- "huggingface/llama-7b": ModelType.PT,
431
- "huggingface/llama-65b": ModelType.PT,
432
- "huggingface/llama-30b": ModelType.PT,
433
- "Henk717/chronoboros-33B": ModelType.IFT,
434
- "jondurbin/airoboros-13b-gpt4-1.4": ModelType.IFT,
435
- "jondurbin/airoboros-7b": ModelType.IFT,
436
- "jondurbin/airoboros-7b-gpt4": ModelType.IFT,
437
- "jondurbin/airoboros-7b-gpt4-1.1": ModelType.IFT,
438
- "jondurbin/airoboros-7b-gpt4-1.2": ModelType.IFT,
439
- "jondurbin/airoboros-7b-gpt4-1.3": ModelType.IFT,
440
- "jondurbin/airoboros-7b-gpt4-1.4": ModelType.IFT,
441
- "jondurbin/airoboros-l2-7b-gpt4-1.4.1": ModelType.IFT,
442
- "jondurbin/airoboros-l2-13b-gpt4-1.4.1": ModelType.IFT,
443
- "jondurbin/airoboros-l2-70b-gpt4-1.4.1": ModelType.IFT,
444
- "jondurbin/airoboros-13b": ModelType.IFT,
445
- "jondurbin/airoboros-33b-gpt4-1.4": ModelType.IFT,
446
- "jondurbin/airoboros-33b-gpt4-1.2": ModelType.IFT,
447
- "jondurbin/airoboros-65b-gpt4-1.2": ModelType.IFT,
448
- "ariellee/SuperPlatty-30B": ModelType.IFT,
449
- "danielhanchen/open_llama_3b_600bt_preview": ModelType.FT,
450
- "cerebras/Cerebras-GPT-256M": ModelType.PT,
451
- "cerebras/Cerebras-GPT-1.3B": ModelType.PT,
452
- "cerebras/Cerebras-GPT-13B": ModelType.PT,
453
- "cerebras/Cerebras-GPT-2.7B": ModelType.PT,
454
- "cerebras/Cerebras-GPT-111M": ModelType.PT,
455
- "cerebras/Cerebras-GPT-6.7B": ModelType.PT,
456
- "Yhyu13/oasst-rlhf-2-llama-30b-7k-steps-hf": ModelType.RL,
457
- "Yhyu13/llama-30B-hf-openassitant": ModelType.FT,
458
- "NousResearch/Nous-Hermes-Llama2-13b": ModelType.IFT,
459
- "NousResearch/Nous-Hermes-llama-2-7b": ModelType.IFT,
460
- "NousResearch/Redmond-Puffin-13B": ModelType.IFT,
461
- "NousResearch/Nous-Hermes-13b": ModelType.IFT,
462
- "project-baize/baize-v2-7b": ModelType.IFT,
463
- "project-baize/baize-v2-13b": ModelType.IFT,
464
- "LLMs/WizardLM-13B-V1.0": ModelType.FT,
465
- "LLMs/AlpacaGPT4-7B-elina": ModelType.FT,
466
- "wenge-research/yayi-7b": ModelType.FT,
467
- "wenge-research/yayi-7b-llama2": ModelType.FT,
468
- "wenge-research/yayi-13b-llama2": ModelType.FT,
469
- "yhyhy3/open_llama_7b_v2_med_instruct": ModelType.IFT,
470
- "llama-anon/instruct-13b": ModelType.IFT,
471
- "huggingtweets/jerma985": ModelType.FT,
472
- "huggingtweets/gladosystem": ModelType.FT,
473
- "huggingtweets/bladeecity-jerma985": ModelType.FT,
474
- "huggyllama/llama-13b": ModelType.PT,
475
- "huggyllama/llama-65b": ModelType.PT,
476
- "FabbriSimo01/Facebook_opt_1.3b_Quantized": ModelType.PT,
477
- "upstage/Llama-2-70b-instruct": ModelType.IFT,
478
- "upstage/Llama-2-70b-instruct-1024": ModelType.IFT,
479
- "upstage/llama-65b-instruct": ModelType.IFT,
480
- "upstage/llama-30b-instruct-2048": ModelType.IFT,
481
- "upstage/llama-30b-instruct": ModelType.IFT,
482
- "WizardLM/WizardLM-13B-1.0": ModelType.IFT,
483
- "WizardLM/WizardLM-13B-V1.1": ModelType.IFT,
484
- "WizardLM/WizardLM-13B-V1.2": ModelType.IFT,
485
- "WizardLM/WizardLM-30B-V1.0": ModelType.IFT,
486
- "WizardLM/WizardCoder-15B-V1.0": ModelType.IFT,
487
- "gpt2": ModelType.PT,
488
- "keyfan/vicuna-chinese-replication-v1.1": ModelType.IFT,
489
- "nthngdy/pythia-owt2-70m-100k": ModelType.FT,
490
- "nthngdy/pythia-owt2-70m-50k": ModelType.FT,
491
- "quantumaikr/KoreanLM-hf": ModelType.FT,
492
- "quantumaikr/open_llama_7b_hf": ModelType.FT,
493
- "quantumaikr/QuantumLM-70B-hf": ModelType.IFT,
494
- "MayaPH/FinOPT-Lincoln": ModelType.FT,
495
- "MayaPH/FinOPT-Franklin": ModelType.FT,
496
- "MayaPH/GodziLLa-30B": ModelType.IFT,
497
- "MayaPH/GodziLLa-30B-plus": ModelType.IFT,
498
- "MayaPH/FinOPT-Washington": ModelType.FT,
499
- "ogimgio/gpt-neo-125m-neurallinguisticpioneers": ModelType.FT,
500
- "layoric/llama-2-13b-code-alpaca": ModelType.FT,
501
- "CobraMamba/mamba-gpt-3b": ModelType.FT,
502
- "CobraMamba/mamba-gpt-3b-v2": ModelType.FT,
503
- "CobraMamba/mamba-gpt-3b-v3": ModelType.FT,
504
- "timdettmers/guanaco-33b-merged": ModelType.FT,
505
- "elinas/chronos-33b": ModelType.IFT,
506
- "heegyu/RedTulu-Uncensored-3B-0719": ModelType.IFT,
507
- "heegyu/WizardVicuna-Uncensored-3B-0719": ModelType.IFT,
508
- "heegyu/WizardVicuna-3B-0719": ModelType.IFT,
509
- "meta-llama/Llama-2-7b-chat-hf": ModelType.RL,
510
- "meta-llama/Llama-2-7b-hf": ModelType.PT,
511
- "meta-llama/Llama-2-13b-chat-hf": ModelType.RL,
512
- "meta-llama/Llama-2-13b-hf": ModelType.PT,
513
- "meta-llama/Llama-2-70b-chat-hf": ModelType.RL,
514
- "meta-llama/Llama-2-70b-hf": ModelType.PT,
515
- "xhyi/PT_GPTNEO350_ATG": ModelType.FT,
516
- "h2oai/h2ogpt-gm-oasst1-en-1024-20b": ModelType.FT,
517
- "h2oai/h2ogpt-gm-oasst1-en-1024-open-llama-7b-preview-400bt": ModelType.FT,
518
- "h2oai/h2ogpt-oig-oasst1-512-6_9b": ModelType.IFT,
519
- "h2oai/h2ogpt-oasst1-512-12b": ModelType.IFT,
520
- "h2oai/h2ogpt-oig-oasst1-256-6_9b": ModelType.IFT,
521
- "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt": ModelType.FT,
522
- "h2oai/h2ogpt-oasst1-512-20b": ModelType.IFT,
523
- "h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2": ModelType.FT,
524
- "h2oai/h2ogpt-gm-oasst1-en-1024-12b": ModelType.FT,
525
- "h2oai/h2ogpt-gm-oasst1-multilang-1024-20b": ModelType.FT,
526
- "bofenghuang/vigogne-13b-instruct": ModelType.IFT,
527
- "bofenghuang/vigogne-13b-chat": ModelType.FT,
528
- "bofenghuang/vigogne-2-7b-instruct": ModelType.IFT,
529
- "bofenghuang/vigogne-7b-instruct": ModelType.IFT,
530
- "bofenghuang/vigogne-7b-chat": ModelType.FT,
531
- "Vmware/open-llama-7b-v2-open-instruct": ModelType.IFT,
532
- "VMware/open-llama-0.7T-7B-open-instruct-v1.1": ModelType.IFT,
533
- "ewof/koishi-instruct-3b": ModelType.IFT,
534
- "gywy/llama2-13b-chinese-v1": ModelType.FT,
535
- "GOAT-AI/GOAT-7B-Community": ModelType.FT,
536
- "psyche/kollama2-7b": ModelType.FT,
537
- "TheTravellingEngineer/llama2-7b-hf-guanaco": ModelType.FT,
538
- "beaugogh/pythia-1.4b-deduped-sharegpt": ModelType.FT,
539
- "augtoma/qCammel-70-x": ModelType.IFT,
540
- "Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_unload": ModelType.IFT,
541
- "anhnv125/pygmalion-6b-roleplay": ModelType.FT,
542
- "64bits/LexPodLM-13B": ModelType.FT,
543
- }
544
-
545
-
546
def model_type_from_str(type):
    """Map a free-form model-type label to a ``ModelType`` member.

    A label matches a category when it contains either the category's text
    tag or its emoji. Categories are tried in a fixed order and the first
    match wins; a label matching none of them yields ``ModelType.Unknown``.
    """
    # (text tag, emoji, ModelType attribute), in the order they must be tested.
    markers = (
        ("fine-tuned", "🔶", "FT"),
        ("pretrained", "🟢", "PT"),
        ("RL-tuned", "🟦", "RL"),
        ("instruction-tuned", "⭕", "IFT"),
    )
    for tag, emoji, member_name in markers:
        if tag in type or emoji in type:
            return getattr(ModelType, member_name)
    return ModelType.Unknown
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{get_model_info/hardocded_metadata/flags.py → leaderboard/filter_models.py} RENAMED
@@ -1,3 +1,6 @@
 
 
 
1
  # Models which have been flagged by users as being problematic for one reason or another
2
  # (Model name to forum discussion link)
3
  FLAGGED_MODELS = {
@@ -16,3 +19,32 @@ FLAGGED_MODELS = {
16
  DO_NOT_SUBMIT_MODELS = [
17
  "Voicelab/trurl-2-13b", # trained on MMLU
18
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.display.formatting import model_hyperlink
2
+ from src.display.utils import AutoEvalColumn
3
+
4
  # Models which have been flagged by users as being problematic for one reason or another
5
  # (Model name to forum discussion link)
6
  FLAGGED_MODELS = {
 
19
  DO_NOT_SUBMIT_MODELS = [
20
  "Voicelab/trurl-2-13b", # trained on MMLU
21
  ]
22
+
23
+
24
def flag_models(leaderboard_data: list[dict]):
    """Append a "flagged" notice to the displayed name of every flagged model.

    Rows are edited in place and nothing is returned. A row is flagged when
    its ``"model_name_for_query"`` value is a key of ``FLAGGED_MODELS``; the
    mapped value is the discussion URL, whose last path segment is shown as
    the discussion number.
    """
    for row in leaderboard_data:
        query_name = row["model_name_for_query"]
        if query_name not in FLAGGED_MODELS:
            continue

        discussion_url = FLAGGED_MODELS[query_name]
        issue_num = discussion_url.split("/")[-1]
        issue_link = model_hyperlink(discussion_url, f"See discussion #{issue_num}")

        name_column = AutoEvalColumn.model.name
        row[name_column] = f"{row[name_column]} has been flagged! {issue_link}"
35
+
36
+
37
def remove_forbidden_models(leaderboard_data: list[dict]):
    """Drop every row whose model is listed in ``DO_NOT_SUBMIT_MODELS``.

    The list is filtered in place (callers holding a reference see the
    change) and the very same list object is returned.
    """
    # Slice assignment keeps the caller's list object while replacing its content.
    leaderboard_data[:] = [
        row for row in leaderboard_data if row["model_name_for_query"] not in DO_NOT_SUBMIT_MODELS
    ]
    return leaderboard_data
46
+
47
+
48
def filter_models(leaderboard_data: list[dict]):
    """Apply all leaderboard filters to *leaderboard_data* in place.

    Forbidden models are removed first, then the remaining flagged models get
    their notice. Nothing is returned.
    """
    remaining_rows = remove_forbidden_models(leaderboard_data)
    flag_models(remaining_rows)
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import math
4
+ import glob
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Tuple
7
+
8
+ import dateutil
9
+ import numpy as np
10
+
11
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks
12
+ from src.display.formatting import make_clickable_model
13
+ from src.submission.check_validity import is_model_on_hub
14
+
15
+
16
@dataclass
class EvalResult:
    """Scores and metadata for one evaluated model at one precision.

    Instances are built from a results JSON file with ``init_from_json_file``,
    optionally enriched from the matching request file with
    ``update_with_request_file``, and flattened into a leaderboard row with
    ``to_dict``.
    """

    eval_name: str  # "<org>_<model>_<precision>", or "<model>_<precision>" without an org
    full_model: str  # "<org>/<model>", or the bare model name without an org
    org: str
    model: str
    revision: str
    results: dict  # benchmark name -> score
    precision: str = ""
    model_type: ModelType = ModelType.Unknown
    weight_type: str = "Original"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""
    still_on_hub: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build an ``EvalResult`` from one results JSON file.

        Only the benchmarks present in this file are stored in ``results``
        (some results are split across several files).
        """
        with open(json_filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        # We manage the legacy config format
        config = data.get("config", data.get("config_general", None))

        # Precision
        precision = config.get("model_dtype")
        if precision == "None":
            precision = "GPTQ"

        # Get model and org. Fall back to "model_args" both when "model_name"
        # is absent and when it is present but null.
        org_and_model = config.get("model_name")
        if org_and_model is None:
            org_and_model = config.get("model_args")
        org_and_model = org_and_model.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision}"

        still_on_hub = is_model_on_hub(
            "/".join(org_and_model), config.get("model_sha", "main"), trust_remote_code=True
        )[0]

        # Extract results available in this file (some results are split in several files)
        results = {}
        for task in Tasks:
            task = task.value

            # We skip old mmlu entries
            if task.benchmark == "hendrycksTest":
                versions = data["versions"]
                mmlu_keys = ("harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra")
                if any(versions.get(mmlu_k) == 0 for mmlu_k in mmlu_keys):
                    continue

            # Some truthfulQA values are NaNs
            if task.benchmark == "truthfulqa:mc" and task.benchmark in data["results"]:
                if math.isnan(float(data["results"][task.benchmark][task.metric])):
                    results[task.benchmark] = 0.0
                    continue

            # We average all scores of a given metric (mostly for mmlu)
            accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
            if not accs or any(acc is None for acc in accs):
                continue

            results[task.benchmark] = np.mean(accs) * 100.0

        return cls(
            eval_name=result_key,
            full_model="/".join(org_and_model),
            org=org,
            model=model,
            results=results,
            precision=precision,  # todo model_type=, weight_type=
            revision=config.get("model_sha", ""),
            date=config.get("submission_date", ""),
            still_on_hub=still_on_hub,
        )

    def update_with_request_file(self):
        """Fill model type, license, likes and parameter count from the request file.

        Best effort: on any failure (including no request file being found) a
        message is printed and the fields already set are left as they are.
        """
        request_file = get_request_file_for_model(self.full_model, self.precision)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
        except Exception:
            print(f"Could not find request file for {self.org}/{self.model}")

    def to_dict(self):
        """Flatten this result into a leaderboard row.

        Raises:
            KeyError: if the score of any task in ``Tasks`` is missing from
                ``results``.
        """
        # Missing benchmarks lower the average: the divisor is always the full task count.
        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.dummy.name: self.full_model,
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict
136
+
137
+
138
def get_request_file_for_model(model_name, precision, requests_path="eval-queue"):
    """Return the path of the request file matching a model and precision.

    Looks for ``<requests_path>/<model_name>_eval_request_*.json`` files and
    keeps those whose ``status`` is ``FINISHED`` or ``PENDING_NEW_EVAL`` and
    whose ``precision`` equals the last dot-separated component of
    *precision* (so ``"torch.float16"`` matches ``"float16"``).

    Args:
        model_name: Model identifier, used as the path prefix of the files.
        precision: Precision string of the evaluation; ``None`` matches nothing.
        requests_path: Directory holding the request files.

    Returns:
        The matching path, or ``""`` when there is none. When several files
        match, the lexicographically smallest path is returned.
    """
    if precision is None:
        return ""
    wanted_precision = precision.split(".")[-1]

    # Escape the fixed prefix so only the trailing "*" acts as a wildcard.
    pattern = glob.escape(os.path.join(requests_path, model_name)) + "_eval_request_*.json"

    # Select correct request file (precision). Scanning in ascending order and
    # stopping at the first match selects the same file as scanning in
    # descending order and keeping the last match.
    for candidate in sorted(glob.glob(pattern)):
        with open(candidate, "r", encoding="utf-8") as f:
            req_content = json.load(f)
        if (
            req_content.get("status") in ("FINISHED", "PENDING_NEW_EVAL")
            and req_content.get("precision") == wanted_precision
        ):
            return candidate
    return ""
157
+
158
+
159
def get_eval_results(results_path: str) -> List[dict]:
    """Collect every result file under *results_path* into leaderboard rows.

    Result files sharing the same ``eval_name`` are merged into a single
    entry. Entries that cannot be turned into a row because some benchmark
    score is missing are dropped.

    Returns:
        One dict per complete entry, as produced by ``EvalResult.to_dict``.
    """
    json_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        if not files or any(not f.endswith(".json") for f in files):
            continue

        # Sort the files by date. The key is the file name without the
        # "results_" prefix, the ".json" suffix and its last 7 characters
        # (presumably a fractional-seconds suffix -- confirm against the
        # naming scheme), so files are merged oldest first.
        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])

        json_filepaths.extend(os.path.join(root, file) for file in files)

    eval_results = {}
    for json_filepath in json_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(json_filepath)
        eval_result.update_with_request_file()

        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
        else:
            eval_results[eval_name] = eval_result

    results = []
    for v in eval_results.values():
        try:
            results.append(v.to_dict())
        except KeyError:  # not all eval values present
            continue

    return results
src/plots/read_results.py DELETED
@@ -1,158 +0,0 @@
1
- import json
2
- import os
3
- from dataclasses import dataclass
4
- from typing import Dict, List, Tuple
5
-
6
- import dateutil
7
- import numpy as np
8
-
9
- from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
10
-
11
- METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
12
- BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "winogrande", "gsm8k", "drop"]
13
- BENCH_TO_NAME = {
14
- "arc:challenge": AutoEvalColumn.arc.name,
15
- "hellaswag": AutoEvalColumn.hellaswag.name,
16
- "hendrycksTest": AutoEvalColumn.mmlu.name,
17
- "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
18
- "winogrande": AutoEvalColumn.winogrande.name,
19
- "gsm8k": AutoEvalColumn.gsm8k.name,
20
- "drop": AutoEvalColumn.drop.name,
21
- }
22
-
23
-
24
@dataclass
class EvalResult:
    """Scores and metadata for one evaluated model at one precision."""

    eval_name: str
    org: str
    model: str
    revision: str
    results: dict  # benchmark name -> score
    precision: str = ""
    model_type: str = ""
    weight_type: str = "Original"
    date: str = ""

    def to_dict(self):
        """Flatten this result into a leaderboard row.

        Benchmarks without a score are reported as ``None`` in the row.
        ``self.results`` itself is left untouched, so the method can be
        called repeatedly.
        """
        from src.filters import is_model_on_hub

        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"

        # Missing benchmarks lower the average: the divisor is always the full benchmark count.
        scores = [v for v in self.results.values() if v is not None]
        average = sum(scores) / len(BENCHMARKS)

        data_dict = {}
        data_dict["eval_name"] = self.eval_name  # not a column, just a save name
        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
        data_dict[AutoEvalColumn.precision.name] = self.precision
        data_dict[AutoEvalColumn.model_type.name] = self.model_type
        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[AutoEvalColumn.dummy.name] = base_model
        data_dict[AutoEvalColumn.revision.name] = self.revision
        data_dict[AutoEvalColumn.average.name] = average
        # The "baseline" pseudo-model is checked first so it never triggers a hub lookup.
        data_dict[AutoEvalColumn.still_on_hub.name] = (
            base_model == "baseline" or is_model_on_hub(base_model, self.revision)[0]
        )

        for benchmark, column_name in BENCH_TO_NAME.items():
            data_dict[column_name] = self.results.get(benchmark)

        return data_dict
65
-
66
-
67
def parse_eval_result(json_filepath: str) -> Tuple[str, List["EvalResult"]]:
    """Parse one results file into per-benchmark ``EvalResult`` objects.

    Returns:
        ``(result_key, eval_results)`` where ``result_key`` is
        ``"<org>_<model>_<precision>"`` (or ``"<model>_<precision>"`` without
        an org) and ``eval_results`` holds one ``EvalResult`` per benchmark
        for which every matching entry has the expected metric. Files whose
        MMLU version is 0 yield ``(None, [])``.
    """
    with open(json_filepath, encoding="utf-8") as fp:
        data = json.load(fp)

    for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
        if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
            return None, []  # we skip models with the wrong version

    # The configuration is stored under "config" or, failing that, "config_general".
    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]

    model = config.get("model_name", None)
    if model is None:
        model = config.get("model_args", None)

    model_sha = config.get("model_sha", "")
    model_split = model.split("/", 1)

    precision = config.get("model_dtype")
    if precision == "None":
        precision = "GPTQ"

    if len(model_split) == 1:
        org = None
        model = model_split[0]
        result_key = f"{model}_{precision}"
    else:
        org = model_split[0]
        model = model_split[1]
        result_key = f"{org}_{model}_{precision}"

    eval_results = []
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        # Average the metric over every results entry whose key contains the benchmark name.
        accs = [v.get(metric, None) for k, v in data["results"].items() if benchmark in k]
        if not accs or any(acc is None for acc in accs):
            continue
        mean_acc = np.mean(accs) * 100.0
        eval_results.append(
            EvalResult(
                eval_name=result_key,
                org=org,
                model=model,
                revision=model_sha,
                results={benchmark: mean_acc},
                precision=precision,  # todo model_type=, weight_type=
                date=config.get("submission_date"),
            )
        )

    return result_key, eval_results
120
-
121
-
122
def get_eval_results(results_path: str) -> List["EvalResult"]:
    """Collect every result file under *results_path* into ``EvalResult`` objects.

    Per-benchmark results sharing the same result key are merged into the
    first ``EvalResult`` seen for that key.
    """
    json_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        if not files or any(not f.endswith(".json") for f in files):
            continue

        # Sort the files by date. The key is the file name without the
        # "results_" prefix, the ".json" suffix and its last 7 characters
        # (presumably a fractional-seconds suffix -- confirm against the
        # naming scheme), so files are merged oldest first.
        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])

        json_filepaths.extend(os.path.join(root, file) for file in files)

    eval_results = {}
    for json_filepath in json_filepaths:
        result_key, results = parse_eval_result(json_filepath)
        for eval_result in results:
            if result_key in eval_results:
                eval_results[result_key].results.update(eval_result.results)
            else:
                eval_results[result_key] = eval_result

    return list(eval_results.values())
153
-
154
-
155
def get_eval_results_dicts(results_path: str) -> List[Dict]:
    """Return the results found under *results_path* as a list of row dicts."""
    rows = []
    for eval_result in get_eval_results(results_path):
        rows.append(eval_result.to_dict())
    return rows
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{load_from_hub.py → populate.py} RENAMED
@@ -1,50 +1,18 @@
1
  import json
2
  import os
3
- from collections import defaultdict
4
 
5
  import pandas as pd
6
 
7
- from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
8
- from src.get_model_info.apply_metadata_to_df import apply_metadata
9
- from src.plots.read_results import get_eval_results_dicts, make_clickable_model
10
- from src.get_model_info.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
11
-
12
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
13
-
14
-
15
def get_all_requested_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
    """Index the request files stored one directory below *requested_models_dir*.

    Only ``.json`` files located exactly at ``<requested_models_dir>/<dir>/``
    are read; files at the top level or deeper are ignored.

    Returns:
        A pair ``(submissions, users_to_submission_dates)``:

        * ``submissions`` -- set of ``"<model>_<revision>_<precision>"`` keys;
        * ``users_to_submission_dates`` -- ``defaultdict(list)`` mapping the
          organisation (text before the first ``/`` of the model id) to the
          ``submitted_time`` values of its requests. Requests without an
          organisation or without ``submitted_time`` are not counted here.
    """
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        # Depth is derived from the relative path so a trailing separator on
        # the input directory cannot shift it.
        relative_root = os.path.relpath(root, requested_models_dir)
        if relative_root == os.curdir or os.sep in relative_root:
            continue

        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")

            # Select organisation
            if "/" not in info["model"] or "submitted_time" not in info:
                continue
            organisation = info["model"].split("/", 1)[0]
            users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates
37
 
38
 
39
  def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
40
- all_data = get_eval_results_dicts(results_path)
41
-
42
- if not IS_PUBLIC:
43
- all_data.append(gpt4_values)
44
- all_data.append(gpt35_values)
45
-
46
- all_data.append(baseline)
47
- apply_metadata(all_data) # Populate model type based on known hardcoded values in `metadata.py`
48
 
49
  df = pd.DataFrame.from_records(all_data)
50
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
@@ -88,4 +56,3 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
88
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
89
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
90
  return df_finished[cols], df_running[cols], df_pending[cols]
91
-
 
1
  import json
2
  import os
 
3
 
4
  import pandas as pd
5
 
6
+ from src.leaderboard.filter_models import filter_models
7
+ from src.leaderboard.read_evals import get_eval_results
8
+ from src.display.formatting import make_clickable_model, has_no_nan_values
9
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
+ all_data = get_eval_results(results_path)
14
+ all_data.append(baseline_row)
15
+ filter_models(all_data)
 
 
 
 
 
16
 
17
  df = pd.DataFrame.from_records(all_data)
18
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
56
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
  return df_finished[cols], df_running[cols], df_pending[cols]
 
src/{filters.py → submission/check_validity.py} RENAMED
@@ -1,5 +1,9 @@
1
  import huggingface_hub
2
  import os
 
 
 
 
3
  from huggingface_hub import ModelCard
4
  from transformers import AutoConfig
5
 
@@ -30,9 +34,9 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
30
  return True, ""
31
 
32
 
33
- def is_model_on_hub(model_name: str, revision: str, token: str = None) -> bool:
34
  try:
35
- AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False, token=token)
36
  return True, None
37
 
38
  except ValueError:
@@ -45,6 +49,23 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None) -> bool:
45
  return False, "was not found on hub!"
46
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period, rate_limit_quota):
49
  org_or_user, _ = submission_name.split("/")
50
  if org_or_user not in users_to_submission_dates:
@@ -65,3 +86,26 @@ def user_submission_permission(submission_name, users_to_submission_dates, rate_
65
  return False, error_msg
66
  return True, ""
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import huggingface_hub
2
  import os
3
+ import json
4
+ import re
5
+ from collections import defaultdict
6
+ from huggingface_hub.hf_api import ModelInfo
7
  from huggingface_hub import ModelCard
8
  from transformers import AutoConfig
9
 
 
34
  return True, ""
35
 
36
 
37
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
38
  try:
39
+ AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
40
  return True, None
41
 
42
  except ValueError:
 
49
  return False, "was not found on hub!"
50
 
51
 
52
def get_model_size(model_info: "ModelInfo", precision: str):
    """Estimate a model's size in billions of parameters.

    The size is read from ``model_info.safetensors["total"]`` when available.
    Otherwise it is parsed from a ``<number>b`` / ``<number>m`` tag in the
    lower-cased ``model_info.modelId`` (``m`` values are divided by 1000).
    The result is multiplied by 8 when *precision* is ``"GPTQ"`` or the model
    id contains ``"gptq"``.

    Returns:
        The size rounded to 3 decimals before the GPTQ factor is applied, or
        ``0`` when it cannot be determined.
    """
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        # No usable safetensors metadata: the attribute is missing, is None or
        # not subscriptable, or has no "total" entry. Fall back to the name.
        model_id = getattr(model_info, "modelId", None)
        size_match = None
        if model_id is not None:
            # Whole number with an optional fractional part, e.g. "7b", "10.7b", "350m".
            size_match = re.search(r"\d+(\.\d+)?(b|m)", model_id.lower())
        if size_match is None:
            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

        size_tag = size_match.group(0)
        value = float(size_tag[:-1])
        model_size = round(value if size_tag[-1] == "b" else value / 1e3, 3)

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    return size_factor * model_size
67
+
68
+
69
  def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period, rate_limit_quota):
70
  org_or_user, _ = submission_name.split("/")
71
  if org_or_user not in users_to_submission_dates:
 
86
  return False, error_msg
87
  return True, ""
88
 
89
+
90
def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
    """Index the request files stored one directory below *requested_models_dir*.

    Only ``.json`` files located exactly at ``<requested_models_dir>/<dir>/``
    are read; files at the top level or deeper are ignored.

    Returns:
        A pair ``(submissions, users_to_submission_dates)``:

        * ``submissions`` -- set of ``"<model>_<revision>_<precision>"`` keys;
        * ``users_to_submission_dates`` -- ``defaultdict(list)`` mapping the
          organisation (text before the first ``/`` of the model id) to the
          ``submitted_time`` values of its requests. Requests without an
          organisation or without ``submitted_time`` are not counted here.
    """
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        # Depth is derived from the relative path so a trailing separator on
        # the input directory cannot shift it.
        relative_root = os.path.relpath(root, requested_models_dir)
        if relative_root == os.curdir or os.sep in relative_root:
            continue

        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)
            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")

            # Select organisation
            if "/" not in info["model"] or "submitted_time" not in info:
                continue
            organisation = info["model"].split("/", 1)[0]
            users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates
src/submission/submit.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_warning, styled_message
6
+ from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
7
+ from src.submission.check_validity import (
8
+ user_submission_permission,
9
+ is_model_on_hub,
10
+ get_model_size,
11
+ check_model_card,
12
+ already_submitted_models,
13
+ )
14
+ from src.envs import RATE_LIMIT_QUOTA, RATE_LIMIT_PERIOD, H4_TOKEN, EVAL_REQUESTS_PATH, API, QUEUE_REPO
15
+
16
+ requested_models, users_to_submission_dates = already_submitted_models(EVAL_REQUESTS_PATH)
17
+
18
+
19
def add_new_eval(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    private: bool,
    weight_type: str,
    model_type: str,
):
    """Validate a submission and push its request file to the evaluation queue.

    The checks run in order: model type selected, submitter not rate limited,
    model not on the do-not-submit list, model (and base model for
    Delta/Adapter weights) loadable from the hub, model info and license
    available, model card valid, and submission not already requested. The
    first failing check ends the call.

    Args:
        model: Model repository id.
        base_model: Base model repository id, checked for Delta/Adapter weights.
        revision: Model revision; an empty string means ``"main"``.
        precision: Precision label; only the part before the first space is kept.
        private: Stored in the request and used in the request file name.
        weight_type: ``"Delta"`` and ``"Adapter"`` get specific handling; any
            other value is treated as full weights.
        model_type: Model category label; must be non-empty.

    Returns:
        The value of ``styled_error`` / ``styled_warning`` describing the
        first failed check, or of ``styled_message`` on success.
    """
    precision = precision.split(" ")[0]
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")

    # Is the user rate limited?
    user_can_submit, error_msg = user_submission_permission(
        model, users_to_submission_dates, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
    )
    if not user_can_submit:
        return styled_error(error_msg)

    # Did the model authors forbid its submission to the leaderboard?
    if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    # Is the model on the hub?
    if weight_type in ["Delta", "Adapter"]:
        base_model_on_hub, error = is_model_on_hub(base_model, revision, H4_TOKEN)
        if not base_model_on_hub:
            return styled_error(f'Base model "{base_model}" {error}')

    if weight_type != "Adapter":
        # NOTE(review): unlike the base-model check, no token is passed here -- confirm this is intended.
        model_on_hub, error = is_model_on_hub(model, revision)
        if not model_on_hub:
            return styled_error(f'Model "{model}" {error}')

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    model_size = get_model_size(model_info=model_info, precision=precision)

    # Were the model card and license filled?
    try:
        model_license = model_info.cardData["license"]
    except Exception:
        return styled_error("Please select a license for your model")

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        return styled_error(error_msg)

    # Check for duplicate submission before touching the filesystem
    submission_key = f"{model}_{revision}_{precision}"
    if submission_key in requested_models:
        return styled_warning("This model has been already submitted.")

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model": model,
        "base_model": base_model,
        "revision": revision,
        "private": private,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": model_license,
    }

    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    file_name = f"{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
    out_path = f"{OUT_DIR}/{file_name}"

    with open(out_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    try:
        API.upload_file(
            path_or_fileobj=out_path,
            # Path relative to the queue root, built from its parts rather
            # than by splitting on a hard-coded directory name.
            path_in_repo=f"{user_name}/{file_name}",
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    finally:
        # Remove the local file, whether or not the upload succeeded
        os.remove(out_path)

    # Record the submission in the in-memory indexes so the duplicate and
    # rate-limit checks see it without waiting for them to be rebuilt.
    requested_models.add(submission_key)
    if user_name:
        users_to_submission_dates[user_name].append(current_time)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )
src/{manage_collections.py → tools/collections.py} RENAMED
@@ -4,33 +4,34 @@ from pandas import DataFrame
4
  from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
5
  from huggingface_hub.utils._errors import HfHubHTTPError
6
 
7
- from src.get_model_info.hardocded_metadata.types import ModelType
8
- from src.get_model_info.utils import AutoEvalColumn
9
 
10
- H4_TOKEN = os.environ.get("H4_TOKEN", None)
11
 
12
- path_to_collection = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
13
  intervals = {
14
  "1B": pd.Interval(0, 1.5, closed="right"),
15
  "3B": pd.Interval(2.5, 3.5, closed="neither"),
16
  "7B": pd.Interval(6, 8, closed="neither"),
17
  "13B": pd.Interval(10, 14, closed="neither"),
18
- "30B":pd.Interval(25, 35, closed="neither"),
19
  "65B": pd.Interval(60, 70, closed="neither"),
20
  }
21
 
 
22
  def update_collections(df: DataFrame):
23
- """This function updates the Open LLM Leaderboard model collection with the latest best models for
24
  each size category and type.
25
  """
26
- collection = get_collection(collection_slug=path_to_collection, token=H4_TOKEN)
27
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
28
 
29
  cur_best_models = []
30
 
31
  ix = 0
32
  for type in ModelType:
33
- if type.value.name == "": continue
 
34
  for size in intervals:
35
  # We filter the df to gather the relevant models
36
  type_emoji = [t[0] for t in type.value.symbol]
@@ -40,7 +41,9 @@ def update_collections(df: DataFrame):
40
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
41
  filtered_df = filtered_df.loc[mask]
42
 
43
- best_models = list(filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name])
 
 
44
  print(type.value.symbol, size, best_models[:10])
45
 
46
  # We add them one by one to the leaderboard
@@ -49,27 +52,32 @@ def update_collections(df: DataFrame):
49
  cur_len_collection = len(collection.items)
50
  try:
51
  collection = add_collection_item(
52
- path_to_collection,
53
- item_id=model,
54
- item_type="model",
55
  exists_ok=True,
56
- note=f"Best {type.to_str(' ')} model of around {size} on the leaderboard today!",
57
- token=H4_TOKEN
58
  )
59
- if len(collection.items) > cur_len_collection: # we added an item - we make sure its position is correct
60
- item_object_id = collection.items[-1].item_object_id
61
- update_collection_item(collection_slug=path_to_collection, item_object_id=item_object_id, position=ix)
 
 
 
 
62
  cur_len_collection = len(collection.items)
63
  cur_best_models.append(model)
64
  break
65
  except HfHubHTTPError:
66
  continue
67
 
68
- collection = get_collection(path_to_collection, token=H4_TOKEN)
69
  for item in collection.items:
70
  if item.item_id not in cur_best_models:
71
  try:
72
- delete_collection_item(collection_slug=path_to_collection, item_object_id=item.item_object_id, token=H4_TOKEN)
 
 
73
  except HfHubHTTPError:
74
  continue
75
-
 
4
  from huggingface_hub import get_collection, add_collection_item, update_collection_item, delete_collection_item
5
  from huggingface_hub.utils._errors import HfHubHTTPError
6
 
7
+ from src.display.utils import AutoEvalColumn, ModelType
 
8
 
9
+ from src.envs import H4_TOKEN, PATH_TO_COLLECTION
10
 
11
+ # Specific intervals for the collections
12
  intervals = {
13
  "1B": pd.Interval(0, 1.5, closed="right"),
14
  "3B": pd.Interval(2.5, 3.5, closed="neither"),
15
  "7B": pd.Interval(6, 8, closed="neither"),
16
  "13B": pd.Interval(10, 14, closed="neither"),
17
+ "30B": pd.Interval(25, 35, closed="neither"),
18
  "65B": pd.Interval(60, 70, closed="neither"),
19
  }
20
 
21
+
22
  def update_collections(df: DataFrame):
23
+ """This function updates the Open LLM Leaderboard model collection with the latest best models for
24
  each size category and type.
25
  """
26
+ collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
27
  params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
28
 
29
  cur_best_models = []
30
 
31
  ix = 0
32
  for type in ModelType:
33
+ if type.value.name == "":
34
+ continue
35
  for size in intervals:
36
  # We filter the df to gather the relevant models
37
  type_emoji = [t[0] for t in type.value.symbol]
 
41
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
42
  filtered_df = filtered_df.loc[mask]
43
 
44
+ best_models = list(
45
+ filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.dummy.name]
46
+ )
47
  print(type.value.symbol, size, best_models[:10])
48
 
49
  # We add them one by one to the leaderboard
 
52
  cur_len_collection = len(collection.items)
53
  try:
54
  collection = add_collection_item(
55
+ PATH_TO_COLLECTION,
56
+ item_id=model,
57
+ item_type="model",
58
  exists_ok=True,
59
+ note=f"Best {type.to_str(' ')} model of around {size} on the leaderboard today!",
60
+ token=H4_TOKEN,
61
  )
62
+ if (
63
+ len(collection.items) > cur_len_collection
64
+ ): # we added an item - we make sure its position is correct
65
+ item_object_id = collection.items[-1].item_object_id
66
+ update_collection_item(
67
+ collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix
68
+ )
69
  cur_len_collection = len(collection.items)
70
  cur_best_models.append(model)
71
  break
72
  except HfHubHTTPError:
73
  continue
74
 
75
+ collection = get_collection(PATH_TO_COLLECTION, token=H4_TOKEN)
76
  for item in collection.items:
77
  if item.item_id not in cur_best_models:
78
  try:
79
+ delete_collection_item(
80
+ collection_slug=PATH_TO_COLLECTION, item_object_id=item.item_object_id, token=H4_TOKEN
81
+ )
82
  except HfHubHTTPError:
83
  continue
 
models_backlinks.py → src/tools/model_backlinks.py RENAMED
File without changes
src/{plots/plot_results.py → tools/plots.py} RENAMED
@@ -4,7 +4,7 @@ from plotly.graph_objs import Figure
4
  import pickle
5
  from datetime import datetime, timezone
6
  from typing import List, Dict, Tuple, Any
7
- from src.get_model_info.hardocded_metadata.flags import FLAGGED_MODELS
8
 
9
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
10
  # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
@@ -220,4 +220,4 @@ def create_metric_plot_obj(
220
 
221
  # Example Usage:
222
  # human_baselines dictionary is defined.
223
- # chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")
 
4
  import pickle
5
  from datetime import datetime, timezone
6
  from typing import List, Dict, Tuple, Any
7
+ from src.leaderboard.filter_models import FLAGGED_MODELS
8
 
9
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
10
  # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
 
220
 
221
  # Example Usage:
222
  # human_baselines dictionary is defined.
223
+ # chart = create_metric_plot_obj(scores_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], human_baselines, "Graph Title")