Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -18,7 +18,7 @@ def make_default_md(arena_df, elo_results):
|
|
18 |
|
19 |
leaderboard_md = f"""
|
20 |
# NeurIPS LLM Merging Competition Leaderboard
|
21 |
-
[Website]() | [
|
22 |
|
23 |
"""
|
24 |
return leaderboard_md
|
@@ -223,7 +223,6 @@ def recompute_final_ranking(arena_df):
|
|
223 |
|
224 |
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
225 |
arena_df = arena_df.sort_values(by=["final_ranking", "rating"], ascending=[True, False])
|
226 |
-
arena_df = arena_df[arena_df["num_battles"] > 2000]
|
227 |
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
|
228 |
arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
|
229 |
|
@@ -234,7 +233,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
|
234 |
arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
|
235 |
arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
|
236 |
# arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
|
237 |
-
# arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 500]
|
238 |
arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
|
239 |
# keep only the models in the subset in arena_df and recompute final_ranking
|
240 |
arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
|
@@ -248,10 +246,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
|
248 |
arena_df = arena_subset_df.join(arena_df["final_ranking"], rsuffix="_global", how="inner")
|
249 |
arena_df["ranking_difference"] = arena_df["final_ranking_global"] - arena_df["final_ranking"]
|
250 |
|
251 |
-
# no tie version
|
252 |
-
# arena_df = arena_subset_df.join(arena_df["final_ranking_no_tie"], rsuffix="_global", how="inner")
|
253 |
-
# arena_df["ranking_difference"] = arena_df["final_ranking_no_tie_global"] - arena_df["final_ranking_no_tie"]
|
254 |
-
|
255 |
arena_df = arena_df.sort_values(by=["final_ranking", "rating"], ascending=[True, False])
|
256 |
arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
|
257 |
|
@@ -272,15 +266,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
|
272 |
row.append(model_name)
|
273 |
# elo rating
|
274 |
row.append(round(arena_df.iloc[i]["rating"]))
|
275 |
-
upper_diff = round(
|
276 |
-
arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
|
277 |
-
)
|
278 |
-
lower_diff = round(
|
279 |
-
arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
|
280 |
-
)
|
281 |
-
row.append(f"+{upper_diff}/-{lower_diff}")
|
282 |
-
# num battles
|
283 |
-
row.append(round(arena_df.iloc[i]["num_battles"]))
|
284 |
# Organization
|
285 |
row.append(
|
286 |
model_table_df[model_table_df["key"] == model_key]["Organization"].values[0]
|
@@ -289,11 +274,6 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
|
289 |
row.append(
|
290 |
model_table_df[model_table_df["key"] == model_key]["License"].values[0]
|
291 |
)
|
292 |
-
cutoff_date = model_table_df[model_table_df["key"] == model_key]["Knowledge cutoff date"].values[0]
|
293 |
-
if cutoff_date == "-":
|
294 |
-
row.append("Unknown")
|
295 |
-
else:
|
296 |
-
row.append(cutoff_date)
|
297 |
values.append(row)
|
298 |
except Exception as e:
|
299 |
print(f"{model_key} - {e}")
|
@@ -301,23 +281,9 @@ def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
|
301 |
|
302 |
key_to_category_name = {
|
303 |
"full": "Overall",
|
304 |
-
"coding": "Coding",
|
305 |
-
"long_user": "Longer Query",
|
306 |
-
"english": "English",
|
307 |
-
"chinese": "Chinese",
|
308 |
-
"french": "French",
|
309 |
-
"no_tie": "Exclude Ties",
|
310 |
-
"no_short": "Exclude Short",
|
311 |
}
|
312 |
cat_name_to_explanation = {
|
313 |
"Overall": "Overall Questions",
|
314 |
-
"Coding": "Coding: whether conversation contains code snippets",
|
315 |
-
"Longer Query": "Longer Query (>= 500 tokens)",
|
316 |
-
"English": "English Prompts",
|
317 |
-
"Chinese": "Chinese Prompts",
|
318 |
-
"French": "French Prompts",
|
319 |
-
"Exclude Ties": "Exclude Ties and Bothbad",
|
320 |
-
"Exclude Short": "User Query >= 5 tokens",
|
321 |
}
|
322 |
|
323 |
def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
|
@@ -364,7 +330,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
364 |
"⭐ Arena Elo",
|
365 |
"Organization",
|
366 |
"License",
|
367 |
-
"Knowledge Cutoff",
|
368 |
],
|
369 |
datatype=[
|
370 |
"number",
|
@@ -372,7 +337,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
372 |
"number",
|
373 |
"str",
|
374 |
"str",
|
375 |
-
"str",
|
376 |
],
|
377 |
value=arena_table_vals,
|
378 |
elem_id="arena_leaderboard_dataframe",
|
@@ -419,7 +383,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
419 |
pass
|
420 |
|
421 |
def update_leaderboard_df(arena_table_vals):
|
422 |
-
elo_datarame = pd.DataFrame(arena_table_vals, columns=[ "Rank", "🤖 Model", "⭐ Arena Elo", "Organization", "License"
|
423 |
|
424 |
# goal: color the rows based on the rank with styler
|
425 |
def highlight_max(s):
|
@@ -446,7 +410,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
446 |
"⭐ Arena Elo",
|
447 |
"Organization",
|
448 |
"License",
|
449 |
-
"Knowledge Cutoff",
|
450 |
],
|
451 |
datatype=[
|
452 |
"number",
|
@@ -454,7 +417,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
454 |
"number",
|
455 |
"str",
|
456 |
"str",
|
457 |
-
"str",
|
458 |
],
|
459 |
value=arena_values,
|
460 |
elem_id="arena_leaderboard_dataframe",
|
@@ -470,7 +432,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
470 |
"⭐ Arena Elo",
|
471 |
"Organization",
|
472 |
"License",
|
473 |
-
"Knowledge Cutoff",
|
474 |
],
|
475 |
datatype=[
|
476 |
"number",
|
@@ -478,7 +439,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
|
|
478 |
"number",
|
479 |
"str",
|
480 |
"str",
|
481 |
-
"str",
|
482 |
],
|
483 |
value=arena_values,
|
484 |
elem_id="arena_leaderboard_dataframe",
|
|
|
18 |
|
19 |
leaderboard_md = f"""
|
20 |
# NeurIPS LLM Merging Competition Leaderboard
|
21 |
+
[Website](https://llm-merging.github.io/index) | [Starter Kit (Github)]() | [Discord](https://discord.com/invite/dPBHEVnV) |
|
22 |
|
23 |
"""
|
24 |
return leaderboard_md
|
|
|
223 |
|
224 |
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
|
225 |
arena_df = arena_df.sort_values(by=["final_ranking", "rating"], ascending=[True, False])
|
|
|
226 |
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
|
227 |
arena_df = arena_df.sort_values(by=["final_ranking"], ascending=True)
|
228 |
|
|
|
233 |
arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
|
234 |
arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
|
235 |
# arena_subset_df = arena_subset_df.sort_values(by=["final_ranking"], ascending=True)
|
|
|
236 |
arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
|
237 |
# keep only the models in the subset in arena_df and recompute final_ranking
|
238 |
arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
|
|
|
246 |
arena_df = arena_subset_df.join(arena_df["final_ranking"], rsuffix="_global", how="inner")
|
247 |
arena_df["ranking_difference"] = arena_df["final_ranking_global"] - arena_df["final_ranking"]
|
248 |
|
|
|
|
|
|
|
|
|
249 |
arena_df = arena_df.sort_values(by=["final_ranking", "rating"], ascending=[True, False])
|
250 |
arena_df["final_ranking"] = arena_df.apply(lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]), axis=1)
|
251 |
|
|
|
266 |
row.append(model_name)
|
267 |
# elo rating
|
268 |
row.append(round(arena_df.iloc[i]["rating"]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
# Organization
|
270 |
row.append(
|
271 |
model_table_df[model_table_df["key"] == model_key]["Organization"].values[0]
|
|
|
274 |
row.append(
|
275 |
model_table_df[model_table_df["key"] == model_key]["License"].values[0]
|
276 |
)
|
|
|
|
|
|
|
|
|
|
|
277 |
values.append(row)
|
278 |
except Exception as e:
|
279 |
print(f"{model_key} - {e}")
|
|
|
281 |
|
282 |
key_to_category_name = {
|
283 |
"full": "Overall",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
284 |
}
|
285 |
cat_name_to_explanation = {
|
286 |
"Overall": "Overall Questions",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
}
|
288 |
|
289 |
def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False):
|
|
|
330 |
"⭐ Arena Elo",
|
331 |
"Organization",
|
332 |
"License",
|
|
|
333 |
],
|
334 |
datatype=[
|
335 |
"number",
|
|
|
337 |
"number",
|
338 |
"str",
|
339 |
"str",
|
|
|
340 |
],
|
341 |
value=arena_table_vals,
|
342 |
elem_id="arena_leaderboard_dataframe",
|
|
|
383 |
pass
|
384 |
|
385 |
def update_leaderboard_df(arena_table_vals):
|
386 |
+
elo_datarame = pd.DataFrame(arena_table_vals, columns=[ "Rank", "🤖 Model", "⭐ Arena Elo", "Organization", "License"])
|
387 |
|
388 |
# goal: color the rows based on the rank with styler
|
389 |
def highlight_max(s):
|
|
|
410 |
"⭐ Arena Elo",
|
411 |
"Organization",
|
412 |
"License",
|
|
|
413 |
],
|
414 |
datatype=[
|
415 |
"number",
|
|
|
417 |
"number",
|
418 |
"str",
|
419 |
"str",
|
|
|
420 |
],
|
421 |
value=arena_values,
|
422 |
elem_id="arena_leaderboard_dataframe",
|
|
|
432 |
"⭐ Arena Elo",
|
433 |
"Organization",
|
434 |
"License",
|
|
|
435 |
],
|
436 |
datatype=[
|
437 |
"number",
|
|
|
439 |
"number",
|
440 |
"str",
|
441 |
"str",
|
|
|
442 |
],
|
443 |
value=arena_values,
|
444 |
elem_id="arena_leaderboard_dataframe",
|