Spaces:
Runtime error
Runtime error
Rank the leaderboard by mean task rank (MTR) instead of the average across datasets. Add an 'export to json' button to make it easier for people to access the underlying leaderboard data.
Browse files
app.py
CHANGED
@@ -160,8 +160,22 @@ def add_rank(df):
|
|
160 |
if len(cols_to_rank) == 1:
|
161 |
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
|
162 |
else:
|
163 |
-
|
164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
df.insert(0, "Rank", list(range(1, len(df) + 1)))
|
166 |
df = df.round(2)
|
167 |
# Fill NaN after averaging
|
@@ -295,11 +309,33 @@ def get_mteb_average(task_dict: dict, refresh=True):
|
|
295 |
)
|
296 |
# Debugging:
|
297 |
# DATA_OVERALL.to_csv("overall.csv")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
|
299 |
-
DATA_OVERALL.insert(
|
300 |
for i, (task_category, task_category_list) in enumerate(task_dict.items()):
|
301 |
-
DATA_OVERALL.insert(
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
# Start ranking from 1
|
304 |
DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
|
305 |
|
@@ -307,14 +343,28 @@ def get_mteb_average(task_dict: dict, refresh=True):
|
|
307 |
|
308 |
DATA_TASKS = {}
|
309 |
for task_category, task_category_list in task_dict.items():
|
310 |
-
DATA_TASKS[task_category] = add_rank(
|
|
|
|
|
311 |
DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)]
|
312 |
|
313 |
# Fill NaN after averaging
|
314 |
DATA_OVERALL.fillna("", inplace=True)
|
315 |
|
316 |
-
data_overall_rows = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
317 |
for task_category, task_category_list in task_dict.items():
|
|
|
|
|
|
|
318 |
data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)")
|
319 |
|
320 |
DATA_OVERALL = DATA_OVERALL[data_overall_rows]
|
@@ -341,6 +391,30 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
341 |
boards_data[board]["data_tasks"][task_category] = data_task_category
|
342 |
all_data_tasks.append(data_task_category)
|
343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
344 |
# Exact, add all non-nan integer values for every dataset
|
345 |
NUM_SCORES = 0
|
346 |
DATASETS = []
|
@@ -392,12 +466,12 @@ Each inner tab can have the following keys:
|
|
392 |
- refresh: The function to refresh the leaderboard
|
393 |
"""
|
394 |
|
395 |
-
def get_refresh_function(task_category, task_list):
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
|
402 |
data = {
|
403 |
"Overall": {"metric": "Various, refer to task tabs", "data": []}
|
@@ -425,7 +499,7 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
425 |
"language_long": board_config["language_long"],
|
426 |
"description": f"**Overall MTEB {overall_pretty_name}** 🔮{board_icon}",
|
427 |
"data": boards_data[board]["data_overall"],
|
428 |
-
"refresh": lambda: get_mteb_average(board_config["tasks"])[0],#partial(get_mteb_average, board_config["tasks"]),
|
429 |
"credits": credits,
|
430 |
})
|
431 |
for task_category, task_category_list in board_config["tasks"].items():
|
@@ -437,7 +511,7 @@ for board, board_config in BOARDS_CONFIG.items():
|
|
437 |
"language_long": board_config["language_long"],
|
438 |
"description": f"**{task_category} {board_pretty_name}** {task_icon}{board_icon}",
|
439 |
"data": boards_data[board]["data_tasks"][task_category],
|
440 |
-
"refresh": get_refresh_function(task_category, task_category_list),
|
441 |
"credits": credits,
|
442 |
})
|
443 |
|
@@ -567,6 +641,10 @@ with gr.Blocks(css=css) as block:
|
|
567 |
elem_classes=["filter-checkbox-group"],
|
568 |
scale=2,
|
569 |
)
|
|
|
|
|
|
|
|
|
570 |
|
571 |
with gr.Tabs() as outer_tabs:
|
572 |
# Store the tabs for updating them on load based on URL parameters
|
@@ -611,9 +689,9 @@ with gr.Blocks(css=css) as block:
|
|
611 |
full_dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", visible=False)
|
612 |
full_dataframes.append(full_dataframe)
|
613 |
|
614 |
-
with gr.Row():
|
615 |
-
|
616 |
-
|
617 |
|
618 |
gr.Markdown(f"""
|
619 |
- **Total Datasets**: {NUM_DATASETS}
|
|
|
160 |
if len(cols_to_rank) == 1:
|
161 |
df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
|
162 |
else:
|
163 |
+
ranking = df[cols_to_rank].rank(
|
164 |
+
ascending=False,
|
165 |
+
axis=0,
|
166 |
+
method="average",
|
167 |
+
na_option="keep",
|
168 |
+
pct=True
|
169 |
+
)
|
170 |
+
mtr = ranking.mean(axis=1, skipna=False)
|
171 |
+
insert_at = len(df.columns) - len(cols_to_rank)
|
172 |
+
df.insert(insert_at, "MTR%", mtr)
|
173 |
+
df.insert(
|
174 |
+
insert_at+1,
|
175 |
+
"Average",
|
176 |
+
df[cols_to_rank].mean(axis=1, skipna=False)
|
177 |
+
)
|
178 |
+
df.sort_values("MTR%", ascending=True, inplace=True)
|
179 |
df.insert(0, "Rank", list(range(1, len(df) + 1)))
|
180 |
df = df.round(2)
|
181 |
# Fill NaN after averaging
|
|
|
309 |
)
|
310 |
# Debugging:
|
311 |
# DATA_OVERALL.to_csv("overall.csv")
|
312 |
+
|
313 |
+
#
|
314 |
+
# Compute overall MTR
|
315 |
+
#
|
316 |
+
mtr_column = f"MTR% ({len(all_tasks)} datasets)"
|
317 |
+
task_ranks = DATA_OVERALL[all_tasks].rank(
|
318 |
+
ascending=False, axis=0, method="average", na_option="keep", pct=True
|
319 |
+
)
|
320 |
+
mean_task_rank = task_ranks.mean(axis=1, skipna=False)
|
321 |
+
DATA_OVERALL.insert(1, mtr_column, mean_task_rank)
|
322 |
|
323 |
+
DATA_OVERALL.insert(2, f"Average ({len(all_tasks)} datasets)", DATA_OVERALL[all_tasks].mean(axis=1, skipna=False))
|
324 |
for i, (task_category, task_category_list) in enumerate(task_dict.items()):
|
325 |
+
DATA_OVERALL.insert(
|
326 |
+
i+3,
|
327 |
+
f"{task_category} MTR% ({len(task_category_list)} datasets)",
|
328 |
+
task_ranks[task_category_list].mean(axis=1, skipna=False)
|
329 |
+
)
|
330 |
+
DATA_OVERALL.insert(
|
331 |
+
i+4,
|
332 |
+
f"{task_category} Average ({len(task_category_list)} datasets)",
|
333 |
+
DATA_OVERALL[task_category_list].mean(axis=1, skipna=False)
|
334 |
+
)
|
335 |
+
|
336 |
+
|
337 |
+
# sort by MTR in ascending order: lower is better for ranks
|
338 |
+
DATA_OVERALL.sort_values(mtr_column, ascending=True, inplace=True)
|
339 |
# Start ranking from 1
|
340 |
DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
|
341 |
|
|
|
343 |
|
344 |
DATA_TASKS = {}
|
345 |
for task_category, task_category_list in task_dict.items():
|
346 |
+
DATA_TASKS[task_category] = add_rank(
|
347 |
+
DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + task_category_list]
|
348 |
+
)
|
349 |
DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)]
|
350 |
|
351 |
# Fill NaN after averaging
|
352 |
DATA_OVERALL.fillna("", inplace=True)
|
353 |
|
354 |
+
data_overall_rows = [
|
355 |
+
"Rank",
|
356 |
+
"Model",
|
357 |
+
"Model Size (Million Parameters)",
|
358 |
+
"Memory Usage (GB, fp32)",
|
359 |
+
"Embedding Dimensions",
|
360 |
+
"Max Tokens",
|
361 |
+
mtr_column,
|
362 |
+
f"Average ({len(all_tasks)} datasets)"
|
363 |
+
]
|
364 |
for task_category, task_category_list in task_dict.items():
|
365 |
+
data_overall_rows.append(
|
366 |
+
f"{task_category} MTR% ({len(task_category_list)} datasets)"
|
367 |
+
)
|
368 |
data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)")
|
369 |
|
370 |
DATA_OVERALL = DATA_OVERALL[data_overall_rows]
|
|
|
391 |
boards_data[board]["data_tasks"][task_category] = data_task_category
|
392 |
all_data_tasks.append(data_task_category)
|
393 |
|
394 |
+
|
395 |
+
EXPORTED_DATA_FILEPATH = "boards_data.json"


def _records_for_json(df):
    """Return ``df`` as a list of row dicts with float NaN replaced by ``None``.

    ``json`` serialises NaN as the bare token ``NaN``, which is not valid JSON
    and is rejected by strict parsers; ``None`` is written as ``null`` instead.
    """
    import math  # function-scope import keeps this block self-contained

    return [
        {
            column: None if isinstance(value, float) and math.isnan(value) else value
            for column, value in row.items()
        }
        for row in df.to_dict(orient="records")
    ]


def export_to_json():
    """Write every board's leaderboard tables to ``EXPORTED_DATA_FILEPATH``.

    For each board in ``BOARDS_CONFIG`` the output holds a ``"data_overall"``
    entry (a list of row dicts, or ``None`` when the board has no overall
    table) and a ``"data_tasks"`` mapping from task category to a list of row
    dicts. The frames are read from the module-level ``boards_data``.

    Raises:
        TypeError: if an overall or per-task table is not a pandas DataFrame.
    """
    # TODO: make this play nicely with the 'refresh' functionality? (currently
    # disabled)
    export = {}
    for board, board_config in BOARDS_CONFIG.items():
        data = boards_data[board]
        exp = {"data_overall": None, "data_tasks": {}}

        data_overall = data.get("data_overall")
        if data_overall is not None:
            # Explicit raise rather than assert: asserts vanish under ``python -O``.
            if not isinstance(data_overall, pd.DataFrame):
                raise TypeError(f"data_overall not a df, was {type(data_overall)}")
            exp["data_overall"] = _records_for_json(data_overall)

        # Only the category names are needed; the dataset lists are unused here.
        for task_category in board_config["tasks"]:
            task_df = data["data_tasks"][task_category]
            if not isinstance(task_df, pd.DataFrame):
                raise TypeError(f"task data not a df, was {type(task_df)}")
            exp["data_tasks"][task_category] = _records_for_json(task_df)

        export[board] = exp

    with open(EXPORTED_DATA_FILEPATH, "w", encoding="utf-8") as fout:
        json.dump(export, fout, indent=4)
|
414 |
+
|
415 |
+
|
416 |
+
export_to_json()
|
417 |
+
|
418 |
# Exact, add all non-nan integer values for every dataset
|
419 |
NUM_SCORES = 0
|
420 |
DATASETS = []
|
|
|
466 |
- refresh: The function to refresh the leaderboard
|
467 |
"""
|
468 |
|
469 |
+
# def get_refresh_function(task_category, task_list):
|
470 |
+
# def _refresh():
|
471 |
+
# data_task_category = get_mteb_data(tasks=[task_category], datasets=task_list)
|
472 |
+
# data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
|
473 |
+
# return data_task_category
|
474 |
+
# return _refresh
|
475 |
|
476 |
data = {
|
477 |
"Overall": {"metric": "Various, refer to task tabs", "data": []}
|
|
|
499 |
"language_long": board_config["language_long"],
|
500 |
"description": f"**Overall MTEB {overall_pretty_name}** 🔮{board_icon}",
|
501 |
"data": boards_data[board]["data_overall"],
|
502 |
+
# "refresh": lambda: get_mteb_average(board_config["tasks"])[0],#partial(get_mteb_average, board_config["tasks"]),
|
503 |
"credits": credits,
|
504 |
})
|
505 |
for task_category, task_category_list in board_config["tasks"].items():
|
|
|
511 |
"language_long": board_config["language_long"],
|
512 |
"description": f"**{task_category} {board_pretty_name}** {task_icon}{board_icon}",
|
513 |
"data": boards_data[board]["data_tasks"][task_category],
|
514 |
+
# "refresh": get_refresh_function(task_category, task_category_list),
|
515 |
"credits": credits,
|
516 |
})
|
517 |
|
|
|
641 |
elem_classes=["filter-checkbox-group"],
|
642 |
scale=2,
|
643 |
)
|
644 |
+
export_button = gr.DownloadButton(
|
645 |
+
label="Download as JSON",
|
646 |
+
value=EXPORTED_DATA_FILEPATH,
|
647 |
+
)
|
648 |
|
649 |
with gr.Tabs() as outer_tabs:
|
650 |
# Store the tabs for updating them on load based on URL parameters
|
|
|
689 |
full_dataframe = gr.Dataframe(item["data"], datatype=datatype, type="pandas", visible=False)
|
690 |
full_dataframes.append(full_dataframe)
|
691 |
|
692 |
+
# with gr.Row():
|
693 |
+
# refresh_button = gr.Button("Refresh")
|
694 |
+
# refresh_button.click(item["refresh"], inputs=None, outputs=dataframe, concurrency_limit=20)
|
695 |
|
696 |
gr.Markdown(f"""
|
697 |
- **Total Datasets**: {NUM_DATASETS}
|