IL-TUR-Leaderboard

Running

App Files Files Community

Clémentine committed on Nov 21, 2023

Commit

314f91a

•

1 Parent(s): 1257fc3

fixs

Browse files

Files changed (5) hide show

src/display/about.py +2 -2
src/display/utils.py +1 -0
src/leaderboard/filter_models.py +0 -50
src/populate.py +1 -4
src/submission/submit.py +2 -2

src/display/about.py CHANGED Viewed

@@ -1,6 +1,5 @@
-from src.display.utils import ModelType
-from enum import Enum
 from dataclasses import dataclass
 @dataclass
 class Task:
@@ -8,6 +7,7 @@ class Task:
     metric: str
     col_name: str
 # Init: to update with your specific keys
 class Tasks(Enum):
     task0 = Task("Key in the harness", "metric in the harness", "Display name 1")

 from dataclasses import dataclass
+from enum import Enum
 @dataclass
 class Task:
     metric: str
     col_name: str
 # Init: to update with your specific keys
 class Tasks(Enum):
     task0 = Task("Key in the harness", "metric in the harness", "Display name 1")

src/display/utils.py CHANGED Viewed

@@ -8,6 +8,7 @@ from src.display.about import Tasks
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed

src/leaderboard/filter_models.py DELETED Viewed

@@ -1,50 +0,0 @@
-from src.display.formatting import model_hyperlink
-from src.display.utils import AutoEvalColumn
-# Models which have been flagged by users as being problematic for a reason or another
-# (Model name to forum discussion link)
-FLAGGED_MODELS = {
-    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
-    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
-    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
-    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
-    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
-    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
-    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
-}
-# Models which have been requested by orgs to not be submitted on the leaderboard
-DO_NOT_SUBMIT_MODELS = [
-    "Voicelab/trurl-2-13b",  # trained on MMLU
-]
-def flag_models(leaderboard_data: list[dict]):
-    for model_data in leaderboard_data:
-        if model_data["model_name_for_query"] in FLAGGED_MODELS:
-            issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
-            issue_link = model_hyperlink(
-                FLAGGED_MODELS[model_data["model_name_for_query"]],
-                f"See discussion #{issue_num}",
-            )
-            model_data[
-                AutoEvalColumn.model.name
-            ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
-def remove_forbidden_models(leaderboard_data: list[dict]):
-    indices_to_remove = []
-    for ix, model in enumerate(leaderboard_data):
-        if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
-            indices_to_remove.append(ix)
-    for ix in reversed(indices_to_remove):
-        leaderboard_data.pop(ix)
-    return leaderboard_data
-def filter_models(leaderboard_data: list[dict]):
-    leaderboard_data = remove_forbidden_models(leaderboard_data)
-    flag_models(leaderboard_data)

src/populate.py CHANGED Viewed

@@ -4,16 +4,13 @@ import os
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
-from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-    all_data_json.append(baseline_row)
-    filter_models(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)

 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)

src/submission/submit.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -45,7 +45,7 @@ def add_new_eval(
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')

 from datetime import datetime, timezone
 from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')