g8a9 committed
Commit e3d6a90
Parent: 1c41f75
Files changed (4)
  1. app.py +2 -2
  2. src/about.py +8 -3
  3. src/leaderboard/read_evals.py +16 -7
  4. src/populate.py +34 -34
app.py CHANGED
@@ -27,7 +27,7 @@ from src.display.utils import (
     Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
@@ -49,7 +49,7 @@ def restart_space():
 # restart_space()
 
 try:
-    print(EVAL_RESULTS_PATH)
+    print("Saving results locally at:", EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO,
         local_dir=EVAL_RESULTS_PATH,
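
For reference, the reworded print feeds straight into huggingface_hub.snapshot_download in the lines below it. A minimal standalone sketch of that call, assuming placeholder values for the src.envs constants and an assumed repo_type (the keyword arguments past local_dir are outside this hunk):

from huggingface_hub import snapshot_download

# Hypothetical stand-ins for the constants imported from src.envs.
RESULTS_REPO = "my-org/leaderboard-results"  # assumption, not the real repo id
EVAL_RESULTS_PATH = "./eval-results"

print("Saving results locally at:", EVAL_RESULTS_PATH)
snapshot_download(
    repo_id=RESULTS_REPO,          # which Hub repo to mirror
    local_dir=EVAL_RESULTS_PATH,   # where the json result files end up
    repo_type="dataset",           # assumption: result repos are usually datasets
)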
src/about.py CHANGED
@@ -15,18 +15,23 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
     task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
-    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
+    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
+    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
+    task12 = Task("haspeede2_hs", "f1,none", "HaSpeeDe2 HS")
+    task13 = Task("haspeede2_stereo", "f1,none", "HaSpeeDe2 Stereo")
     task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
     task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
+    task14 = Task("ironita_irony", "f1,none", "IronITA Irony")
+    task15 = Task("ironita_sarcasm", "f1,none", "IronITA Sarcasm")
     task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
     task8 = Task("news_sum", "bertscore,none", "News Sum")
+    task16 = Task("sentipolc", "f1,none", "SENTIPOLC")
     task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
     task10 = Task("truthfulqa_mc2_ita", "acc,none", "TruthfulQA")
-    task11 = Task("xcopa_it", "acc,none", "TruthfulQA")
+    task11 = Task("xcopa_it", "acc,none", "XCOPA")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
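
The entries above imply a small Task container. A sketch of what it presumably looks like, with the first three field names taken from the stock leaderboard template and the two extra flags read directly off the calls in this file (an assumption, since the dataclass itself is outside this diff):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str                   # task key in the results json, e.g. "sentipolc"
    metric: str                      # metric key in the results json, e.g. "f1,none"
    col_name: str                    # column name shown in the leaderboard, e.g. "SENTIPOLC"
    higher_is_better: bool = True    # HONEST sets this to False (lower is better)
    scale_by_100: bool = True        # ItaCoLA disables this: MCC is not a percentage

Under that assumption, Tasks.task16.value.col_name evaluates to "SENTIPOLC" and Tasks.task6.value.higher_is_better to False.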
src/leaderboard/read_evals.py CHANGED
@@ -11,6 +11,8 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, DisclosedType
 from src.submission.check_validity import is_model_on_hub
 
+import pdb
+
 
 @dataclass
 class EvalResult:
@@ -80,6 +82,8 @@ class EvalResult:
         architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
+
+        # pdb.set_trace()
         results = {}
         for task in Tasks:
             task = task.value
@@ -102,6 +106,8 @@ class EvalResult:
 
             results[task.benchmark] = mean_acc
 
+        # pdb.set_trace()
+
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -204,7 +210,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+
+        # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -213,12 +220,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         else:
             eval_results[eval_name] = eval_result
 
-    results = []
-    for v in eval_results.values():
+    results_for_table = list()
+    for k, v in eval_results.items():
         try:
             v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError: # not all eval values present
-            continue
+            results_for_table.append(v)
+        except RuntimeError as e: # not all eval values present
+            print(f"Issue with results of: ", k)
+            raise e
+            # continue
 
-    return results
+    return results_for_table
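
The last hunk changes the aggregation loop from skip-on-failure to fail-fast: a result whose to_dict() raises used to be dropped silently, whereas now the offending eval name is printed and the exception is re-raised, aborting the whole leaderboard build. A minimal sketch of that behaviour, using a hypothetical stand-in instead of the real EvalResult:

class IncompleteResult:
    """Stand-in whose to_dict() fails the way an incomplete EvalResult would."""

    def to_dict(self):
        raise RuntimeError("missing benchmark score")


eval_results = {"my-org/some-model_float16": IncompleteResult()}  # hypothetical key

results_for_table = list()
for k, v in eval_results.items():
    try:
        v.to_dict()                    # test that the dict version is complete
        results_for_table.append(v)
    except RuntimeError as e:          # not all eval values present
        print("Issue with results of:", k)
        raise e                        # fail fast instead of skipping the model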
src/populate.py CHANGED
@@ -22,37 +22,37 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     return raw_data, df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+# def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+#     """Creates the different dataframes for the evaluation queues requestes"""
+#     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+#     all_evals = []
+
+#     for entry in entries:
+#         if ".json" in entry:
+#             file_path = os.path.join(save_path, entry)
+#             with open(file_path) as fp:
+#                 data = json.load(fp)
+
+#             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+#             all_evals.append(data)
+#         elif ".md" not in entry:
+#             # this is a folder
+#             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+#             for sub_entry in sub_entries:
+#                 file_path = os.path.join(save_path, entry, sub_entry)
+#                 with open(file_path) as fp:
+#                     data = json.load(fp)
+
+#                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+#                 all_evals.append(data)
+
+#     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+#     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+#     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+#     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+#     df_running = pd.DataFrame.from_records(running_list, columns=cols)
+#     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+#     return df_finished[cols], df_running[cols], df_pending[cols]