g8a9 committed
Commit e3d6a90
Parent: 1c41f75
Files changed (4)
  1. app.py +2 -2
  2. src/about.py +8 -3
  3. src/leaderboard/read_evals.py +16 -7
  4. src/populate.py +34 -34
app.py CHANGED
@@ -27,7 +27,7 @@ from src.display.utils import (
     Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
@@ -49,7 +49,7 @@ def restart_space():
 # restart_space()
 
 try:
-    print(EVAL_RESULTS_PATH)
+    print("Saving results locally at:", EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO,
         local_dir=EVAL_RESULTS_PATH,
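
For reference, the reworded print feeds straight into huggingface_hub.snapshot_download in the lines below it. A minimal standalone sketch of that call, assuming placeholder values for the src.envs constants and an assumed repo_type (the keyword arguments past local_dir are outside this hunk):

from huggingface_hub import snapshot_download

# Hypothetical stand-ins for the constants imported from src.envs.
RESULTS_REPO = "my-org/leaderboard-results"  # assumption, not the real repo id
EVAL_RESULTS_PATH = "./eval-results"

print("Saving results locally at:", EVAL_RESULTS_PATH)
snapshot_download(
    repo_id=RESULTS_REPO,          # which Hub repo to mirror
    local_dir=EVAL_RESULTS_PATH,   # where the json result files end up
    repo_type="dataset",           # assumption: result repos are usually datasets
)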
src/about.py CHANGED
@@ -15,18 +15,23 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task1 = Task("ami_2020_aggressiveness", "f1,none", "AMI 2020 Agg")
     task2 = Task("ami_2020_misogyny", "f1,none", "AMI 2020 Miso")
-    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
+    task0 = Task("arc_challenge_ita", "acc_norm,none", "ARC-C")
     task4 = Task("belebele_ita", "acc_norm,none", "Belebele")
+    task3 = Task("gente_rephrasing", "acc,none", "GeNTE Neutralizing")
+    task12 = Task("haspeede2_hs", "f1,none", "HaSpeeDe2 HS")
+    task13 = Task("haspeede2_stereo", "f1,none", "HaSpeeDe2 Stereo")
     task5 = Task("hatecheck_ita", "f1,none", "HateCheck")
     task6 = Task("honest_ita", "acc,none", "HONEST", higher_is_better=False)
+    task14 = Task("ironita_irony", "f1,none", "IronITA Irony")
+    task15 = Task("ironita_sarcasm", "f1,none", "IronITA Sarcasm")
     task7 = Task("itacola", "mcc,none", "ItaCoLA", scale_by_100=False)
     task8 = Task("news_sum", "bertscore,none", "News Sum")
+    task16 = Task("sentipolc", "f1,none", "SENTIPOLC")
     task9 = Task("squad_it", "squad_f1,get-answer", "SQuAD it")
     task10 = Task("truthfulqa_mc2_ita", "acc,none", "TruthfulQA")
-    task11 = Task("xcopa_it", "acc,none", "TruthfulQA")
+    task11 = Task("xcopa_it", "acc,none", "XCOPA")
 
 
 NUM_FEWSHOT = 0 # Change with your few shot
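
The entries above imply a small Task container. A sketch of what it presumably looks like, with the first three field names taken from the stock leaderboard template and the two extra flags read directly off the calls in this file (an assumption, since the dataclass itself is outside this diff):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str                   # task key in the results json, e.g. "sentipolc"
    metric: str                      # metric key in the results json, e.g. "f1,none"
    col_name: str                    # column name shown in the leaderboard, e.g. "SENTIPOLC"
    higher_is_better: bool = True    # HONEST sets this to False (lower is better)
    scale_by_100: bool = True        # ItaCoLA disables this: MCC is not a percentage

Under that assumption, Tasks.task16.value.col_name evaluates to "SENTIPOLC" and Tasks.task6.value.higher_is_better to False.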
src/leaderboard/read_evals.py CHANGED
@@ -11,6 +11,8 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, DisclosedType
 from src.submission.check_validity import is_model_on_hub
 
+import pdb
+
 
 @dataclass
 class EvalResult:
@@ -80,6 +82,8 @@ class EvalResult:
         architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
+
+        # pdb.set_trace()
         results = {}
         for task in Tasks:
             task = task.value
@@ -102,6 +106,8 @@ class EvalResult:
 
             results[task.benchmark] = mean_acc
 
+        # pdb.set_trace()
+
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -204,7 +210,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+
+        # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -213,12 +220,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         else:
             eval_results[eval_name] = eval_result
 
-    results = []
-    for v in eval_results.values():
+    results_for_table = list()
+    for k, v in eval_results.items():
         try:
             v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError: # not all eval values present
-            continue
+            results_for_table.append(v)
+        except RuntimeError as e: # not all eval values present
+            print(f"Issue with results of: ", k)
+            raise e
+            # continue
 
-    return results
+    return results_for_table
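
The last hunk changes the aggregation loop from skip-on-failure to fail-fast: a result whose to_dict() raises used to be dropped silently, whereas now the offending eval name is printed and the exception is re-raised, aborting the whole leaderboard build. A minimal sketch of that behaviour, using a hypothetical stand-in instead of the real EvalResult:

class IncompleteResult:
    """Stand-in whose to_dict() fails the way an incomplete EvalResult would."""

    def to_dict(self):
        raise RuntimeError("missing benchmark score")


eval_results = {"my-org/some-model_float16": IncompleteResult()}  # hypothetical key

results_for_table = list()
for k, v in eval_results.items():
    try:
        v.to_dict()                    # test that the dict version is complete
        results_for_table.append(v)
    except RuntimeError as e:          # not all eval values present
        print("Issue with results of:", k)
        raise e                        # fail fast instead of skipping the model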
src/populate.py CHANGED
@@ -22,37 +22,37 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     return raw_data, df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+# def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+#     """Creates the different dataframes for the evaluation queues requestes"""
+#     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+#     all_evals = []
+
+#     for entry in entries:
+#         if ".json" in entry:
+#             file_path = os.path.join(save_path, entry)
+#             with open(file_path) as fp:
+#                 data = json.load(fp)
+
+#             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+#             all_evals.append(data)
+#         elif ".md" not in entry:
+#             # this is a folder
+#             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+#             for sub_entry in sub_entries:
+#                 file_path = os.path.join(save_path, entry, sub_entry)
+#                 with open(file_path) as fp:
+#                     data = json.load(fp)
+
+#                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+#                 all_evals.append(data)
+
+#     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+#     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+#     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+#     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+#     df_running = pd.DataFrame.from_records(running_list, columns=cols)
+#     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+#     return df_finished[cols], df_running[cols], df_pending[cols]