add 5-shot
src/leaderboard/read_evals.py CHANGED (+14 -13)
@@ -33,7 +33,7 @@ class EvalResult:
     still_on_hub: bool = False
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, n_shot_num):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -74,7 +74,7 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k and n_shot.get(k, -1) ==
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k and n_shot.get(k, -1) == n_shot_num])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -253,17 +253,18 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+    for n_shot in [0,5]:
+        for model_result_filepath in model_result_filepaths:
+            # Creation of result
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
+            eval_result.update_with_request_file(requests_path)
+
+            # Store results of same eval together
+            eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
+            if eval_name in eval_results.keys():
+                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            else:
+                eval_results[eval_name] = eval_result
 
     results = []
     for v in eval_results.values():
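For context, the changed `accs = ...` line keeps only the scores recorded at the requested shot count. Below is a minimal runnable sketch of that selection. It assumes `n_shot` is a module-level dict mapping benchmark name to shot count, which the diff implies but does not show; the benchmark names and scores are invented for illustration.

import numpy as np

# Assumed shape: benchmark name -> shot count the benchmark was run at.
n_shot = {"arc_challenge": 0, "mmlu": 5}

# Assumed shape of one result file's payload: benchmark name -> metric dict.
data = {"results": {"arc_challenge": {"acc": 0.41}, "mmlu": {"acc": 0.58}}}

def mean_acc(benchmark, metric, n_shot_num):
    # Same filter as in the diff: matching benchmark AND matching shot count.
    accs = np.array([
        v.get(metric, None)
        for k, v in data["results"].items()
        if benchmark == k and n_shot.get(k, -1) == n_shot_num
    ])
    if accs.size == 0 or any(acc is None for acc in accs):
        return None  # benchmark absent, or not run at this shot count
    return float(np.mean(accs))

print(mean_acc("mmlu", "acc", 5))  # 0.58
print(mean_acc("mmlu", "acc", 0))  # None: mmlu was only recorded as 5-shot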
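The reworked loop in `get_raw_eval_results` now keys entries by model and shot count, so the same model can surface once per setting (e.g. one 0-shot row and one 5-shot row). Here is a standalone sketch of that grouping pattern, using a simplified stand-in for `EvalResult` and fake file contents; every name and number below is illustrative, not the Space's actual data.

from dataclasses import dataclass, field

@dataclass
class SimpleResult:
    # Simplified stand-in for the Space's EvalResult.
    eval_name: str
    results: dict = field(default_factory=dict)

# Fake files: path -> {shot count -> {benchmark: score}}; None means the
# benchmark was not run at that shot count.
FAKE_FILES = {
    "model_a/mmlu.json": {0: {"mmlu": None}, 5: {"mmlu": 0.58}},
    "model_a/arc.json": {0: {"arc": 0.41}, 5: {"arc": 0.45}},
}

def init_from_json_file(filepath, n_shot_num):
    # Stand-in for EvalResult.init_from_json_file(filepath, n_shot_num).
    model = filepath.split("/")[0]
    return SimpleResult(eval_name=model, results=dict(FAKE_FILES[filepath][n_shot_num]))

eval_results = {}
for n_shot in [0, 5]:
    for filepath in FAKE_FILES:
        eval_result = init_from_json_file(filepath, n_shot_num=n_shot)
        # Key by model AND shot count, so 0-shot and 5-shot rows coexist.
        eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
        if eval_name in eval_results:
            eval_results[eval_name].results.update(
                {k: v for k, v in eval_result.results.items() if v is not None}
            )
        else:
            eval_results[eval_name] = eval_result

for name, res in eval_results.items():
    print(name, res.results)
# model_a_0-shot {'mmlu': None, 'arc': 0.41}
# model_a_5-shot {'mmlu': 0.58, 'arc': 0.45}

One behavior the sketch surfaces: the `v is not None` filter only applies when merging into an existing entry, so the first file stored under a key keeps any None scores it carries.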