Commit 092c345
1 Parent(s): 10a491c

apply-ruff (#748)
- updated makefile (be37fd7da09122feb960037baf94ebb38d4fe2fc)
- updated makefile (50c352c7deb1b0501d4bd8ee8f33b6dff9ba3595)
- apply code style and quality checks to app.py (86c3dd57010f6b3edc41df0c8ebe3f31e265f995)
- apply code style and quality checks to envs.py (0fed1ec893f6a24e3effcdd33275221a2690bb6d)
- apply code style and quality checks to populate.py (bb51465efd2159c70e28d9a37a4fdb64262d52cf)
- apply code style and quality checks to utils.py (9d989a45c8a2d503357e93f4d69286f7971ee9de)
- apply code style and quality checks to filter_models.py (9b7814c00d977c716c5f06b4248bd6470043e01c)
- apply code style and quality checks to read_evals.py (d95d4a16bcb27159054316ed125c5d80769d91df)
Co-authored-by: Alina Lozovskaya <[email protected]>
- Makefile +14 -9
- app.py +26 -27
- src/display/utils.py +4 -3
- src/envs.py +0 -2
- src/leaderboard/filter_models.py +2 -3
- src/leaderboard/read_evals.py +27 -29
- src/populate.py +1 -3
Makefile
CHANGED
@@ -1,13 +1,18 @@
-.PHONY: style format
-
+.PHONY: style format quality all
 
+# Applies code style fixes to the specified file or directory
 style:
-
-
-	ruff check --fix
-
+	@echo "Applying style fixes to $(file)"
+	ruff format $(file)
+	ruff check --fix $(file) --line-length 119
 
+# Checks code quality for the specified file or directory
 quality:
-
-
-
+	@echo "Checking code quality for $(file)"
+	ruff check $(file) --line-length 119
+
+# Applies PEP8 formatting and checks the entire codebase
+all:
+	@echo "Formatting and checking the entire codebase"
+	ruff format .
+	ruff check --fix . --line-length 119
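With the new targets, style fixes and quality checks can be scoped to a single file or directory via the `file` variable (for example `make style file=app.py` or `make quality file=src`), while `make all` formats and checks the whole codebase; this usage is inferred from the recipes above rather than stated in the commit message.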
app.py
CHANGED
@@ -1,5 +1,4 @@
 import os
-import pandas as pd
 import logging
 import time
 import gradio as gr
@@ -23,8 +22,6 @@ from src.display.utils import (
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
     AutoEvalColumn,
     ModelType,
     Precision,
@@ -51,11 +48,12 @@ from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 # Configure logging
-logging.basicConfig(level=logging.INFO, format=
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
@@ -68,6 +66,7 @@ def time_diff_wrapper(func):
         diff = end_time - start_time
         logging.info(f"Time taken for {func.__name__}: {diff} seconds")
         return result
+
     return wrapper
 
 
@@ -89,12 +88,13 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
             logging.info("Download successful")
             return
         except Exception as e:
-            wait_time = backoff_factor
+            wait_time = backoff_factor**attempt
             logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
             time.sleep(wait_time)
             attempt += 1
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
+
 def init_space(full_init: bool = True):
     """Initializes the application space, loading only necessary data."""
     if full_init:
@@ -120,12 +120,13 @@ def init_space(full_init: bool = True):
         update_collections(original_df)
 
     leaderboard_df = original_df.copy()
-
+
     # Evaluation queue DataFrame retrieval is independent of initialization detail level
     eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
     return leaderboard_df, raw_data, original_df, eval_queue_dfs
 
+
 # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
 # This controls whether a full initialization should be performed.
 do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
@@ -153,36 +154,34 @@ with demo:
             value=leaderboard_df,
             datatype=[c.type for c in fields(AutoEvalColumn)],
             select_columns=SelectColumns(
-                default_selection=[
-                    c.name
-                    for c in fields(AutoEvalColumn)
-                    if c.displayed_by_default
-                ],
+                default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                 cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
                 label="Select Columns to Display:",
             ),
-            search_columns=[
-
-                AutoEvalColumn.fullname.name,
-                AutoEvalColumn.license.name
-            ],
-            hide_columns=[
-                c.name
-                for c in fields(AutoEvalColumn)
-                if c.hidden
-            ],
+            search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
+            hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
             filter_columns=[
                 ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
                 ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-                ColumnFilter(
-
-
+                ColumnFilter(
+                    AutoEvalColumn.params.name,
+                    type="slider",
+                    min=0,
+                    max=150,
+                    label="Select the number of parameters (B)",
+                ),
+                ColumnFilter(
+                    AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
+                ),
+                ColumnFilter(
+                    AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
+                ),
                 ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
                 ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
             ],
-            bool_checkboxgroup_label="Hide models"
+            bool_checkboxgroup_label="Hide models",
         )
-
+
         with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Column():
@@ -313,4 +312,4 @@ scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h
 scheduler.add_job(update_dynamic_files, "interval", hours=2)  # launched every 2 hour
 scheduler.start()
 
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
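The retry fix above replaces a constant wait with an exponential one (`wait_time = backoff_factor**attempt`). A minimal standalone sketch of that retry pattern, assuming a generic `download` callable in place of the Space's actual Hub download call:

import logging
import time


def download_with_retries(download, repo_id, max_attempts=3, backoff_factor=1.5):
    """Retry `download(repo_id)` with exponentially growing waits (1, 1.5, 2.25, ... seconds)."""
    attempt = 0
    while attempt < max_attempts:
        try:
            download(repo_id)
            logging.info("Download successful")
            return
        except Exception as e:
            # Only this backoff line is taken verbatim from the diff; the loop structure is assumed.
            wait_time = backoff_factor**attempt
            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
            attempt += 1
    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")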
src/display/utils.py
CHANGED
@@ -7,7 +7,8 @@ import pandas as pd
 
 
 # Configure logging
-logging.basicConfig(level=logging.INFO, format=
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
 
 def parse_datetime(datetime_str):
     formats = [
@@ -15,7 +16,7 @@ def parse_datetime(datetime_str):
         "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
         "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
     ]
-
+
     for fmt in formats:
         try:
             return datetime.strptime(datetime_str, fmt)
@@ -25,6 +26,7 @@ def parse_datetime(datetime_str):
     logging.error(f"No valid date format found for: {datetime_str}")
     return datetime(1970, 1, 1)
 
+
 def load_json_data(file_path):
     """Safely load JSON data from a file."""
     try:
@@ -98,7 +100,6 @@ auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
-
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
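`parse_datetime` above walks a list of candidate `strptime` formats and falls back to the Unix epoch when nothing matches. A small self-contained sketch of that behaviour (the `except ValueError: continue` branch is assumed, since the diff only shows the surrounding lines):

import logging
from datetime import datetime


def parse_datetime(datetime_str):
    formats = [
        "%Y-%m-%dT%H:%M:%S.%f",  # standard format with colons
        "%Y-%m-%dT%H %M %S.%f",  # spaces as separator
    ]

    for fmt in formats:
        try:
            return datetime.strptime(datetime_str, fmt)
        except ValueError:  # assumed branch: try the next format
            continue

    # Nothing matched: log and return a sentinel date instead of raising.
    logging.error(f"No valid date format found for: {datetime_str}")
    return datetime(1970, 1, 1)


print(parse_datetime("2024-03-01T12:30:00.000000"))  # -> 2024-03-01 12:30:00
print(parse_datetime("not-a-date"))                  # -> 1970-01-01 00:00:00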
src/envs.py
CHANGED
@@ -1,6 +1,4 @@
 import os
-import logging
-
 from huggingface_hub import HfApi
 
 # clone / pull the lmeh eval data
src/leaderboard/filter_models.py
CHANGED
@@ -137,9 +137,9 @@ def flag_models(leaderboard_data: list[dict]):
         if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         else:
-
+            # Merges and moes are flagged
            flag_key = "merged"
-
+
         # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
@@ -171,4 +171,3 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)
     flag_models(leaderboard_data)
-
src/leaderboard/read_evals.py
CHANGED
@@ -16,36 +16,36 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
 
 # Configure logging
-logging.basicConfig(level=logging.INFO, format=
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
 
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str
-    full_model: str
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
     org: Optional[str]
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: Dict[str, float]
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     not_flagged: bool = False
     status: str = "FINISHED"
     # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
     tags: List[str] = field(default_factory=list)
-
-
+
     @classmethod
-    def init_from_json_file(cls, json_filepath: str) ->
-        with open(json_filepath,
+    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
+        with open(json_filepath, "r") as fp:
             data = json.load(fp)
 
         config = data.get("config_general", {})
@@ -72,7 +72,7 @@ class EvalResult:
             model=model,
             results=results,
             precision=precision,
-            revision=config.get("model_sha", "")
+            revision=config.get("model_sha", ""),
         )
 
     @staticmethod
@@ -118,9 +118,8 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
-
-        return results
 
+        return results
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it."""
@@ -130,17 +129,17 @@ class EvalResult:
                 logging.warning(f"No request file for {self.org}/{self.model}")
                 self.status = "FAILED"
                 return
-
+
             with open(request_file, "r") as f:
                 request = json.load(f)
-
+
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.num_params = int(request.get("params", 0))  # Ensuring type safety
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
            self.status = request.get("status", "FAILED")
-
+
         except FileNotFoundError:
             self.status = "FAILED"
             logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
@@ -154,7 +153,6 @@ class EvalResult:
             self.status = "FAILED"
             logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
 
-
     def update_with_dynamic_file_dict(self, file_dict):
         """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
         # Default values set for optional or potentially missing keys.
@@ -162,11 +160,10 @@ class EvalResult:
         self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
         self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
         self.tags = file_dict.get("tags", [])
-
+
         # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
         self.not_flagged = not (any("flagged" in tag for tag in self.tags))
 
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -185,8 +182,10 @@ class EvalResult:
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            AutoEvalColumn.merged.name: not(
-            AutoEvalColumn.moe.name: not (
+            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
+            AutoEvalColumn.moe.name: not (
+                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
+            ),
             AutoEvalColumn.not_flagged.name: self.not_flagged,
         }
 
@@ -194,16 +193,16 @@ class EvalResult:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         return data_dict
-
+
 
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     requests_path = Path(requests_path)
     pattern = f"{model_name}_eval_request_*.json"
-
+
     # Using pathlib to find files matching the pattern
     request_files = list(requests_path.glob(pattern))
-
+
     # Sort the files by name in descending order to mimic 'reverse=True'
     request_files.sort(reverse=True)
 
@@ -214,7 +213,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
            req_content = json.load(f)
         if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
             request_file = str(request_file)
-
+
     # Return empty string if no file found that matches criteria
     return request_file
 
@@ -223,9 +222,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     """From the path of the results folder root, extract all needed info for results"""
     with open(dynamic_path) as f:
         dynamic_data = json.load(f)
-
+
     results_path = Path(results_path)
-    model_files = list(results_path.rglob(
+    model_files = list(results_path.rglob("results_*.json"))
     model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
 
     eval_results = {}
@@ -260,4 +259,3 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
             continue
 
     return results
-
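The `to_dict` change derives the `merged` and `moe` display booleans from the model's tags (and, for MoE, from the model name), negating them because the leaderboard uses these columns with "hide" filters. A minimal sketch of that derivation, with the `AutoEvalColumn` keys replaced by plain strings and a hypothetical helper name:

from typing import List


def display_flags(tags: List[str], full_model: str) -> dict:
    """Hypothetical mirror of the booleans built in EvalResult.to_dict, with simplified keys."""
    return {
        "merged": not ("merge" in tags if tags else False),
        "moe": not (("moe" in tags if tags else False) or "moe" in full_model.lower()),
        "not_flagged": not any("flagged" in tag for tag in tags),
    }


print(display_flags(["merge"], "org/model"))    # {'merged': False, 'moe': True, 'not_flagged': True}
print(display_flags([], "org/some-moe-model"))  # {'merged': True, 'moe': False, 'not_flagged': True}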
src/populate.py
CHANGED
@@ -1,5 +1,3 @@
-import json
-import os
 import pathlib
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -21,7 +19,7 @@ def get_evaluation_queue_df(save_path, cols):
     save_path = pathlib.Path(save_path)
     all_evals = []
 
-    for path in save_path.rglob(
+    for path in save_path.rglob("*.json"):
         data = load_json_data(path)
         if data:
             all_evals.append(_process_model_data(data))