eduagarcia committed
Commit • ec3a730
Parent(s): 92ec1df

Enable clean initialization without any evaluate model

Files changed:
- src/display/utils.py  +8 -0
- src/populate.py  +1 -0
- src/tools/plots.py  +5 -2
src/display/utils.py
CHANGED
@@ -85,6 +85,14 @@ baseline_row = {
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
     AutoEvalColumn.flagged.name: False,
+    AutoEvalColumn.model_type_symbol.name: None,
+    AutoEvalColumn.architecture.name: None,
+    AutoEvalColumn.weight_type.name: None,
+    AutoEvalColumn.params.name: None,
+    AutoEvalColumn.likes.name: None,
+    AutoEvalColumn.license.name: None,
+    AutoEvalColumn.still_on_hub.name: None,
+    AutoEvalColumn.moe.name: None
 }
 
 # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
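Why the extra keys: on a clean install, baseline_row can be the only record fed into the leaderboard DataFrame, and any AutoEvalColumn key it omits never becomes a column, so the later df[cols] selection in populate.py raises KeyError. A minimal sketch of that pandas behaviour, using placeholder column names rather than the real AutoEvalColumn fields:

import pandas as pd

# Sketch, not the leaderboard code: keys a lone record lacks never become
# DataFrame columns, so selecting them later fails.
baseline_like = {"model": "baseline", "average": 0.897}
cols = ["model", "average", "license"]

df = pd.DataFrame.from_records([baseline_like])
try:
    df[cols]
except KeyError as err:
    print(err)  # "['license'] not in index"

# Padding the record with None for every remaining column fixes the selection.
df = pd.DataFrame.from_records([{**baseline_like, "license": None}])
print(df[cols])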
src/populate.py
CHANGED
@@ -17,6 +17,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
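The trailing context comment describes a filtering step that follows this hunk. One common way to express such a filter, not necessarily the helper this repo uses, is to drop rows with a missing value in any benchmark column:

import pandas as pd

# Hypothetical benchmark column names, for illustration only.
benchmark_cols = ["bench_a", "bench_b"]

df = pd.DataFrame.from_records([
    {"model": "m1", "bench_a": 0.5, "bench_b": 0.4},
    {"model": "m2", "bench_a": 0.7, "bench_b": None},  # benchmark not produced
])

# Keep only rows where every benchmark column has a value.
df = df[df[benchmark_cols].notna().all(axis=1)]
print(df)  # only m1 survives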
src/tools/plots.py
CHANGED
@@ -17,7 +17,10 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
     # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
-    results_df = pd.DataFrame(raw_data)
+
+    #create dataframe with EvalResult dataclass columns, even if raw_data is empty
+    results_df = pd.DataFrame(raw_data, columns=EvalResult.__dataclass_fields__.keys())
+
     #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
     results_df.sort_values(by="date", inplace=True)
 
@@ -49,7 +52,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
         last_date = current_date
 
     # Step 4: Return all dictionaries as DataFrames
-    return {k: pd.DataFrame(v) for k, v in scores.items()}
+    return {k: pd.DataFrame(v, columns=["model", "date", "score"]) for k, v in scores.items()}
 
 
 def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
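Both plots.py tweaks guard against an empty results list: a DataFrame built from an empty list has no columns unless columns= is passed, so a downstream call like sort_values(by="date") would raise KeyError before this commit. Passing the EvalResult dataclass field names via __dataclass_fields__ keeps the schema even with zero evaluated models. A minimal sketch, using a stand-in EvalResult with only a few fields:

from dataclasses import dataclass
import pandas as pd

# Stand-in for the real EvalResult, which has many more fields; only meant to
# illustrate the columns= trick from the diff.
@dataclass
class EvalResult:
    eval_name: str = ""
    full_model: str = ""
    date: str = ""

raw_data: list[EvalResult] = []  # clean install: no evaluated models yet

# Without columns=, an empty list yields a DataFrame with zero columns and
# sort_values(by="date") raises KeyError; with columns= the schema survives.
results_df = pd.DataFrame(raw_data, columns=EvalResult.__dataclass_fields__.keys())
print(list(results_df.columns))  # ['eval_name', 'full_model', 'date']
results_df.sort_values(by="date", inplace=True)  # fine even when empty

The same idea applies to the return statement: pd.DataFrame(v, columns=["model", "date", "score"]) gives every per-metric frame the three expected columns even when v is an empty list.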