yzabc007 committed on
Commit
04e5831
1 Parent(s): c1e1ef9

Update space

Browse files
app.py CHANGED
@@ -105,7 +105,8 @@ def init_leaderboard(dataframe):
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
  # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
107
  # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
108
- model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
 
109
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
110
 
111
 
@@ -169,13 +170,13 @@ with demo:
169
  # AutoEvalColumn.rank_overall.name,
170
  AutoEvalColumn.model.name,
171
  AutoEvalColumn.rank_overall.name,
172
- AutoEvalColumn.rank_math_algebra.name,
173
- AutoEvalColumn.rank_math_geometry.name,
174
  AutoEvalColumn.rank_math_probability.name,
175
  AutoEvalColumn.rank_reason_logical.name,
176
- AutoEvalColumn.rank_reason_social.name,
177
  AutoEvalColumn.rank_chemistry.name,
178
- AutoEvalColumn.rank_cpp.name,
179
  ],
180
  rank_col=[],
181
  )
@@ -264,7 +265,7 @@ with demo:
264
  AutoEvalColumn.rank_math_probability.name,
265
  AutoEvalColumn.model.name,
266
  AutoEvalColumn.score_math_probability.name,
267
- AutoEvalColumn.sd_math_probability.name,
268
  AutoEvalColumn.license.name,
269
  AutoEvalColumn.organization.name,
270
  AutoEvalColumn.knowledge_cutoff.name,
 
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
  # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
107
  # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
108
+ # model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
109
+ model_result_path = "./src/results/models_2024-10-18-14:06:13.588399.json"
110
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
111
 
112
 
 
170
  # AutoEvalColumn.rank_overall.name,
171
  AutoEvalColumn.model.name,
172
  AutoEvalColumn.rank_overall.name,
173
+ # AutoEvalColumn.rank_math_algebra.name,
174
+ # AutoEvalColumn.rank_math_geometry.name,
175
  AutoEvalColumn.rank_math_probability.name,
176
  AutoEvalColumn.rank_reason_logical.name,
177
+ # AutoEvalColumn.rank_reason_social.name,
178
  AutoEvalColumn.rank_chemistry.name,
179
+ # AutoEvalColumn.rank_cpp.name,
180
  ],
181
  rank_col=[],
182
  )
 
265
  AutoEvalColumn.rank_math_probability.name,
266
  AutoEvalColumn.model.name,
267
  AutoEvalColumn.score_math_probability.name,
268
+ # AutoEvalColumn.sd_math_probability.name,
269
  AutoEvalColumn.license.name,
270
  AutoEvalColumn.organization.name,
271
  AutoEvalColumn.knowledge_cutoff.name,
src/leaderboard/read_evals.py CHANGED
@@ -164,26 +164,26 @@ class ModelResult:
164
  # AutoEvalColumn.rank_reason_logical.name: int(self.results.get("Logical").get("Rank", None)),
165
  # AutoEvalColumn.rank_reason_social.name: int(self.results.get("Social").get("Rank", None)),
166
 
167
- AutoEvalColumn.score_overall.name: self.results.get("OVERALL").get("Average Score", None),
168
- AutoEvalColumn.score_math_algebra.name: self.results.get("Algebra").get("Average Score", None),
169
- AutoEvalColumn.score_math_geometry.name: self.results.get("Geometry").get("Average Score", None),
170
- AutoEvalColumn.score_math_probability.name: self.results.get("Probability").get("Average Score", None),
171
- AutoEvalColumn.score_reason_logical.name: self.results.get("Logical").get("Average Score", None),
172
- AutoEvalColumn.score_reason_social.name: self.results.get("Social").get("Average Score", None),
173
 
174
- AutoEvalColumn.sd_overall.name: self.results.get("OVERALL").get("Standard Deviation", None),
175
- AutoEvalColumn.sd_math_algebra.name: self.results.get("Algebra").get("Standard Deviation", None),
176
- AutoEvalColumn.sd_math_geometry.name: self.results.get("Geometry").get("Standard Deviation", None),
177
- AutoEvalColumn.sd_math_probability.name: self.results.get("Probability").get("Standard Deviation", None),
178
- AutoEvalColumn.sd_reason_logical.name: self.results.get("Logical").get("Standard Deviation", None),
179
- AutoEvalColumn.sd_reason_social.name: self.results.get("Social").get("Standard Deviation", None),
180
-
181
- AutoEvalColumn.rank_overall.name: self.results.get("OVERALL").get("Rank", None),
182
- AutoEvalColumn.rank_math_algebra.name: self.results.get("Algebra").get("Rank", None),
183
- AutoEvalColumn.rank_math_geometry.name: self.results.get("Geometry").get("Rank", None),
184
- AutoEvalColumn.rank_math_probability.name: self.results.get("Probability").get("Rank", None),
185
- AutoEvalColumn.rank_reason_logical.name: self.results.get("Logical").get("Rank", None),
186
- AutoEvalColumn.rank_reason_social.name: self.results.get("Social").get("Rank", None),
187
 
188
  AutoEvalColumn.score_chemistry.name: self.results.get("Chemistry").get("Average Score", None) if self.results.get("Chemistry") else None,
189
  AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
 
164
  # AutoEvalColumn.rank_reason_logical.name: int(self.results.get("Logical").get("Rank", None)),
165
  # AutoEvalColumn.rank_reason_social.name: int(self.results.get("Social").get("Rank", None)),
166
 
167
+ AutoEvalColumn.score_overall.name: self.results.get("OVERALL").get("Average Score", None) if self.results.get("OVERALL") else None,
168
+ AutoEvalColumn.score_math_algebra.name: self.results.get("Algebra").get("Average Score", None) if self.results.get("Algebra") else None,
169
+ AutoEvalColumn.score_math_geometry.name: self.results.get("Geometry").get("Average Score", None) if self.results.get("Geometry") else None,
170
+ AutoEvalColumn.score_math_probability.name: self.results.get("Probability").get("Average Score", None) if self.results.get("Probability") else None,
171
+ AutoEvalColumn.score_reason_logical.name: self.results.get("Logical").get("Average Score", None) if self.results.get("Logical") else None,
172
+ AutoEvalColumn.score_reason_social.name: self.results.get("Social").get("Average Score", None) if self.results.get("Social") else None,
173
 
174
+ AutoEvalColumn.sd_overall.name: self.results.get("OVERALL").get("Standard Deviation", None) if self.results.get("OVERALL") else None,
175
+ AutoEvalColumn.sd_math_algebra.name: self.results.get("Algebra").get("Standard Deviation", None) if self.results.get("Algebra") else None,
176
+ AutoEvalColumn.sd_math_geometry.name: self.results.get("Geometry").get("Standard Deviation", None) if self.results.get("Geometry") else None,
177
+ AutoEvalColumn.sd_math_probability.name: self.results.get("Probability").get("Standard Deviation", None) if self.results.get("Probability") else None,
178
+ AutoEvalColumn.sd_reason_logical.name: self.results.get("Logical").get("Standard Deviation", None) if self.results.get("Logical") else None,
179
+ AutoEvalColumn.sd_reason_social.name: self.results.get("Social").get("Standard Deviation", None) if self.results.get("Social") else None,
180
+
181
+ AutoEvalColumn.rank_overall.name: self.results.get("OVERALL").get("Rank", None) if self.results.get("OVERALL") else None,
182
+ AutoEvalColumn.rank_math_algebra.name: self.results.get("Algebra").get("Rank", None) if self.results.get("Algebra") else None,
183
+ AutoEvalColumn.rank_math_geometry.name: self.results.get("Geometry").get("Rank", None) if self.results.get("Geometry") else None,
184
+ AutoEvalColumn.rank_math_probability.name: self.results.get("Probability").get("Rank", None) if self.results.get("Probability") else None,
185
+ AutoEvalColumn.rank_reason_logical.name: self.results.get("Logical").get("Rank", None) if self.results.get("Logical") else None,
186
+ AutoEvalColumn.rank_reason_social.name: self.results.get("Social").get("Rank", None) if self.results.get("Social") else None,
187
 
188
  AutoEvalColumn.score_chemistry.name: self.results.get("Chemistry").get("Average Score", None) if self.results.get("Chemistry") else None,
189
  AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
src/populate.py CHANGED
@@ -19,15 +19,16 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
19
  df = pd.DataFrame.from_records(all_data_json)
20
 
21
  df = df[benchmark_cols]
22
- # print(df.head())
23
 
24
  if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
25
  df = df.dropna(subset=benchmark_cols)
26
  df = df.sort_values(by=[rank_col[0]], ascending=True)
27
  # print(rank_col, benchmark_cols)
 
28
  else:
29
  # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
30
- avg_rank = df.iloc[:, 1:].mean(axis=1)
31
  df["Average Rank"] = avg_rank.round(decimals=4)
32
  df = df.sort_values(by=["Average Rank"], ascending=True)
33
  df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
@@ -43,11 +44,15 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
43
  # print(col)
44
  # if 'Std dev' in col or 'Score' in col:
45
  if 'Std dev' in col or 'Score' in col:
46
- if "Chemistry" in col or "C++" in col:
 
 
 
 
47
  df[col] = (df[col]).map('{:.2f}'.format)
48
  else:
49
  df[col] = (df[col]*100).map('{:.2f}'.format)
50
- # df[col] = df[col].round(decimals=2)
51
 
52
  # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
53
  # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
 
19
  df = pd.DataFrame.from_records(all_data_json)
20
 
21
  df = df[benchmark_cols]
22
+ print(df.head())
23
 
24
  if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
25
  df = df.dropna(subset=benchmark_cols)
26
  df = df.sort_values(by=[rank_col[0]], ascending=True)
27
  # print(rank_col, benchmark_cols)
28
+ # print(df.head())
29
  else:
30
  # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
31
+ avg_rank = df.iloc[:, 1:].mean(axis=1)
32
  df["Average Rank"] = avg_rank.round(decimals=4)
33
  df = df.sort_values(by=["Average Rank"], ascending=True)
34
  df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
 
44
  # print(col)
45
  # if 'Std dev' in col or 'Score' in col:
46
  if 'Std dev' in col or 'Score' in col:
47
+ # if set(['Chemistry', 'Reasoning']).intersection(set(col.split())):
48
+ # df[col] = (df[col]).map('{:.2f}'.format)
49
+ # else:
50
+ # df[col] = (df[col]*100).map('{:.2f}'.format)
51
+ if "Chemistry" in col or "C++" in col or "Overall" in col or "Probability" in col or "Logical" in col:
52
  df[col] = (df[col]).map('{:.2f}'.format)
53
  else:
54
  df[col] = (df[col]*100).map('{:.2f}'.format)
55
+ df[col] = df[col].round(decimals=2)
56
 
57
  # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
58
  # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
src/results/models_2024-10-18-14:06:13.588399.json ADDED
@@ -0,0 +1,2732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "config": {
4
+ "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
+ "organization": "OpenAI",
6
+ "license": "Proprietary",
7
+ "knowledge_cutoff": "2023/10"
8
+ },
9
+ "results": {
10
+ "OVERALL": {
11
+ "Average Score": 87.33082346779815,
12
+ "Standard Deviation": 1.4853337406399776,
13
+ "Rank": 3
14
+ },
15
+ "Geometry": {
16
+ "Average Score": 0.976028578,
17
+ "Standard Deviation": 0.01507912373,
18
+ "Rank": 3
19
+ },
20
+ "Algebra": {
21
+ "Average Score": 0.951199453,
22
+ "Standard Deviation": 0.08452452108,
23
+ "Rank": 3
24
+ },
25
+ "Probability": {
26
+ "Average Score": 80.1332207690739,
27
+ "Standard Deviation": null,
28
+ "Rank": 7
29
+ },
30
+ "Logical": {
31
+ "Average Score": 84.12975867250425,
32
+ "Standard Deviation": 0.21211547702245045,
33
+ "Rank": 6
34
+ },
35
+ "Social": {
36
+ "Average Score": 0.815902987,
37
+ "Standard Deviation": 0.0196254222,
38
+ "Rank": 3
39
+ },
40
+ "Chemistry": {
41
+ "Average Score": 89.92480228064885,
42
+ "Standard Deviation": null,
43
+ "Rank": 4
44
+ },
45
+ "CPP": {
46
+ "Average Score": 100.0,
47
+ "Standard Deviation": null,
48
+ "Rank": 1
49
+ }
50
+ }
51
+ },
52
+ {
53
+ "config": {
54
+ "model_name": "gpt-4o-2024-08-06",
55
+ "organization": "OpenAI",
56
+ "license": "Proprietary",
57
+ "knowledge_cutoff": "2023/10"
58
+ },
59
+ "results": {
60
+ "OVERALL": {
61
+ "Average Score": 77.7818546246671,
62
+ "Standard Deviation": 2.7097581088879505,
63
+ "Rank": 5
64
+ },
65
+ "Geometry": {
66
+ "Average Score": 0.99773096,
67
+ "Standard Deviation": 0.002835555172,
68
+ "Rank": 1
69
+ },
70
+ "Algebra": {
71
+ "Average Score": 1.0,
72
+ "Standard Deviation": 0.0,
73
+ "Rank": 1
74
+ },
75
+ "Probability": {
76
+ "Average Score": 74.97136205481755,
77
+ "Standard Deviation": null,
78
+ "Rank": 11
79
+ },
80
+ "Logical": {
81
+ "Average Score": 66.0597109743056,
82
+ "Standard Deviation": 1.5021351704575163,
83
+ "Rank": 14
84
+ },
85
+ "Social": {
86
+ "Average Score": 0.680417314,
87
+ "Standard Deviation": 0.00656867063,
88
+ "Rank": 8
89
+ },
90
+ "Chemistry": {
91
+ "Average Score": 82.55189735524202,
92
+ "Standard Deviation": null,
93
+ "Rank": 7
94
+ },
95
+ "CPP": {
96
+ "Average Score": 92.43090226400756,
97
+ "Standard Deviation": null,
98
+ "Rank": 2
99
+ }
100
+ }
101
+ },
102
+ {
103
+ "config": {
104
+ "model_name": "gpt-4o-2024-05-13",
105
+ "organization": "OpenAI",
106
+ "license": "Proprietary",
107
+ "knowledge_cutoff": "2023/10"
108
+ },
109
+ "results": {
110
+ "OVERALL": {
111
+ "Average Score": 72.6093654197998,
112
+ "Standard Deviation": 13.515345690976028,
113
+ "Rank": 10
114
+ },
115
+ "Geometry": {
116
+ "Average Score": 0.972472377,
117
+ "Standard Deviation": 0.01648274205,
118
+ "Rank": 4
119
+ },
120
+ "Algebra": {
121
+ "Average Score": 0.995511298,
122
+ "Standard Deviation": 0.004097802515,
123
+ "Rank": 2
124
+ },
125
+ "Probability": {
126
+ "Average Score": 77.97816201050715,
127
+ "Standard Deviation": null,
128
+ "Rank": 8
129
+ },
130
+ "Logical": {
131
+ "Average Score": 75.65058939137873,
132
+ "Standard Deviation": 0.07522785572103825,
133
+ "Rank": 9
134
+ },
135
+ "Social": {
136
+ "Average Score": 0.609875087,
137
+ "Standard Deviation": 0.038729239,
138
+ "Rank": 13
139
+ },
140
+ "Chemistry": {
141
+ "Average Score": 76.03377031297643,
142
+ "Standard Deviation": null,
143
+ "Rank": 9
144
+ },
145
+ "CPP": {
146
+ "Average Score": 79.1592634699295,
147
+ "Standard Deviation": null,
148
+ "Rank": 6
149
+ }
150
+ }
151
+ },
152
+ {
153
+ "config": {
154
+ "model_name": "gpt-4-turbo-2024-04-09",
155
+ "organization": "OpenAI",
156
+ "license": "Proprietary",
157
+ "knowledge_cutoff": "2023/12"
158
+ },
159
+ "results": {
160
+ "OVERALL": {
161
+ "Average Score": 73.32308543749606,
162
+ "Standard Deviation": 6.562777844134629,
163
+ "Rank": 9
164
+ },
165
+ "Geometry": {
166
+ "Average Score": 0.95374588,
167
+ "Standard Deviation": 0.03109307166,
168
+ "Rank": 5
169
+ },
170
+ "Algebra": {
171
+ "Average Score": 0.930945223,
172
+ "Standard Deviation": 0.06705136813,
173
+ "Rank": 4
174
+ },
175
+ "Probability": {
176
+ "Average Score": 74.97144205445957,
177
+ "Standard Deviation": null,
178
+ "Rank": 12
179
+ },
180
+ "Logical": {
181
+ "Average Score": 76.82291715624933,
182
+ "Standard Deviation": 0.03462548327631355,
183
+ "Rank": 7
184
+ },
185
+ "Social": {
186
+ "Average Score": 0.715935163,
187
+ "Standard Deviation": 0.1209141409,
188
+ "Rank": 6
189
+ },
190
+ "Chemistry": {
191
+ "Average Score": 70.44329321394066,
192
+ "Standard Deviation": null,
193
+ "Rank": 12
194
+ },
195
+ "CPP": {
196
+ "Average Score": 70.73143363230263,
197
+ "Standard Deviation": null,
198
+ "Rank": 11
199
+ }
200
+ }
201
+ },
202
+ {
203
+ "config": {
204
+ "model_name": "gemini-1.5-pro-001",
205
+ "organization": "Google",
206
+ "license": "Proprietary",
207
+ "knowledge_cutoff": "2023/11"
208
+ },
209
+ "results": {
210
+ "OVERALL": {
211
+ "Average Score": 74.27365448117855,
212
+ "Standard Deviation": 3.9515447172901847,
213
+ "Rank": 8
214
+ },
215
+ "Geometry": {
216
+ "Average Score": 0.9947169,
217
+ "Standard Deviation": 0.009150597621,
218
+ "Rank": 2
219
+ },
220
+ "Algebra": {
221
+ "Average Score": 0.857464301,
222
+ "Standard Deviation": 0.05014285338,
223
+ "Rank": 5
224
+ },
225
+ "Probability": {
226
+ "Average Score": 64.77713215500482,
227
+ "Standard Deviation": null,
228
+ "Rank": 15
229
+ },
230
+ "Logical": {
231
+ "Average Score": 74.3275461555815,
232
+ "Standard Deviation": 0.8092355737847541,
233
+ "Rank": 10
234
+ },
235
+ "Social": {
236
+ "Average Score": 0.649601885,
237
+ "Standard Deviation": 0.104854889,
238
+ "Rank": 11
239
+ }
240
+ }
241
+ },
242
+ {
243
+ "config": {
244
+ "model_name": "qwen2-72b-instruct",
245
+ "organization": "Alibaba",
246
+ "license": "Qianwen LICENSE",
247
+ "knowledge_cutoff": "2024/09"
248
+ },
249
+ "results": {
250
+ "OVERALL": {
251
+ "Average Score": 71.00423311357184,
252
+ "Standard Deviation": 1.6189609141983887,
253
+ "Rank": 12
254
+ },
255
+ "Geometry": {
256
+ "Average Score": 0.796870305,
257
+ "Standard Deviation": 0.0509025346,
258
+ "Rank": 9
259
+ },
260
+ "Algebra": {
261
+ "Average Score": 0.836194231,
262
+ "Standard Deviation": 0.04517093028,
263
+ "Rank": 6
264
+ },
265
+ "Probability": {
266
+ "Average Score": 76.33751777233937,
267
+ "Standard Deviation": null,
268
+ "Rank": 10
269
+ },
270
+ "Logical": {
271
+ "Average Score": 61.22020517318166,
272
+ "Standard Deviation": 10.241399997578569,
273
+ "Rank": 17
274
+ },
275
+ "Social": {
276
+ "Average Score": 0.652578786,
277
+ "Standard Deviation": 0.04259293171,
278
+ "Rank": 10
279
+ },
280
+ "Chemistry": {
281
+ "Average Score": 70.44342338869497,
282
+ "Standard Deviation": null,
283
+ "Rank": 12
284
+ },
285
+ "CPP": {
286
+ "Average Score": 73.54037778797029,
287
+ "Standard Deviation": null,
288
+ "Rank": 7
289
+ }
290
+ }
291
+ },
292
+ {
293
+ "config": {
294
+ "model_name": "gpt-4o-mini-2024-07-18",
295
+ "organization": "OpenAI",
296
+ "license": "Proprietary",
297
+ "knowledge_cutoff": "2023/10"
298
+ },
299
+ "results": {
300
+ "OVERALL": {
301
+ "Average Score": 77.35427394420829,
302
+ "Standard Deviation": 3.162321541714492,
303
+ "Rank": 6
304
+ },
305
+ "Geometry": {
306
+ "Average Score": 0.946650435,
307
+ "Standard Deviation": 0.01831236482,
308
+ "Rank": 7
309
+ },
310
+ "Algebra": {
311
+ "Average Score": 0.796243022,
312
+ "Standard Deviation": 0.05537539202,
313
+ "Rank": 7
314
+ },
315
+ "Probability": {
316
+ "Average Score": 77.63972720989734,
317
+ "Standard Deviation": null,
318
+ "Rank": 9
319
+ },
320
+ "Logical": {
321
+ "Average Score": 71.81267717239906,
322
+ "Standard Deviation": 0.3393593163824375,
323
+ "Rank": 11
324
+ },
325
+ "Social": {
326
+ "Average Score": 0.691949855,
327
+ "Standard Deviation": 0.02072934333,
328
+ "Rank": 7
329
+ },
330
+ "Chemistry": {
331
+ "Average Score": 78.10636943659426,
332
+ "Standard Deviation": null,
333
+ "Rank": 8
334
+ },
335
+ "CPP": {
336
+ "Average Score": 88.3877070580296,
337
+ "Standard Deviation": null,
338
+ "Rank": 3
339
+ }
340
+ }
341
+ },
342
+ {
343
+ "config": {
344
+ "model_name": "claude-3.5-sonnet",
345
+ "organization": "Anthropic",
346
+ "license": "Proprietary",
347
+ "knowledge_cutoff": "2024/04"
348
+ },
349
+ "results": {
350
+ "OVERALL": {
351
+ "Average Score": 75.97534774560863,
352
+ "Standard Deviation": 9.237316832705584,
353
+ "Rank": 7
354
+ },
355
+ "Geometry": {
356
+ "Average Score": 0.95316419,
357
+ "Standard Deviation": 0.02081192856,
358
+ "Rank": 6
359
+ },
360
+ "Algebra": {
361
+ "Average Score": 0.759789952,
362
+ "Standard Deviation": 0.02611765096,
363
+ "Rank": 8
364
+ },
365
+ "Probability": {
366
+ "Average Score": 65.4531881044298,
367
+ "Standard Deviation": null,
368
+ "Rank": 14
369
+ },
370
+ "Logical": {
371
+ "Average Score": 76.47424588300288,
372
+ "Standard Deviation": 0.07699328617321737,
373
+ "Rank": 8
374
+ },
375
+ "Social": {
376
+ "Average Score": 0.790002247,
377
+ "Standard Deviation": 0.1007410022,
378
+ "Rank": 4
379
+ },
380
+ "Chemistry": {
381
+ "Average Score": 85.17654674052096,
382
+ "Standard Deviation": null,
383
+ "Rank": 6
384
+ },
385
+ "CPP": {
386
+ "Average Score": 82.37734076815008,
387
+ "Standard Deviation": null,
388
+ "Rank": 5
389
+ }
390
+ }
391
+ },
392
+ {
393
+ "config": {
394
+ "model_name": "o1-mini",
395
+ "organization": "OpenAI",
396
+ "license": "Proprietary",
397
+ "knowledge_cutoff": "2023/10"
398
+ },
399
+ "results": {
400
+ "OVERALL": {
401
+ "Average Score": 87.92989248183513,
402
+ "Standard Deviation": 1.3401058431409953,
403
+ "Rank": 2
404
+ },
405
+ "Geometry": {
406
+ "Average Score": "N/A",
407
+ "Standard Deviation": "N/A",
408
+ "Rank": "N/A"
409
+ },
410
+ "Algebra": {
411
+ "Average Score": "N/A",
412
+ "Standard Deviation": "N/A",
413
+ "Rank": "N/A"
414
+ },
415
+ "Probability": {
416
+ "Average Score": 100.0,
417
+ "Standard Deviation": null,
418
+ "Rank": 1
419
+ },
420
+ "Logical": {
421
+ "Average Score": 99.15920225407733,
422
+ "Standard Deviation": 0.49801294410288666,
423
+ "Rank": 2
424
+ },
425
+ "Social": {
426
+ "Average Score": 0.993974241,
427
+ "Standard Deviation": 0.001996882328,
428
+ "Rank": 2
429
+ }
430
+ }
431
+ },
432
+ {
433
+ "config": {
434
+ "model_name": "o1-preview",
435
+ "organization": "OpenAI",
436
+ "license": "Proprietary",
437
+ "knowledge_cutoff": "2023/10"
438
+ },
439
+ "results": {
440
+ "OVERALL": {
441
+ "Average Score": 85.40247108906188,
442
+ "Standard Deviation": 1.5796898764998464,
443
+ "Rank": 4
444
+ },
445
+ "Geometry": {
446
+ "Average Score": "N/A",
447
+ "Standard Deviation": "N/A",
448
+ "Rank": "N/A"
449
+ },
450
+ "Algebra": {
451
+ "Average Score": "N/A",
452
+ "Standard Deviation": "N/A",
453
+ "Rank": "N/A"
454
+ },
455
+ "Probability": {
456
+ "Average Score": 90.32625019320989,
457
+ "Standard Deviation": null,
458
+ "Rank": 5
459
+ },
460
+ "Logical": {
461
+ "Average Score": 98.18241651273537,
462
+ "Standard Deviation": 0.16231417987288874,
463
+ "Rank": 4
464
+ },
465
+ "Social": {
466
+ "Average Score": 1.0,
467
+ "Standard Deviation": 0.0,
468
+ "Rank": 1
469
+ }
470
+ }
471
+ },
472
+ {
473
+ "config": {
474
+ "model_name": "gemini-1.5-flash-001",
475
+ "organization": "Google",
476
+ "license": "Proprietary",
477
+ "knowledge_cutoff": "2023/11"
478
+ },
479
+ "results": {
480
+ "OVERALL": {
481
+ "Average Score": 67.67997467963976,
482
+ "Standard Deviation": 2.624276751646549,
483
+ "Rank": 13
484
+ },
485
+ "Geometry": {
486
+ "Average Score": 0.804144103,
487
+ "Standard Deviation": 0.1327142178,
488
+ "Rank": 8
489
+ },
490
+ "Algebra": {
491
+ "Average Score": 0.731776765,
492
+ "Standard Deviation": 0.02594657111,
493
+ "Rank": 9
494
+ },
495
+ "Probability": {
496
+ "Average Score": 61.17190439316032,
497
+ "Standard Deviation": null,
498
+ "Rank": 19
499
+ },
500
+ "Logical": {
501
+ "Average Score": 62.284381466778335,
502
+ "Standard Deviation": 3.9592476945909674,
503
+ "Rank": 16
504
+ },
505
+ "Social": {
506
+ "Average Score": 0.555933822,
507
+ "Standard Deviation": 0.1029934524,
508
+ "Rank": 15
509
+ },
510
+ "Chemistry": {
511
+ "Average Score": 70.24726462490831,
512
+ "Standard Deviation": null,
513
+ "Rank": 15
514
+ },
515
+ "CPP": {
516
+ "Average Score": 72.1127762005651,
517
+ "Standard Deviation": null,
518
+ "Rank": 10
519
+ }
520
+ }
521
+ },
522
+ {
523
+ "config": {
524
+ "model_name": "gpt4-1106",
525
+ "organization": "OpenAI",
526
+ "license": "Proprietary",
527
+ "knowledge_cutoff": "2024/04"
528
+ },
529
+ "results": {
530
+ "OVERALL": {
531
+ "Average Score": 72.24829405851214,
532
+ "Standard Deviation": 13.633826990442946,
533
+ "Rank": 11
534
+ },
535
+ "Geometry": {
536
+ "Average Score": 0.71843088,
537
+ "Standard Deviation": 0.04778038294,
538
+ "Rank": 11
539
+ },
540
+ "Algebra": {
541
+ "Average Score": 0.712910417,
542
+ "Standard Deviation": 0.02581828898,
543
+ "Rank": 10
544
+ },
545
+ "Probability": {
546
+ "Average Score": 63.29462909293814,
547
+ "Standard Deviation": null,
548
+ "Rank": 16
549
+ },
550
+ "Logical": {
551
+ "Average Score": 62.987098158883875,
552
+ "Standard Deviation": 4.027795425350514,
553
+ "Rank": 15
554
+ },
555
+ "Social": {
556
+ "Average Score": 0.450609816,
557
+ "Standard Deviation": 0.05208655446,
558
+ "Rank": 21
559
+ },
560
+ "Chemistry": {
561
+ "Average Score": 67.34047237109209,
562
+ "Standard Deviation": null,
563
+ "Rank": 16
564
+ },
565
+ "CPP": {
566
+ "Average Score": 69.11824072252848,
567
+ "Standard Deviation": null,
568
+ "Rank": 12
569
+ }
570
+ }
571
+ },
572
+ {
573
+ "config": {
574
+ "model_name": "gemma-2-27b-it",
575
+ "organization": "Google",
576
+ "license": "Gemma License",
577
+ "knowledge_cutoff": "2024/06"
578
+ },
579
+ "results": {
580
+ "OVERALL": {
581
+ "Average Score": 62.70975283121063,
582
+ "Standard Deviation": 6.376450054715319,
583
+ "Rank": 15
584
+ },
585
+ "Geometry": {
586
+ "Average Score": 0.60112744,
587
+ "Standard Deviation": 0.0469109952,
588
+ "Rank": 17
589
+ },
590
+ "Algebra": {
591
+ "Average Score": 0.687955914,
592
+ "Standard Deviation": 0.01959958192,
593
+ "Rank": 11
594
+ },
595
+ "Probability": {
596
+ "Average Score": 60.04180799425261,
597
+ "Standard Deviation": null,
598
+ "Rank": 20
599
+ },
600
+ "Logical": {
601
+ "Average Score": 60.77082327163094,
602
+ "Standard Deviation": 7.2164902432618625,
603
+ "Rank": 19
604
+ },
605
+ "Social": {
606
+ "Average Score": 0.487844257,
607
+ "Standard Deviation": 0.05857760809,
608
+ "Rank": 18
609
+ },
610
+ "Chemistry": {
611
+ "Average Score": 61.68181926111706,
612
+ "Standard Deviation": null,
613
+ "Rank": 18
614
+ },
615
+ "CPP": {
616
+ "Average Score": 63.28920072143611,
617
+ "Standard Deviation": null,
618
+ "Rank": 14
619
+ }
620
+ }
621
+ },
622
+ {
623
+ "config": {
624
+ "model_name": "claude-3-opus",
625
+ "organization": "Anthropic",
626
+ "license": "Proprietary",
627
+ "knowledge_cutoff": "2023/08"
628
+ },
629
+ "results": {
630
+ "OVERALL": {
631
+ "Average Score": 60.56449573632771,
632
+ "Standard Deviation": 8.485936885427277,
633
+ "Rank": 17
634
+ },
635
+ "Geometry": {
636
+ "Average Score": 0.7215743,
637
+ "Standard Deviation": 0.04712598358,
638
+ "Rank": 10
639
+ },
640
+ "Algebra": {
641
+ "Average Score": 0.68777327,
642
+ "Standard Deviation": 0.02382683713,
643
+ "Rank": 12
644
+ },
645
+ "Probability": {
646
+ "Average Score": 62.296041016641176,
647
+ "Standard Deviation": null,
648
+ "Rank": 17
649
+ },
650
+ "Logical": {
651
+ "Average Score": 68.36295609287292,
652
+ "Standard Deviation": 1.6558271236588655,
653
+ "Rank": 13
654
+ },
655
+ "Social": {
656
+ "Average Score": 0.663410854,
657
+ "Standard Deviation": 0.09540220876,
658
+ "Rank": 9
659
+ },
660
+ "Chemistry": {
661
+ "Average Score": 70.44337273504232,
662
+ "Standard Deviation": null,
663
+ "Rank": 12
664
+ },
665
+ "CPP": {
666
+ "Average Score": 73.5404403567132,
667
+ "Standard Deviation": null,
668
+ "Rank": 8
669
+ }
670
+ }
671
+ },
672
+ {
673
+ "config": {
674
+ "model_name": "gemma-2-9b-it-simpo",
675
+ "organization": "Google",
676
+ "license": "Gemma License",
677
+ "knowledge_cutoff": "2024/07"
678
+ },
679
+ "results": {
680
+ "OVERALL": {
681
+ "Average Score": "N/A",
682
+ "Standard Deviation": "N/A",
683
+ "Rank": "N/A"
684
+ },
685
+ "Geometry": {
686
+ "Average Score": 0.582787508,
687
+ "Standard Deviation": 0.03965204074,
688
+ "Rank": 18
689
+ },
690
+ "Algebra": {
691
+ "Average Score": 0.658648133,
692
+ "Standard Deviation": 0.02565919856,
693
+ "Rank": 13
694
+ },
695
+ "Probability": {
696
+ "Average Score": 57.545408188912894,
697
+ "Standard Deviation": null,
698
+ "Rank": 23
699
+ },
700
+ "Logical": {
701
+ "Average Score": 53.1996479262466,
702
+ "Standard Deviation": 2.690106544431167,
703
+ "Rank": 23
704
+ },
705
+ "Social": {
706
+ "Average Score": 0.635266187,
707
+ "Standard Deviation": 0.03620021751,
708
+ "Rank": 12
709
+ },
710
+ "Chemistry": {
711
+ "Average Score": 74.44267231381626,
712
+ "Standard Deviation": null,
713
+ "Rank": 11
714
+ },
715
+ "CPP": {
716
+ "Average Score": 73.43757596214863,
717
+ "Standard Deviation": null,
718
+ "Rank": 9
719
+ }
720
+ }
721
+ },
722
+ {
723
+ "config": {
724
+ "model_name": "qwen1.5-72b-chat",
725
+ "organization": "Alibaba",
726
+ "license": "Qianwen LICENSE",
727
+ "knowledge_cutoff": "2024/03"
728
+ },
729
+ "results": {
730
+ "OVERALL": {
731
+ "Average Score": 52.983715751652085,
732
+ "Standard Deviation": 3.097613966427763,
733
+ "Rank": 18
734
+ },
735
+ "Geometry": {
736
+ "Average Score": 0.543139301,
737
+ "Standard Deviation": 0.03425202326,
738
+ "Rank": 22
739
+ },
740
+ "Algebra": {
741
+ "Average Score": 0.635228729,
742
+ "Standard Deviation": 0.01944043425,
743
+ "Rank": 14
744
+ },
745
+ "Probability": {
746
+ "Average Score": 52.650033879924905,
747
+ "Standard Deviation": null,
748
+ "Rank": 26
749
+ },
750
+ "Logical": {
751
+ "Average Score": 32.628853250402074,
752
+ "Standard Deviation": 3.227745519436025,
753
+ "Rank": 37
754
+ },
755
+ "Social": {
756
+ "Average Score": 0.415007627,
757
+ "Standard Deviation": 0.03920053159,
758
+ "Rank": 22
759
+ },
760
+ "Chemistry": {
761
+ "Average Score": 47.5126781973184,
762
+ "Standard Deviation": null,
763
+ "Rank": 24
764
+ },
765
+ "CPP": {
766
+ "Average Score": 48.69302376665551,
767
+ "Standard Deviation": null,
768
+ "Rank": 20
769
+ }
770
+ }
771
+ },
772
+ {
773
+ "config": {
774
+ "model_name": "qwen1.5-32b-chat",
775
+ "organization": "Alibaba",
776
+ "license": "Qianwen LICENSE",
777
+ "knowledge_cutoff": "2024/03"
778
+ },
779
+ "results": {
780
+ "OVERALL": {
781
+ "Average Score": 26.978561942890224,
782
+ "Standard Deviation": 1.575986887925592,
783
+ "Rank": 32
784
+ },
785
+ "Geometry": {
786
+ "Average Score": 0.51086835,
787
+ "Standard Deviation": 0.04052471998,
788
+ "Rank": 25
789
+ },
790
+ "Algebra": {
791
+ "Average Score": 0.609003168,
792
+ "Standard Deviation": 0.04874143541,
793
+ "Rank": 15
794
+ },
795
+ "Probability": {
796
+ "Average Score": 49.50617919486678,
797
+ "Standard Deviation": null,
798
+ "Rank": 29
799
+ },
800
+ "Logical": {
801
+ "Average Score": 34.07387941414556,
802
+ "Standard Deviation": 4.616974831074921,
803
+ "Rank": 34
804
+ },
805
+ "Social": {
806
+ "Average Score": 0.380987334,
807
+ "Standard Deviation": 0.03762251776,
808
+ "Rank": 24
809
+ },
810
+ "Chemistry": {
811
+ "Average Score": 44.06627265183811,
812
+ "Standard Deviation": null,
813
+ "Rank": 28
814
+ },
815
+ "CPP": {
816
+ "Average Score": 45.14284028264288,
817
+ "Standard Deviation": null,
818
+ "Rank": 24
819
+ }
820
+ }
821
+ },
822
+ {
823
+ "config": {
824
+ "model_name": "google-gemma-2-9b-it",
825
+ "organization": "Google",
826
+ "license": "Proprietary",
827
+ "knowledge_cutoff": "2024/06"
828
+ },
829
+ "results": {
830
+ "OVERALL": {
831
+ "Average Score": 52.23013018580635,
832
+ "Standard Deviation": 3.3939236141078495,
833
+ "Rank": 19
834
+ },
835
+ "Geometry": {
836
+ "Average Score": 0.575371308,
837
+ "Standard Deviation": 0.03556220251,
838
+ "Rank": 20
839
+ },
840
+ "Algebra": {
841
+ "Average Score": 0.597045661,
842
+ "Standard Deviation": 0.0313828123,
843
+ "Rank": 16
844
+ },
845
+ "Probability": {
846
+ "Average Score": 58.73062101843859,
847
+ "Standard Deviation": null,
848
+ "Rank": 21
849
+ },
850
+ "Logical": {
851
+ "Average Score": 58.01791397899675,
852
+ "Standard Deviation": 5.751983660134971,
853
+ "Rank": 21
854
+ },
855
+ "Social": {
856
+ "Average Score": 0.768337958,
857
+ "Standard Deviation": 0.04078610476,
858
+ "Rank": 5
859
+ },
860
+ "Chemistry": {
861
+ "Average Score": 52.69494515004607,
862
+ "Standard Deviation": null,
863
+ "Rank": 21
864
+ },
865
+ "CPP": {
866
+ "Average Score": 54.03167523687635,
867
+ "Standard Deviation": null,
868
+ "Rank": 17
869
+ }
870
+ }
871
+ },
872
+ {
873
+ "config": {
874
+ "model_name": "yi-1.5-34b-chat",
875
+ "organization": "01 AI",
876
+ "license": "Proprietary",
877
+ "knowledge_cutoff": "2024/05"
878
+ },
879
+ "results": {
880
+ "OVERALL": {
881
+ "Average Score": 62.568637878216464,
882
+ "Standard Deviation": 8.554205798418673,
883
+ "Rank": 16
884
+ },
885
+ "Geometry": {
886
+ "Average Score": 0.566666724,
887
+ "Standard Deviation": 0.04001381658,
888
+ "Rank": 21
889
+ },
890
+ "Algebra": {
891
+ "Average Score": 0.590997292,
892
+ "Standard Deviation": 0.03594087315,
893
+ "Rank": 17
894
+ },
895
+ "Probability": {
896
+ "Average Score": 57.545207891104354,
897
+ "Standard Deviation": null,
898
+ "Rank": 22
899
+ },
900
+ "Logical": {
901
+ "Average Score": 56.598158131627194,
902
+ "Standard Deviation": 1.1072821075127297,
903
+ "Rank": 22
904
+ },
905
+ "Social": {
906
+ "Average Score": 0.516980832,
907
+ "Standard Deviation": 0.03369347985,
908
+ "Rank": 17
909
+ },
910
+ "Chemistry": {
911
+ "Average Score": 50.867343712131174,
912
+ "Standard Deviation": null,
913
+ "Rank": 22
914
+ },
915
+ "CPP": {
916
+ "Average Score": 52.148798061768964,
917
+ "Standard Deviation": null,
918
+ "Rank": 18
919
+ }
920
+ }
921
+ },
922
+ {
923
+ "config": {
924
+ "model_name": "meta-llama-3.1-70b-instruct",
925
+ "organization": "Meta",
926
+ "license": "Llama 3.1 Community",
927
+ "knowledge_cutoff": "2023/12"
928
+ },
929
+ "results": {
930
+ "OVERALL": {
931
+ "Average Score": 65.61302047306724,
932
+ "Standard Deviation": 7.113338386318571,
933
+ "Rank": 14
934
+ },
935
+ "Geometry": {
936
+ "Average Score": 0.76184398,
937
+ "Standard Deviation": 0.01790377984,
938
+ "Rank": 10
939
+ },
940
+ "Algebra": {
941
+ "Average Score": 0.732041699,
942
+ "Standard Deviation": 0.02621439062,
943
+ "Rank": 9
944
+ },
945
+ "Probability": {
946
+ "Average Score": 65.4531285887158,
947
+ "Standard Deviation": null,
948
+ "Rank": 13
949
+ },
950
+ "Logical": {
951
+ "Average Score": 61.16321386785366,
952
+ "Standard Deviation": 0.8920966760646541,
953
+ "Rank": 18
954
+ },
955
+ "Social": {
956
+ "Average Score": 0.45872939,
957
+ "Standard Deviation": 0.05347039576,
958
+ "Rank": 20
959
+ },
960
+ "Chemistry": {
961
+ "Average Score": 76.03374498429748,
962
+ "Standard Deviation": null,
963
+ "Rank": 9
964
+ },
965
+ "CPP": {
966
+ "Average Score": 84.36815192532764,
967
+ "Standard Deviation": null,
968
+ "Rank": 4
969
+ }
970
+ }
971
+ },
972
+ {
973
+ "config": {
974
+ "model_name": "meta-llama-3.1-8b-instruct",
975
+ "organization": "Meta",
976
+ "license": "Llama 3.1 Community",
977
+ "knowledge_cutoff": "2023/12"
978
+ },
979
+ "results": {
980
+ "OVERALL": {
981
+ "Average Score": 48.86242501618216,
982
+ "Standard Deviation": 3.7761459978540257,
983
+ "Rank": 21
984
+ },
985
+ "Geometry": {
986
+ "Average Score": 0.522442162,
987
+ "Standard Deviation": 0.03908236317,
988
+ "Rank": 23
989
+ },
990
+ "Algebra": {
991
+ "Average Score": 0.582702645,
992
+ "Standard Deviation": 0.05002277711,
993
+ "Rank": 18
994
+ },
995
+ "Probability": {
996
+ "Average Score": 52.44179989233465,
997
+ "Standard Deviation": null,
998
+ "Rank": 27
999
+ },
1000
+ "Logical": {
1001
+ "Average Score": 43.3706774850582,
1002
+ "Standard Deviation": 2.820707319899787,
1003
+ "Rank": 28
1004
+ },
1005
+ "Social": {
1006
+ "Average Score": 0.329195941,
1007
+ "Standard Deviation": 0.03925019528,
1008
+ "Rank": 28
1009
+ },
1010
+ "Chemistry": {
1011
+ "Average Score": 43.36264580455019,
1012
+ "Standard Deviation": null,
1013
+ "Rank": 30
1014
+ },
1015
+ "CPP": {
1016
+ "Average Score": 44.41846841004584,
1017
+ "Standard Deviation": null,
1018
+ "Rank": 26
1019
+ }
1020
+ }
1021
+ },
1022
+ {
1023
+ "config": {
1024
+ "model_name": "gpt3.5-turbo-0125",
1025
+ "organization": "OpenAI",
1026
+ "license": "Proprietary",
1027
+ "knowledge_cutoff": "2021/09"
1028
+ },
1029
+ "results": {
1030
+ "OVERALL": {
1031
+ "Average Score": 18.951737690142235,
1032
+ "Standard Deviation": 0.7967088395458379,
1033
+ "Rank": 42
1034
+ },
1035
+ "Geometry": {
1036
+ "Average Score": 0.678714519,
1037
+ "Standard Deviation": 0.05926546762,
1038
+ "Rank": 12
1039
+ },
1040
+ "Algebra": {
1041
+ "Average Score": 0.569296173,
1042
+ "Standard Deviation": 0.05277281097,
1043
+ "Rank": 19
1044
+ },
1045
+ "Probability": {
1046
+ "Average Score": 45.77959177088119,
1047
+ "Standard Deviation": null,
1048
+ "Rank": 30
1049
+ },
1050
+ "Logical": {
1051
+ "Average Score": 17.159084771200394,
1052
+ "Standard Deviation": 2.5845422782742546,
1053
+ "Rank": 48
1054
+ },
1055
+ "Social": {
1056
+ "Average Score": 0.235071541,
1057
+ "Standard Deviation": 0.02632892457,
1058
+ "Rank": 37
1059
+ },
1060
+ "Chemistry": {
1061
+ "Average Score": 39.52885225927276,
1062
+ "Standard Deviation": null,
1063
+ "Rank": 33
1064
+ },
1065
+ "CPP": {
1066
+ "Average Score": 40.46958736582551,
1067
+ "Standard Deviation": null,
1068
+ "Rank": 29
1069
+ }
1070
+ }
1071
+ },
1072
+ {
1073
+ "config": {
1074
+ "model_name": "llama-3-70b-instruct",
1075
+ "organization": "Meta",
1076
+ "license": "Llama 3 Community",
1077
+ "knowledge_cutoff": "2023/12"
1078
+ },
1079
+ "results": {
1080
+ "OVERALL": {
1081
+ "Average Score": 40.57810915454436,
1082
+ "Standard Deviation": 1.3134243733127455,
1083
+ "Rank": 26
1084
+ },
1085
+ "Geometry": {
1086
+ "Average Score": 0.516865529,
1087
+ "Standard Deviation": 0.03858112564,
1088
+ "Rank": 24
1089
+ },
1090
+ "Algebra": {
1091
+ "Average Score": 0.566756531,
1092
+ "Standard Deviation": 0.03369826926,
1093
+ "Rank": 20
1094
+ },
1095
+ "Probability": {
1096
+ "Average Score": 52.64997876875813,
1097
+ "Standard Deviation": null,
1098
+ "Rank": 25
1099
+ },
1100
+ "Logical": {
1101
+ "Average Score": 70.51651844158742,
1102
+ "Standard Deviation": 0.12355022869457871,
1103
+ "Rank": 12
1104
+ },
1105
+ "Social": {
1106
+ "Average Score": 0.45872939,
1107
+ "Standard Deviation": 0.05347039576,
1108
+ "Rank": 20
1109
+ },
1110
+ "Chemistry": {
1111
+ "Average Score": 63.65476403379996,
1112
+ "Standard Deviation": null,
1113
+ "Rank": 17
1114
+ },
1115
+ "CPP": {
1116
+ "Average Score": 65.32140697218945,
1117
+ "Standard Deviation": null,
1118
+ "Rank": 13
1119
+ }
1120
+ }
1121
+ },
1122
+ {
1123
+ "config": {
1124
+ "model_name": "claude-3-sonnet",
1125
+ "organization": "Anthropic",
1126
+ "license": "Proprietary",
1127
+ "knowledge_cutoff": "2023/08"
1128
+ },
1129
+ "results": {
1130
+ "OVERALL": {
1131
+ "Average Score": 52.19088595402735,
1132
+ "Standard Deviation": 3.743258734262917,
1133
+ "Rank": 20
1134
+ },
1135
+ "Geometry": {
1136
+ "Average Score": 0.675613638,
1137
+ "Standard Deviation": 0.05275594408,
1138
+ "Rank": 13
1139
+ },
1140
+ "Algebra": {
1141
+ "Average Score": 0.552025728,
1142
+ "Standard Deviation": 0.04122192409,
1143
+ "Rank": 21
1144
+ },
1145
+ "Probability": {
1146
+ "Average Score": 54.0284459891417,
1147
+ "Standard Deviation": null,
1148
+ "Rank": 24
1149
+ },
1150
+ "Logical": {
1151
+ "Average Score": 58.099761779812475,
1152
+ "Standard Deviation": 7.815595203680491,
1153
+ "Rank": 20
1154
+ },
1155
+ "Social": {
1156
+ "Average Score": 0.570437582,
1157
+ "Standard Deviation": 0.08607040862,
1158
+ "Rank": 14
1159
+ },
1160
+ "Chemistry": {
1161
+ "Average Score": 59.784958090634056,
1162
+ "Standard Deviation": null,
1163
+ "Rank": 19
1164
+ },
1165
+ "CPP": {
1166
+ "Average Score": 61.33538592327427,
1167
+ "Standard Deviation": null,
1168
+ "Rank": 15
1169
+ }
1170
+ }
1171
+ },
1172
+ {
1173
+ "config": {
1174
+ "model_name": "qwen1.5-14b-chat",
1175
+ "organization": "Alibaba",
1176
+ "license": "Qianwen LICENSE",
1177
+ "knowledge_cutoff": "2024/02"
1178
+ },
1179
+ "results": {
1180
+ "OVERALL": {
1181
+ "Average Score": 31.56999734729493,
1182
+ "Standard Deviation": 5.42704987916441,
1183
+ "Rank": 29
1184
+ },
1185
+ "Geometry": {
1186
+ "Average Score": 0.452504016,
1187
+ "Standard Deviation": 0.04225594393,
1188
+ "Rank": 26
1189
+ },
1190
+ "Algebra": {
1191
+ "Average Score": 0.538655725,
1192
+ "Standard Deviation": 0.03721542594,
1193
+ "Rank": 22
1194
+ },
1195
+ "Probability": {
1196
+ "Average Score": 41.027908758027046,
1197
+ "Standard Deviation": null,
1198
+ "Rank": 35
1199
+ },
1200
+ "Logical": {
1201
+ "Average Score": 31.638560769720616,
1202
+ "Standard Deviation": 3.175225377796435,
1203
+ "Rank": 38
1204
+ },
1205
+ "Social": {
1206
+ "Average Score": 0.287370142,
1207
+ "Standard Deviation": 0.04264085315,
1208
+ "Rank": 30
1209
+ },
1210
+ "Chemistry": {
1211
+ "Average Score": 37.667977565724996,
1212
+ "Standard Deviation": null,
1213
+ "Rank": 35
1214
+ },
1215
+ "CPP": {
1216
+ "Average Score": 38.552779976347026,
1217
+ "Standard Deviation": null,
1218
+ "Rank": 31
1219
+ }
1220
+ }
1221
+ },
1222
+ {
1223
+ "config": {
1224
+ "model_name": "claude-3-haiku",
1225
+ "organization": "Anthropic",
1226
+ "license": "Proprietary",
1227
+ "knowledge_cutoff": "2023/08"
1228
+ },
1229
+ "results": {
1230
+ "OVERALL": {
1231
+ "Average Score": 42.975259650014074,
1232
+ "Standard Deviation": 2.248602505751528,
1233
+ "Rank": 25
1234
+ },
1235
+ "Geometry": {
1236
+ "Average Score": 0.607993912,
1237
+ "Standard Deviation": 0.05793460748,
1238
+ "Rank": 15
1239
+ },
1240
+ "Algebra": {
1241
+ "Average Score": 0.520054055,
1242
+ "Standard Deviation": 0.03333544511,
1243
+ "Rank": 23
1244
+ },
1245
+ "Probability": {
1246
+ "Average Score": 52.44184603289214,
1247
+ "Standard Deviation": null,
1248
+ "Rank": 28
1249
+ },
1250
+ "Logical": {
1251
+ "Average Score": 50.38523351226464,
1252
+ "Standard Deviation": 1.9928131873345676,
1253
+ "Rank": 24
1254
+ },
1255
+ "Social": {
1256
+ "Average Score": 0.551083976,
1257
+ "Standard Deviation": 0.05374722539,
1258
+ "Rank": 16
1259
+ },
1260
+ "Chemistry": {
1261
+ "Average Score": 54.99584531372778,
1262
+ "Standard Deviation": null,
1263
+ "Rank": 20
1264
+ },
1265
+ "CPP": {
1266
+ "Average Score": 56.40200048817984,
1267
+ "Standard Deviation": null,
1268
+ "Rank": 16
1269
+ }
1270
+ }
1271
+ },
1272
+ {
1273
+ "config": {
1274
+ "model_name": "claude-2.1",
1275
+ "organization": "Anthropic",
1276
+ "license": "Proprietary",
1277
+ "knowledge_cutoff": "Unknown"
1278
+ },
1279
+ "results": {
1280
+ "OVERALL": {
1281
+ "Average Score": 23.82704986290717,
1282
+ "Standard Deviation": 1.6337262681919007,
1283
+ "Rank": 37
1284
+ },
1285
+ "Geometry": {
1286
+ "Average Score": 0.62752395,
1287
+ "Standard Deviation": 0.07232659398,
1288
+ "Rank": 14
1289
+ },
1290
+ "Algebra": {
1291
+ "Average Score": 0.508849609,
1292
+ "Standard Deviation": 0.0346897465,
1293
+ "Rank": 24
1294
+ },
1295
+ "Probability": {
1296
+ "Average Score": 42.82280874207299,
1297
+ "Standard Deviation": null,
1298
+ "Rank": 32
1299
+ },
1300
+ "Logical": {
1301
+ "Average Score": 47.40647506260718,
1302
+ "Standard Deviation": 3.5140099122016686,
1303
+ "Rank": 25
1304
+ },
1305
+ "Social": {
1306
+ "Average Score": 0.333804568,
1307
+ "Standard Deviation": 0.03775548253,
1308
+ "Rank": 27
1309
+ },
1310
+ "Chemistry": {
1311
+ "Average Score": 46.09889239661357,
1312
+ "Standard Deviation": null,
1313
+ "Rank": 25
1314
+ },
1315
+ "CPP": {
1316
+ "Average Score": 47.23672563994903,
1317
+ "Standard Deviation": null,
1318
+ "Rank": 21
1319
+ }
1320
+ }
1321
+ },
1322
+ {
1323
+ "config": {
1324
+ "model_name": "mistral-8x7b-instruct-v0.1",
1325
+ "organization": "Mistral",
1326
+ "license": "Apache 2.0",
1327
+ "knowledge_cutoff": "2023/12"
1328
+ },
1329
+ "results": {
1330
+ "OVERALL": {
1331
+ "Average Score": 26.279729527476174,
1332
+ "Standard Deviation": 1.7823676900027476,
1333
+ "Rank": 33
1334
+ },
1335
+ "Geometry": {
1336
+ "Average Score": 0.432216097,
1337
+ "Standard Deviation": 0.04747949254,
1338
+ "Rank": 29
1339
+ },
1340
+ "Algebra": {
1341
+ "Average Score": 0.478314888,
1342
+ "Standard Deviation": 0.01998797419,
1343
+ "Rank": 25
1344
+ },
1345
+ "Probability": {
1346
+ "Average Score": 42.27303178662447,
1347
+ "Standard Deviation": null,
1348
+ "Rank": 33
1349
+ },
1350
+ "Logical": {
1351
+ "Average Score": 34.58281320758576,
1352
+ "Standard Deviation": 2.5548927504271073,
1353
+ "Rank": 33
1354
+ },
1355
+ "Social": {
1356
+ "Average Score": 0.251949622,
1357
+ "Standard Deviation": 0.03346674405,
1358
+ "Rank": 35
1359
+ },
1360
+ "Chemistry": {
1361
+ "Average Score": 43.47423835615602,
1362
+ "Standard Deviation": null,
1363
+ "Rank": 29
1364
+ },
1365
+ "CPP": {
1366
+ "Average Score": 44.533118241976666,
1367
+ "Standard Deviation": null,
1368
+ "Rank": 25
1369
+ }
1370
+ }
1371
+ },
1372
+ {
1373
+ "config": {
1374
+ "model_name": "claude-2.0",
1375
+ "organization": "Anthropic",
1376
+ "license": "Proprietary",
1377
+ "knowledge_cutoff": "Unknown"
1378
+ },
1379
+ "results": {
1380
+ "OVERALL": {
1381
+ "Average Score": 20.490629074737296,
1382
+ "Standard Deviation": 0.4821482730133453,
1383
+ "Rank": 40
1384
+ },
1385
+ "Geometry": {
1386
+ "Average Score": 0.604141967,
1387
+ "Standard Deviation": 0.05116441826,
1388
+ "Rank": 16
1389
+ },
1390
+ "Algebra": {
1391
+ "Average Score": 0.474350734,
1392
+ "Standard Deviation": 0.01510393066,
1393
+ "Rank": 26
1394
+ },
1395
+ "Probability": {
1396
+ "Average Score": 45.15580067803421,
1397
+ "Standard Deviation": null,
1398
+ "Rank": 31
1399
+ },
1400
+ "Logical": {
1401
+ "Average Score": 43.65660021552717,
1402
+ "Standard Deviation": 4.959029305063026,
1403
+ "Rank": 27
1404
+ },
1405
+ "Social": {
1406
+ "Average Score": 0.469422836,
1407
+ "Standard Deviation": 0.05999901796,
1408
+ "Rank": 19
1409
+ },
1410
+ "Chemistry": {
1411
+ "Average Score": 49.53201090067431,
1412
+ "Standard Deviation": null,
1413
+ "Rank": 23
1414
+ },
1415
+ "CPP": {
1416
+ "Average Score": 50.773143448036464,
1417
+ "Standard Deviation": null,
1418
+ "Rank": 19
1419
+ }
1420
+ }
1421
+ },
1422
+ {
1423
+ "config": {
1424
+ "model_name": "starling-lm-7b-beta",
1425
+ "organization": "Nexusflow",
1426
+ "license": "Apache-2.0",
1427
+ "knowledge_cutoff": "2024/03"
1428
+ },
1429
+ "results": {
1430
+ "OVERALL": {
1431
+ "Average Score": 43.0415265396966,
1432
+ "Standard Deviation": 0.8770524316858576,
1433
+ "Rank": 24
1434
+ },
1435
+ "Geometry": {
1436
+ "Average Score": 0.446654388,
1437
+ "Standard Deviation": 0.05637864999,
1438
+ "Rank": 28
1439
+ },
1440
+ "Algebra": {
1441
+ "Average Score": 0.473952749,
1442
+ "Standard Deviation": 0.01584301288,
1443
+ "Rank": 27
1444
+ },
1445
+ "Probability": {
1446
+ "Average Score": 41.320066911500234,
1447
+ "Standard Deviation": null,
1448
+ "Rank": 34
1449
+ },
1450
+ "Logical": {
1451
+ "Average Score": 39.79665241383638,
1452
+ "Standard Deviation": 3.4711628274016544,
1453
+ "Rank": 30
1454
+ },
1455
+ "Social": {
1456
+ "Average Score": 0.380021662,
1457
+ "Standard Deviation": 0.04622452748,
1458
+ "Rank": 25
1459
+ },
1460
+ "Chemistry": {
1461
+ "Average Score": 37.39896886078588,
1462
+ "Standard Deviation": null,
1463
+ "Rank": 36
1464
+ },
1465
+ "CPP": {
1466
+ "Average Score": 38.27587102395908,
1467
+ "Standard Deviation": null,
1468
+ "Rank": 32
1469
+ }
1470
+ }
1471
+ },
1472
+ {
1473
+ "config": {
1474
+ "model_name": "gemini-1.0-pro-001",
1475
+ "organization": "Google",
1476
+ "license": "Proprietary",
1477
+ "knowledge_cutoff": "2023/04"
1478
+ },
1479
+ "results": {
1480
+ "OVERALL": {
1481
+ "Average Score": 45.78126809517331,
1482
+ "Standard Deviation": 3.7275133674569783,
1483
+ "Rank": 23
1484
+ },
1485
+ "Geometry": {
1486
+ "Average Score": 0.578347959,
1487
+ "Standard Deviation": 0.04242873607,
1488
+ "Rank": 19
1489
+ },
1490
+ "Algebra": {
1491
+ "Average Score": 0.462417786,
1492
+ "Standard Deviation": 0.01668313635,
1493
+ "Rank": 28
1494
+ },
1495
+ "Probability": {
1496
+ "Average Score": 31.410607001114293,
1497
+ "Standard Deviation": null,
1498
+ "Rank": 42
1499
+ },
1500
+ "Logical": {
1501
+ "Average Score": 21.717362428653246,
1502
+ "Standard Deviation": 4.392290522642325,
1503
+ "Rank": 44
1504
+ },
1505
+ "Social": {
1506
+ "Average Score": 0.130790863,
1507
+ "Standard Deviation": 0.02800188173,
1508
+ "Rank": 45
1509
+ },
1510
+ "Chemistry": {
1511
+ "Average Score": 44.14314678087462,
1512
+ "Standard Deviation": null,
1513
+ "Rank": 27
1514
+ },
1515
+ "CPP": {
1516
+ "Average Score": 45.22204471452975,
1517
+ "Standard Deviation": null,
1518
+ "Rank": 23
1519
+ }
1520
+ }
1521
+ },
1522
+ {
1523
+ "config": {
1524
+ "model_name": "openchat-3.5-0106",
1525
+ "organization": "OpenChat",
1526
+ "license": "Apache-2.0",
1527
+ "knowledge_cutoff": "2024/01"
1528
+ },
1529
+ "results": {
1530
+ "OVERALL": {
1531
+ "Average Score": 23.85666609339201,
1532
+ "Standard Deviation": 1.341285455536348,
1533
+ "Rank": 36
1534
+ },
1535
+ "Geometry": {
1536
+ "Average Score": 0.38715246,
1537
+ "Standard Deviation": 0.03701851946,
1538
+ "Rank": 32
1539
+ },
1540
+ "Algebra": {
1541
+ "Average Score": 0.441233712,
1542
+ "Standard Deviation": 0.01135753754,
1543
+ "Rank": 29
1544
+ },
1545
+ "Probability": {
1546
+ "Average Score": 40.37790468557232,
1547
+ "Standard Deviation": null,
1548
+ "Rank": 36
1549
+ },
1550
+ "Logical": {
1551
+ "Average Score": 35.1573373260624,
1552
+ "Standard Deviation": 2.485128777146724,
1553
+ "Rank": 32
1554
+ },
1555
+ "Social": {
1556
+ "Average Score": 0.250891608,
1557
+ "Standard Deviation": 0.03253769914,
1558
+ "Rank": 36
1559
+ },
1560
+ "Chemistry": {
1561
+ "Average Score": 32.96322247853182,
1562
+ "Standard Deviation": null,
1563
+ "Rank": 37
1564
+ },
1565
+ "CPP": {
1566
+ "Average Score": 33.70639271807677,
1567
+ "Standard Deviation": null,
1568
+ "Rank": 33
1569
+ }
1570
+ }
1571
+ },
1572
+ {
1573
+ "config": {
1574
+ "model_name": "openchat-3.5",
1575
+ "organization": "OpenChat",
1576
+ "license": "Apache-2.0",
1577
+ "knowledge_cutoff": "2023/11"
1578
+ },
1579
+ "results": {
1580
+ "OVERALL": {
1581
+ "Average Score": 23.63538251797928,
1582
+ "Standard Deviation": 2.0516295921862095,
1583
+ "Rank": 38
1584
+ },
1585
+ "Geometry": {
1586
+ "Average Score": 0.401699069,
1587
+ "Standard Deviation": 0.03410726557,
1588
+ "Rank": 30
1589
+ },
1590
+ "Algebra": {
1591
+ "Average Score": 0.414095336,
1592
+ "Standard Deviation": 0.01881964261,
1593
+ "Rank": 31
1594
+ },
1595
+ "Probability": {
1596
+ "Average Score": 36.00454588244476,
1597
+ "Standard Deviation": null,
1598
+ "Rank": 38
1599
+ },
1600
+ "Logical": {
1601
+ "Average Score": 34.029859502735654,
1602
+ "Standard Deviation": 3.354098427500673,
1603
+ "Rank": 35
1604
+ },
1605
+ "Social": {
1606
+ "Average Score": 0.319991655,
1607
+ "Standard Deviation": 0.04502478724,
1608
+ "Rank": 29
1609
+ },
1610
+ "Chemistry": {
1611
+ "Average Score": 32.29778226319944,
1612
+ "Standard Deviation": null,
1613
+ "Rank": 38
1614
+ },
1615
+ "CPP": {
1616
+ "Average Score": 33.020911255646965,
1617
+ "Standard Deviation": null,
1618
+ "Rank": 34
1619
+ }
1620
+ }
1621
+ },
1622
+ {
1623
+ "config": {
1624
+ "model_name": "command-r-(08-2024)",
1625
+ "organization": "Cohere",
1626
+ "license": "CC-BY-NC-4.0",
1627
+ "knowledge_cutoff": "2024/08"
1628
+ },
1629
+ "results": {
1630
+ "OVERALL": {
1631
+ "Average Score": 38.783798277856995,
1632
+ "Standard Deviation": 1.1948096596199191,
1633
+ "Rank": 27
1634
+ },
1635
+ "Geometry": {
1636
+ "Average Score": 0.448300727,
1637
+ "Standard Deviation": 0.04996362328,
1638
+ "Rank": 27
1639
+ },
1640
+ "Algebra": {
1641
+ "Average Score": 0.417519167,
1642
+ "Standard Deviation": 0.01822196902,
1643
+ "Rank": 30
1644
+ },
1645
+ "Probability": {
1646
+ "Average Score": 38.019523941917335,
1647
+ "Standard Deviation": null,
1648
+ "Rank": 37
1649
+ },
1650
+ "Logical": {
1651
+ "Average Score": 23.408826179018206,
1652
+ "Standard Deviation": 0.9355701468205376,
1653
+ "Rank": 42
1654
+ },
1655
+ "Social": {
1656
+ "Average Score": 0.276088379,
1657
+ "Standard Deviation": 0.03295234688,
1658
+ "Rank": 32
1659
+ },
1660
+ "Chemistry": {
1661
+ "Average Score": 38.699171059988636,
1662
+ "Standard Deviation": null,
1663
+ "Rank": 34
1664
+ },
1665
+ "CPP": {
1666
+ "Average Score": 39.61492485677676,
1667
+ "Standard Deviation": null,
1668
+ "Rank": 30
1669
+ }
1670
+ }
1671
+ },
1672
+ {
1673
+ "config": {
1674
+ "model_name": "gemma-1.1-7b-it",
1675
+ "organization": "Google",
1676
+ "license": "Gemma License",
1677
+ "knowledge_cutoff": "2024/02"
1678
+ },
1679
+ "results": {
1680
+ "OVERALL": {
1681
+ "Average Score": 20.965269549151657,
1682
+ "Standard Deviation": 0.6031600560715249,
1683
+ "Rank": 39
1684
+ },
1685
+ "Geometry": {
1686
+ "Average Score": 0.324170977,
1687
+ "Standard Deviation": 0.04668553765,
1688
+ "Rank": 35
1689
+ },
1690
+ "Algebra": {
1691
+ "Average Score": 0.398684697,
1692
+ "Standard Deviation": 0.01982398259,
1693
+ "Rank": 32
1694
+ },
1695
+ "Probability": {
1696
+ "Average Score": 30.98345832281905,
1697
+ "Standard Deviation": null,
1698
+ "Rank": 43
1699
+ },
1700
+ "Logical": {
1701
+ "Average Score": 33.36570116785516,
1702
+ "Standard Deviation": 3.8824795120929765,
1703
+ "Rank": 36
1704
+ },
1705
+ "Social": {
1706
+ "Average Score": 0.179073276,
1707
+ "Standard Deviation": 0.02009658805,
1708
+ "Rank": 41
1709
+ },
1710
+ "Chemistry": {
1711
+ "Average Score": 41.66173653808921,
1712
+ "Standard Deviation": null,
1713
+ "Rank": 31
1714
+ },
1715
+ "CPP": {
1716
+ "Average Score": 42.666504105798204,
1717
+ "Standard Deviation": null,
1718
+ "Rank": 27
1719
+ }
1720
+ }
1721
+ },
1722
+ {
1723
+ "config": {
1724
+ "model_name": "llama3-8b-instruct",
1725
+ "organization": "Meta",
1726
+ "license": "Llama 3 Community",
1727
+ "knowledge_cutoff": "2023/03"
1728
+ },
1729
+ "results": {
1730
+ "OVERALL": {
1731
+ "Average Score": 30.183633696164936,
1732
+ "Standard Deviation": 3.5901082045571266,
1733
+ "Rank": 31
1734
+ },
1735
+ "Geometry": {
1736
+ "Average Score": 0.367143758,
1737
+ "Standard Deviation": 0.04363680358,
1738
+ "Rank": 33
1739
+ },
1740
+ "Algebra": {
1741
+ "Average Score": 0.391480973,
1742
+ "Standard Deviation": 0.02757445266,
1743
+ "Rank": 33
1744
+ },
1745
+ "Probability": {
1746
+ "Average Score": 34.51621975866105,
1747
+ "Standard Deviation": null,
1748
+ "Rank": 39
1749
+ },
1750
+ "Logical": {
1751
+ "Average Score": 45.27560737491475,
1752
+ "Standard Deviation": 4.639305724878496,
1753
+ "Rank": 26
1754
+ },
1755
+ "Social": {
1756
+ "Average Score": 0.336373622,
1757
+ "Standard Deviation": 0.05762408512,
1758
+ "Rank": 26
1759
+ },
1760
+ "Chemistry": {
1761
+ "Average Score": 44.271144265487514,
1762
+ "Standard Deviation": null,
1763
+ "Rank": 26
1764
+ },
1765
+ "CPP": {
1766
+ "Average Score": 45.35392139264795,
1767
+ "Standard Deviation": null,
1768
+ "Rank": 22
1769
+ }
1770
+ }
1771
+ },
1772
+ {
1773
+ "config": {
1774
+ "model_name": "gemma-2-2b-it",
1775
+ "organization": "Google",
1776
+ "license": "Gemma License",
1777
+ "knowledge_cutoff": "2024/07"
1778
+ },
1779
+ "results": {
1780
+ "OVERALL": {
1781
+ "Average Score": 47.37377937645159,
1782
+ "Standard Deviation": 2.72420190928707,
1783
+ "Rank": 22
1784
+ },
1785
+ "Geometry": {
1786
+ "Average Score": 0.395006676,
1787
+ "Standard Deviation": 0.05882607713,
1788
+ "Rank": 31
1789
+ },
1790
+ "Algebra": {
1791
+ "Average Score": 0.379391887,
1792
+ "Standard Deviation": 0.01722410785,
1793
+ "Rank": 34
1794
+ },
1795
+ "Probability": {
1796
+ "Average Score": 33.90530403382374,
1797
+ "Standard Deviation": null,
1798
+ "Rank": 41
1799
+ },
1800
+ "Logical": {
1801
+ "Average Score": 37.64262561604027,
1802
+ "Standard Deviation": 3.0627256408495804,
1803
+ "Rank": 31
1804
+ },
1805
+ "Social": {
1806
+ "Average Score": 0.393482094,
1807
+ "Standard Deviation": 0.06450214024,
1808
+ "Rank": 23
1809
+ },
1810
+ "Chemistry": {
1811
+ "Average Score": 29.883648650177584,
1812
+ "Standard Deviation": null,
1813
+ "Rank": 40
1814
+ },
1815
+ "CPP": {
1816
+ "Average Score": 30.53406933106768,
1817
+ "Standard Deviation": null,
1818
+ "Rank": 36
1819
+ }
1820
+ }
1821
+ },
1822
+ {
1823
+ "config": {
1824
+ "model_name": "starling-lm-7b-alpha",
1825
+ "organization": "Nexusflow",
1826
+ "license": "Apache-2.0",
1827
+ "knowledge_cutoff": "2023/11"
1828
+ },
1829
+ "results": {
1830
+ "OVERALL": {
1831
+ "Average Score": 24.34505731078066,
1832
+ "Standard Deviation": 1.4660872513914562,
1833
+ "Rank": 35
1834
+ },
1835
+ "Geometry": {
1836
+ "Average Score": 0.336782578,
1837
+ "Standard Deviation": 0.04069449132,
1838
+ "Rank": 34
1839
+ },
1840
+ "Algebra": {
1841
+ "Average Score": 0.371551932,
1842
+ "Standard Deviation": 0.03367241745,
1843
+ "Rank": 35
1844
+ },
1845
+ "Probability": {
1846
+ "Average Score": 34.51613212227484,
1847
+ "Standard Deviation": null,
1848
+ "Rank": 40
1849
+ },
1850
+ "Logical": {
1851
+ "Average Score": 29.88612695085449,
1852
+ "Standard Deviation": 2.4070524024678672,
1853
+ "Rank": 40
1854
+ },
1855
+ "Social": {
1856
+ "Average Score": 0.271975534,
1857
+ "Standard Deviation": 0.04266753408,
1858
+ "Rank": 33
1859
+ },
1860
+ "Chemistry": {
1861
+ "Average Score": 29.442057363491365,
1862
+ "Standard Deviation": null,
1863
+ "Rank": 41
1864
+ },
1865
+ "CPP": {
1866
+ "Average Score": 30.07926487356878,
1867
+ "Standard Deviation": null,
1868
+ "Rank": 37
1869
+ }
1870
+ }
1871
+ },
1872
+ {
1873
+ "config": {
1874
+ "model_name": "qwen1.5-4b-chat",
1875
+ "organization": "Alibaba",
1876
+ "license": "Qianwen LICENSE",
1877
+ "knowledge_cutoff": "2024/02"
1878
+ },
1879
+ "results": {
1880
+ "OVERALL": {
1881
+ "Average Score": 7.19753150259024,
1882
+ "Standard Deviation": 0.6175113365944395,
1883
+ "Rank": 52
1884
+ },
1885
+ "Geometry": {
1886
+ "Average Score": 0.215834522,
1887
+ "Standard Deviation": 0.0363766363,
1888
+ "Rank": 39
1889
+ },
1890
+ "Algebra": {
1891
+ "Average Score": 0.305589811,
1892
+ "Standard Deviation": 0.02354198912,
1893
+ "Rank": 36
1894
+ },
1895
+ "Probability": {
1896
+ "Average Score": 15.124506890648007,
1897
+ "Standard Deviation": null,
1898
+ "Rank": 49
1899
+ },
1900
+ "Logical": {
1901
+ "Average Score": 11.67206257803879,
1902
+ "Standard Deviation": 1.140401009846497,
1903
+ "Rank": 51
1904
+ },
1905
+ "Social": {
1906
+ "Average Score": 0.18195615,
1907
+ "Standard Deviation": 0.02269805277,
1908
+ "Rank": 40
1909
+ },
1910
+ "Chemistry": {
1911
+ "Average Score": 12.825435835657133,
1912
+ "Standard Deviation": null,
1913
+ "Rank": 52
1914
+ },
1915
+ "CPP": {
1916
+ "Average Score": 13.21208067122554,
1917
+ "Standard Deviation": null,
1918
+ "Rank": 47
1919
+ }
1920
+ }
1921
+ },
1922
+ {
1923
+ "config": {
1924
+ "model_name": "command-r-(04-2024)",
1925
+ "organization": "Cohere",
1926
+ "license": "CC-BY-NC-4.0",
1927
+ "knowledge_cutoff": "2024/04"
1928
+ },
1929
+ "results": {
1930
+ "OVERALL": {
1931
+ "Average Score": 26.20787727166716,
1932
+ "Standard Deviation": 1.6793980036057201,
1933
+ "Rank": 34
1934
+ },
1935
+ "Geometry": {
1936
+ "Average Score": 0.300416698,
1937
+ "Standard Deviation": 0.03485612736,
1938
+ "Rank": 36
1939
+ },
1940
+ "Algebra": {
1941
+ "Average Score": 0.293120231,
1942
+ "Standard Deviation": 0.032926484,
1943
+ "Rank": 37
1944
+ },
1945
+ "Probability": {
1946
+ "Average Score": 28.551833516483626,
1947
+ "Standard Deviation": null,
1948
+ "Rank": 44
1949
+ },
1950
+ "Logical": {
1951
+ "Average Score": 30.83782425033377,
1952
+ "Standard Deviation": 3.4266833154577383,
1953
+ "Rank": 39
1954
+ },
1955
+ "Social": {
1956
+ "Average Score": 0.283882949,
1957
+ "Standard Deviation": 0.03336901148,
1958
+ "Rank": 31
1959
+ },
1960
+ "Chemistry": {
1961
+ "Average Score": 40.38004181614496,
1962
+ "Standard Deviation": null,
1963
+ "Rank": 32
1964
+ },
1965
+ "CPP": {
1966
+ "Average Score": 41.346336503003236,
1967
+ "Standard Deviation": null,
1968
+ "Rank": 28
1969
+ }
1970
+ }
1971
+ },
1972
+ {
1973
+ "config": {
1974
+ "model_name": "vicuna-33b",
1975
+ "organization": "LMSYS",
1976
+ "license": "Non-commercial",
1977
+ "knowledge_cutoff": "2023/08"
1978
+ },
1979
+ "results": {
1980
+ "OVERALL": {
1981
+ "Average Score": 19.726298678709266,
1982
+ "Standard Deviation": 1.0771354692793496,
1983
+ "Rank": 41
1984
+ },
1985
+ "Geometry": {
1986
+ "Average Score": 0.208284679,
1987
+ "Standard Deviation": 0.03937771461,
1988
+ "Rank": 40
1989
+ },
1990
+ "Algebra": {
1991
+ "Average Score": 0.248994048,
1992
+ "Standard Deviation": 0.02668175054,
1993
+ "Rank": 39
1994
+ },
1995
+ "Probability": {
1996
+ "Average Score": 23.2308538772627,
1997
+ "Standard Deviation": null,
1998
+ "Rank": 47
1999
+ },
2000
+ "Logical": {
2001
+ "Average Score": 19.488409585540122,
2002
+ "Standard Deviation": 0.7913465863319494,
2003
+ "Rank": 46
2004
+ },
2005
+ "Social": {
2006
+ "Average Score": 0.257623798,
2007
+ "Standard Deviation": 0.02653724437,
2008
+ "Rank": 34
2009
+ },
2010
+ "Chemistry": {
2011
+ "Average Score": 27.198874596635843,
2012
+ "Standard Deviation": null,
2013
+ "Rank": 43
2014
+ },
2015
+ "CPP": {
2016
+ "Average Score": 28.01838653090379,
2017
+ "Standard Deviation": null,
2018
+ "Rank": 38
2019
+ }
2020
+ }
2021
+ },
2022
+ {
2023
+ "config": {
2024
+ "model_name": "gemma-7b-it",
2025
+ "organization": "Google",
2026
+ "license": "Gemma License",
2027
+ "knowledge_cutoff": "2024/02"
2028
+ },
2029
+ "results": {
2030
+ "OVERALL": {
2031
+ "Average Score": 18.339626858215343,
2032
+ "Standard Deviation": 0.1553156123023995,
2033
+ "Rank": 43
2034
+ },
2035
+ "Geometry": {
2036
+ "Average Score": 0.244791417,
2037
+ "Standard Deviation": 0.0289612078,
2038
+ "Rank": 37
2039
+ },
2040
+ "Algebra": {
2041
+ "Average Score": 0.250614794,
2042
+ "Standard Deviation": 0.01991678295,
2043
+ "Rank": 38
2044
+ },
2045
+ "Probability": {
2046
+ "Average Score": 18.066869704202595,
2047
+ "Standard Deviation": null,
2048
+ "Rank": 48
2049
+ },
2050
+ "Logical": {
2051
+ "Average Score": 22.446113532575186,
2052
+ "Standard Deviation": 1.1759308097806727,
2053
+ "Rank": 43
2054
+ },
2055
+ "Social": {
2056
+ "Average Score": 0.202138025,
2057
+ "Standard Deviation": 0.02098346639,
2058
+ "Rank": 39
2059
+ },
2060
+ "Chemistry": {
2061
+ "Average Score": 27.195166540671735,
2062
+ "Standard Deviation": null,
2063
+ "Rank": 43
2064
+ },
2065
+ "CPP": {
2066
+ "Average Score": 28.014658234926813,
2067
+ "Standard Deviation": null,
2068
+ "Rank": 39
2069
+ }
2070
+ }
2071
+ },
2072
+ {
2073
+ "config": {
2074
+ "model_name": "mistral-7b-instruct-2",
2075
+ "organization": "Mistral",
2076
+ "license": "Apache 2.0",
2077
+ "knowledge_cutoff": "2023/12"
2078
+ },
2079
+ "results": {
2080
+ "OVERALL": {
2081
+ "Average Score": 32.27919528900069,
2082
+ "Standard Deviation": 2.070593349377193,
2083
+ "Rank": 28
2084
+ },
2085
+ "Geometry": {
2086
+ "Average Score": 0.216402626,
2087
+ "Standard Deviation": 0.03338414918,
2088
+ "Rank": 38
2089
+ },
2090
+ "Algebra": {
2091
+ "Average Score": 0.233777838,
2092
+ "Standard Deviation": 0.0155226054,
2093
+ "Rank": 40
2094
+ },
2095
+ "Probability": {
2096
+ "Average Score": 25.70261650740474,
2097
+ "Standard Deviation": null,
2098
+ "Rank": 45
2099
+ },
2100
+ "Logical": {
2101
+ "Average Score": 26.165635051797608,
2102
+ "Standard Deviation": 1.5009510944001014,
2103
+ "Rank": 41
2104
+ },
2105
+ "Social": {
2106
+ "Average Score": 0.209386782,
2107
+ "Standard Deviation": 0.02738569921,
2108
+ "Rank": 38
2109
+ },
2110
+ "Chemistry": {
2111
+ "Average Score": 30.70773868184025,
2112
+ "Standard Deviation": null,
2113
+ "Rank": 39
2114
+ },
2115
+ "CPP": {
2116
+ "Average Score": 31.382959631870822,
2117
+ "Standard Deviation": null,
2118
+ "Rank": 35
2119
+ }
2120
+ }
2121
+ },
2122
+ {
2123
+ "config": {
2124
+ "model_name": "mistral-7b-instruct-1",
2125
+ "organization": "Mistral",
2126
+ "license": "Apache 2.0",
2127
+ "knowledge_cutoff": "2023/12"
2128
+ },
2129
+ "results": {
2130
+ "OVERALL": {
2131
+ "Average Score": 14.750363553682964,
2132
+ "Standard Deviation": 0.442399072321264,
2133
+ "Rank": 48
2134
+ },
2135
+ "Geometry": {
2136
+ "Average Score": 0.161799938,
2137
+ "Standard Deviation": 0.03595278559,
2138
+ "Rank": 44
2139
+ },
2140
+ "Algebra": {
2141
+ "Average Score": 0.210341624,
2142
+ "Standard Deviation": 0.01736539119,
2143
+ "Rank": 41
2144
+ },
2145
+ "Probability": {
2146
+ "Average Score": 24.69501890202338,
2147
+ "Standard Deviation": null,
2148
+ "Rank": 46
2149
+ },
2150
+ "Logical": {
2151
+ "Average Score": 15.957706802740889,
2152
+ "Standard Deviation": 2.080778273455708,
2153
+ "Rank": 50
2154
+ },
2155
+ "Social": {
2156
+ "Average Score": 0.117646827,
2157
+ "Standard Deviation": 0.009321202779,
2158
+ "Rank": 47
2159
+ },
2160
+ "Chemistry": {
2161
+ "Average Score": 18.375111202411667,
2162
+ "Standard Deviation": null,
2163
+ "Rank": 47
2164
+ },
2165
+ "CPP": {
2166
+ "Average Score": 18.929093202755805,
2167
+ "Standard Deviation": null,
2168
+ "Rank": 42
2169
+ }
2170
+ }
2171
+ },
2172
+ {
2173
+ "config": {
2174
+ "model_name": "vicuna-13b",
2175
+ "organization": "LMSYS",
2176
+ "license": "Non-commercial",
2177
+ "knowledge_cutoff": "2023/07"
2178
+ },
2179
+ "results": {
2180
+ "OVERALL": {
2181
+ "Average Score": 13.302607436757697,
2182
+ "Standard Deviation": 0.570272227659312,
2183
+ "Rank": 50
2184
+ },
2185
+ "Geometry": {
2186
+ "Average Score": 0.200941928,
2187
+ "Standard Deviation": 0.03366817781,
2188
+ "Rank": 41
2189
+ },
2190
+ "Algebra": {
2191
+ "Average Score": 0.196123323,
2192
+ "Standard Deviation": 0.0135715643,
2193
+ "Rank": 42
2194
+ },
2195
+ "Probability": {
2196
+ "Average Score": 15.08476669604627,
2197
+ "Standard Deviation": null,
2198
+ "Rank": 50
2199
+ },
2200
+ "Logical": {
2201
+ "Average Score": 16.548339412104294,
2202
+ "Standard Deviation": 3.443370777556759,
2203
+ "Rank": 49
2204
+ },
2205
+ "Social": {
2206
+ "Average Score": 0.124655135,
2207
+ "Standard Deviation": 0.01122382671,
2208
+ "Rank": 46
2209
+ },
2210
+ "Chemistry": {
2211
+ "Average Score": 21.201173318496842,
2212
+ "Standard Deviation": null,
2213
+ "Rank": 45
2214
+ },
2215
+ "CPP": {
2216
+ "Average Score": 21.840013221590294,
2217
+ "Standard Deviation": null,
2218
+ "Rank": 40
2219
+ }
2220
+ }
2221
+ },
2222
+ {
2223
+ "config": {
2224
+ "model_name": "zephyr-7b-beta",
2225
+ "organization": "HuggingFace",
2226
+ "license": "MIT",
2227
+ "knowledge_cutoff": "2023/10"
2228
+ },
2229
+ "results": {
2230
+ "OVERALL": {
2231
+ "Average Score": 7.378234886105356,
2232
+ "Standard Deviation": 1.1456147261693999,
2233
+ "Rank": 51
2234
+ },
2235
+ "Geometry": {
2236
+ "Average Score": 0.114005544,
2237
+ "Standard Deviation": 0.03144354365,
2238
+ "Rank": 45
2239
+ },
2240
+ "Algebra": {
2241
+ "Average Score": 0.141766633,
2242
+ "Standard Deviation": 0.03179520129,
2243
+ "Rank": 43
2244
+ },
2245
+ "Probability": {
2246
+ "Average Score": 8.92696070171298,
2247
+ "Standard Deviation": null,
2248
+ "Rank": 53
2249
+ },
2250
+ "Logical": {
2251
+ "Average Score": 6.971377981442089,
2252
+ "Standard Deviation": 0.31669853263737413,
2253
+ "Rank": 55
2254
+ },
2255
+ "Social": {
2256
+ "Average Score": 0.0,
2257
+ "Standard Deviation": 0.0,
2258
+ "Rank": 52
2259
+ },
2260
+ "Chemistry": {
2261
+ "Average Score": 18.374948840997902,
2262
+ "Standard Deviation": null,
2263
+ "Rank": 47
2264
+ },
2265
+ "CPP": {
2266
+ "Average Score": 18.92902220864132,
2267
+ "Standard Deviation": null,
2268
+ "Rank": 43
2269
+ }
2270
+ }
2271
+ },
2272
+ {
2273
+ "config": {
2274
+ "model_name": "gemma-1.1-2b-it",
2275
+ "organization": "Google",
2276
+ "license": "Gemma License",
2277
+ "knowledge_cutoff": "2024/02"
2278
+ },
2279
+ "results": {
2280
+ "OVERALL": {
2281
+ "Average Score": 16.083251992757752,
2282
+ "Standard Deviation": 0.7340624884005772,
2283
+ "Rank": 46
2284
+ },
2285
+ "Geometry": {
2286
+ "Average Score": 0.183974034,
2287
+ "Standard Deviation": 0.0215548886,
2288
+ "Rank": 43
2289
+ },
2290
+ "Algebra": {
2291
+ "Average Score": 0.13422252,
2292
+ "Standard Deviation": 0.01922819511,
2293
+ "Rank": 44
2294
+ },
2295
+ "Probability": {
2296
+ "Average Score": 9.992136776217318,
2297
+ "Standard Deviation": null,
2298
+ "Rank": 52
2299
+ },
2300
+ "Logical": {
2301
+ "Average Score": 9.537233946101678,
2302
+ "Standard Deviation": 0.7567112693269967,
2303
+ "Rank": 53
2304
+ },
2305
+ "Social": {
2306
+ "Average Score": 0.167796727,
2307
+ "Standard Deviation": 0.01666541942,
2308
+ "Rank": 42
2309
+ },
2310
+ "Chemistry": {
2311
+ "Average Score": 20.11834233400297,
2312
+ "Standard Deviation": null,
2313
+ "Rank": 46
2314
+ },
2315
+ "CPP": {
2316
+ "Average Score": 20.724691953843916,
2317
+ "Standard Deviation": null,
2318
+ "Rank": 41
2319
+ }
2320
+ }
2321
+ },
2322
+ {
2323
+ "config": {
2324
+ "model_name": "llama2-7b-chat",
2325
+ "organization": "Meta",
2326
+ "license": "Llama 2 Community",
2327
+ "knowledge_cutoff": "2023/07"
2328
+ },
2329
+ "results": {
2330
+ "OVERALL": {
2331
+ "Average Score": 17.319161859655946,
2332
+ "Standard Deviation": 0.495520710612214,
2333
+ "Rank": 45
2334
+ },
2335
+ "Geometry": {
2336
+ "Average Score": 0.087067276,
2337
+ "Standard Deviation": 0.04274343402,
2338
+ "Rank": 46
2339
+ },
2340
+ "Algebra": {
2341
+ "Average Score": 0.12308805,
2342
+ "Standard Deviation": 0.01856053622,
2343
+ "Rank": 45
2344
+ },
2345
+ "Probability": {
2346
+ "Average Score": 8.860911732515305,
2347
+ "Standard Deviation": null,
2348
+ "Rank": 54
2349
+ },
2350
+ "Logical": {
2351
+ "Average Score": 18.812132126028335,
2352
+ "Standard Deviation": 3.0846832107977433,
2353
+ "Rank": 47
2354
+ },
2355
+ "Social": {
2356
+ "Average Score": 0.152905272,
2357
+ "Standard Deviation": 0.007166957097,
2358
+ "Rank": 43
2359
+ },
2360
+ "Chemistry": {
2361
+ "Average Score": 15.270334671133512,
2362
+ "Standard Deviation": null,
2363
+ "Rank": 50
2364
+ },
2365
+ "CPP": {
2366
+ "Average Score": 15.730513733660898,
2367
+ "Standard Deviation": null,
2368
+ "Rank": 45
2369
+ }
2370
+ }
2371
+ },
2372
+ {
2373
+ "config": {
2374
+ "model_name": "gemma-2b-it",
2375
+ "organization": "Google",
2376
+ "license": "Gemma License",
2377
+ "knowledge_cutoff": "2024/02"
2378
+ },
2379
+ "results": {
2380
+ "OVERALL": {
2381
+ "Average Score": 15.029602991101632,
2382
+ "Standard Deviation": 0.4529017602377039,
2383
+ "Rank": 47
2384
+ },
2385
+ "Geometry": {
2386
+ "Average Score": 0.198571153,
2387
+ "Standard Deviation": 0.01699161031,
2388
+ "Rank": 42
2389
+ },
2390
+ "Algebra": {
2391
+ "Average Score": 0.109883009,
2392
+ "Standard Deviation": 0.01520005833,
2393
+ "Rank": 46
2394
+ },
2395
+ "Probability": {
2396
+ "Average Score": 6.561360414966015,
2397
+ "Standard Deviation": null,
2398
+ "Rank": 56
2399
+ },
2400
+ "Logical": {
2401
+ "Average Score": 3.9858662356708785,
2402
+ "Standard Deviation": 0.5609499073366407,
2403
+ "Rank": 56
2404
+ },
2405
+ "Social": {
2406
+ "Average Score": 0.087452913,
2407
+ "Standard Deviation": 0.008170146562,
2408
+ "Rank": 50
2409
+ },
2410
+ "Chemistry": {
2411
+ "Average Score": 16.766144078336097,
2412
+ "Standard Deviation": null,
2413
+ "Rank": 49
2414
+ },
2415
+ "CPP": {
2416
+ "Average Score": 17.2715657115764,
2417
+ "Standard Deviation": null,
2418
+ "Rank": 44
2419
+ }
2420
+ }
2421
+ },
2422
+ {
2423
+ "config": {
2424
+ "model_name": "llama2-13b-chat",
2425
+ "organization": "Meta",
2426
+ "license": "Llama 2 Community",
2427
+ "knowledge_cutoff": "2023/07"
2428
+ },
2429
+ "results": {
2430
+ "OVERALL": {
2431
+ "Average Score": 17.47902371074294,
2432
+ "Standard Deviation": 0.4047581815962028,
2433
+ "Rank": 44
2434
+ },
2435
+ "Geometry": {
2436
+ "Average Score": 0.072729954,
2437
+ "Standard Deviation": 0.02315988261,
2438
+ "Rank": 48
2439
+ },
2440
+ "Algebra": {
2441
+ "Average Score": 0.080371692,
2442
+ "Standard Deviation": 0.01277569453,
2443
+ "Rank": 47
2444
+ },
2445
+ "Probability": {
2446
+ "Average Score": 12.738302754764042,
2447
+ "Standard Deviation": null,
2448
+ "Rank": 51
2449
+ },
2450
+ "Logical": {
2451
+ "Average Score": 21.708359515217182,
2452
+ "Standard Deviation": 1.4862481594434973,
2453
+ "Rank": 45
2454
+ },
2455
+ "Social": {
2456
+ "Average Score": 0.149125922,
2457
+ "Standard Deviation": 0.01157416827,
2458
+ "Rank": 44
2459
+ },
2460
+ "Chemistry": {
2461
+ "Average Score": 12.786967781868814,
2462
+ "Standard Deviation": null,
2463
+ "Rank": 53
2464
+ },
2465
+ "CPP": {
2466
+ "Average Score": 13.17258252933903,
2467
+ "Standard Deviation": null,
2468
+ "Rank": 48
2469
+ }
2470
+ }
2471
+ },
2472
+ {
2473
+ "config": {
2474
+ "model_name": "vicuna-7b",
2475
+ "organization": "LMSYS",
2476
+ "license": "Non-commercial",
2477
+ "knowledge_cutoff": "2023/07"
2478
+ },
2479
+ "results": {
2480
+ "OVERALL": {
2481
+ "Average Score": 13.31896682669754,
2482
+ "Standard Deviation": 0.30441157156016124,
2483
+ "Rank": 49
2484
+ },
2485
+ "Geometry": {
2486
+ "Average Score": 0.083457058,
2487
+ "Standard Deviation": 0.02520989111,
2488
+ "Rank": 47
2489
+ },
2490
+ "Algebra": {
2491
+ "Average Score": 0.070883882,
2492
+ "Standard Deviation": 0.007315853253,
2493
+ "Rank": 48
2494
+ },
2495
+ "Probability": {
2496
+ "Average Score": 8.255246380068842,
2497
+ "Standard Deviation": null,
2498
+ "Rank": 55
2499
+ },
2500
+ "Logical": {
2501
+ "Average Score": 10.046676845257544,
2502
+ "Standard Deviation": 0.6816182835206797,
2503
+ "Rank": 52
2504
+ },
2505
+ "Social": {
2506
+ "Average Score": 0.111076414,
2507
+ "Standard Deviation": 0.004805626512,
2508
+ "Rank": 48
2509
+ },
2510
+ "Chemistry": {
2511
+ "Average Score": 13.838150481781991,
2512
+ "Standard Deviation": null,
2513
+ "Rank": 51
2514
+ },
2515
+ "CPP": {
2516
+ "Average Score": 14.255194156624162,
2517
+ "Standard Deviation": null,
2518
+ "Rank": 46
2519
+ }
2520
+ }
2521
+ },
2522
+ {
2523
+ "config": {
2524
+ "model_name": "koala-13b",
2525
+ "organization": "UC Berkeley",
2526
+ "license": "Non-commercial",
2527
+ "knowledge_cutoff": "2023/04"
2528
+ },
2529
+ "results": {
2530
+ "OVERALL": {
2531
+ "Average Score": 6.419305623111718,
2532
+ "Standard Deviation": 0.19611070515647736,
2533
+ "Rank": 53
2534
+ },
2535
+ "Geometry": {
2536
+ "Average Score": 0.017374001,
2537
+ "Standard Deviation": 0.01747053557,
2538
+ "Rank": 49
2539
+ },
2540
+ "Algebra": {
2541
+ "Average Score": 0.018129197,
2542
+ "Standard Deviation": 0.01054371383,
2543
+ "Rank": 49
2544
+ },
2545
+ "Probability": {
2546
+ "Average Score": 4.1717283559090035,
2547
+ "Standard Deviation": null,
2548
+ "Rank": 57
2549
+ },
2550
+ "Logical": {
2551
+ "Average Score": 7.484701131693112,
2552
+ "Standard Deviation": 0.172417770163525,
2553
+ "Rank": 54
2554
+ },
2555
+ "Social": {
2556
+ "Average Score": 0.096983835,
2557
+ "Standard Deviation": 0.007847059783,
2558
+ "Rank": 49
2559
+ },
2560
+ "Chemistry": {
2561
+ "Average Score": 6.177985738164252,
2562
+ "Standard Deviation": null,
2563
+ "Rank": 54
2564
+ },
2565
+ "CPP": {
2566
+ "Average Score": 6.36433272373514,
2567
+ "Standard Deviation": null,
2568
+ "Rank": 49
2569
+ }
2570
+ }
2571
+ },
2572
+ {
2573
+ "config": {
2574
+ "model_name": "openassistant-pythia-12b",
2575
+ "organization": "OpenAssistant",
2576
+ "license": "Non-commercial",
2577
+ "knowledge_cutoff": "2023/04"
2578
+ },
2579
+ "results": {
2580
+ "OVERALL": {
2581
+ "Average Score": 0.0,
2582
+ "Standard Deviation": 0.0,
2583
+ "Rank": 54
2584
+ },
2585
+ "Geometry": {
2586
+ "Average Score": 0.0,
2587
+ "Standard Deviation": 0.0,
2588
+ "Rank": 50
2589
+ },
2590
+ "Algebra": {
2591
+ "Average Score": 0.0,
2592
+ "Standard Deviation": 0.0,
2593
+ "Rank": 50
2594
+ },
2595
+ "Probability": {
2596
+ "Average Score": 0.0,
2597
+ "Standard Deviation": null,
2598
+ "Rank": 58
2599
+ },
2600
+ "Logical": {
2601
+ "Average Score": 0.0,
2602
+ "Standard Deviation": 0.0,
2603
+ "Rank": 57
2604
+ },
2605
+ "Social": {
2606
+ "Average Score": 0.030792528,
2607
+ "Standard Deviation": 0.007518796391,
2608
+ "Rank": 51
2609
+ },
2610
+ "Chemistry": {
2611
+ "Average Score": 0.0,
2612
+ "Standard Deviation": null,
2613
+ "Rank": 55
2614
+ },
2615
+ "CPP": {
2616
+ "Average Score": 0.0,
2617
+ "Standard Deviation": null,
2618
+ "Rank": 50
2619
+ }
2620
+ }
2621
+ },
2622
+ {
2623
+ "config": {
2624
+ "model_name": "nemotron-70b",
2625
+ "organization": "NVIDIA",
2626
+ "license": "Unknown",
2627
+ "knowledge_cutoff": "Unknown"
2628
+ },
2629
+ "results": {
2630
+ "OVERALL": {
2631
+ "Average Score": 100.0,
2632
+ "Standard Deviation": 0.0,
2633
+ "Rank": 1
2634
+ },
2635
+ "Chemistry": {
2636
+ "Average Score": 96.00601450276388,
2637
+ "Standard Deviation": null,
2638
+ "Rank": 3
2639
+ },
2640
+ "Logical": {
2641
+ "Average Score": 98.08807085219765,
2642
+ "Standard Deviation": 0.832489959144682,
2643
+ "Rank": 5
2644
+ },
2645
+ "Probability": {
2646
+ "Average Score": 91.16755514126538,
2647
+ "Standard Deviation": null,
2648
+ "Rank": 4
2649
+ }
2650
+ }
2651
+ },
2652
+ {
2653
+ "config": {
2654
+ "model_name": "llama-3.2-3b-it",
2655
+ "organization": "Meta",
2656
+ "license": "Llama 3 Community",
2657
+ "knowledge_cutoff": "Unknown"
2658
+ },
2659
+ "results": {
2660
+ "OVERALL": {
2661
+ "Average Score": 30.40742747938681,
2662
+ "Standard Deviation": 1.6816556668351852,
2663
+ "Rank": 30
2664
+ },
2665
+ "Chemistry": {
2666
+ "Average Score": 27.43049468475638,
2667
+ "Standard Deviation": null,
2668
+ "Rank": 42
2669
+ },
2670
+ "Logical": {
2671
+ "Average Score": 41.58905844173492,
2672
+ "Standard Deviation": 5.2798221527591,
2673
+ "Rank": 29
2674
+ },
2675
+ "Probability": {
2676
+ "Average Score": 62.02868227997844,
2677
+ "Standard Deviation": null,
2678
+ "Rank": 18
2679
+ }
2680
+ }
2681
+ },
2682
+ {
2683
+ "config": {
2684
+ "model_name": "yi-lightning",
2685
+ "organization": "01 AI",
2686
+ "license": "Proprietary",
2687
+ "knowledge_cutoff": "Unknown"
2688
+ },
2689
+ "results": {
2690
+ "Chemistry": {
2691
+ "Average Score": 100.0,
2692
+ "Standard Deviation": null,
2693
+ "Rank": 1
2694
+ },
2695
+ "Logical": {
2696
+ "Average Score": 98.816765663456,
2697
+ "Standard Deviation": 0.3271335810663529,
2698
+ "Rank": 3
2699
+ },
2700
+ "Probability": {
2701
+ "Average Score": 95.8842044402052,
2702
+ "Standard Deviation": null,
2703
+ "Rank": 2
2704
+ }
2705
+ }
2706
+ },
2707
+ {
2708
+ "config": {
2709
+ "model_name": "glm-4-plus",
2710
+ "organization": "Zhipu AI",
2711
+ "license": "Proprietary",
2712
+ "knowledge_cutoff": "Unknown"
2713
+ },
2714
+ "results": {
2715
+ "Chemistry": {
2716
+ "Average Score": 99.05822908668402,
2717
+ "Standard Deviation": null,
2718
+ "Rank": 2
2719
+ },
2720
+ "Logical": {
2721
+ "Average Score": 99.45307787995229,
2722
+ "Standard Deviation": 0.5982476107949444,
2723
+ "Rank": 1
2724
+ },
2725
+ "Probability": {
2726
+ "Average Score": 92.04426702796823,
2727
+ "Standard Deviation": null,
2728
+ "Rank": 3
2729
+ }
2730
+ }
2731
+ }
2732
+ ]