yzabc007 committed
Commit 8ef75a7
1 Parent(s): 92d7d3c
app.py CHANGED
@@ -105,8 +105,9 @@ def init_leaderboard(dataframe):
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
  # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
107
  # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
108
- model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
109
  # model_result_path = "./src/results/models_2024-10-18-14:06:13.588399.json"
 
110
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
111
 
112
 
@@ -156,7 +157,7 @@ with demo:
156
  with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
157
 
158
  DESCRIPTION_TEXT = """
159
- Total #models: 53 (Last updated: 2024-10-09)
160
 
161
  This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
162
  (Missing values are due to slow or problematic model responses and will be fixed soon.)
@@ -182,7 +183,7 @@ with demo:
182
  )
183
  )
184
 
185
- with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
186
  DESCRIPTION_TEXT = """
187
  Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
188
  We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
@@ -190,21 +191,23 @@ with demo:
190
  """
191
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
192
 
193
- leaderboard = overall_leaderboard(
194
- get_model_leaderboard_df(
195
- model_result_path,
196
- benchmark_cols=[
197
- AutoEvalColumn.rank_overall.name,
198
- AutoEvalColumn.model.name,
199
- AutoEvalColumn.score_overall.name,
200
- AutoEvalColumn.sd_overall.name,
201
- AutoEvalColumn.license.name,
202
- AutoEvalColumn.organization.name,
203
- AutoEvalColumn.knowledge_cutoff.name,
204
- ],
205
- rank_col=[AutoEvalColumn.rank_overall.name],
206
- ))
 
207
 
 
208
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
209
  DESCRIPTION_TEXT="""
210
  Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
@@ -223,7 +226,22 @@ with demo:
223
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
224
 
225
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
226
- with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=0, elem_classes="subtab"):
227
  leaderboard = overall_leaderboard(
228
  get_model_leaderboard_df(
229
  model_result_path,
@@ -231,7 +249,7 @@ with demo:
231
  AutoEvalColumn.rank_math_algebra.name,
232
  AutoEvalColumn.model.name,
233
  AutoEvalColumn.score_math_algebra.name,
234
- AutoEvalColumn.sd_math_algebra.name,
235
  AutoEvalColumn.license.name,
236
  AutoEvalColumn.organization.name,
237
  AutoEvalColumn.knowledge_cutoff.name,
@@ -240,7 +258,7 @@ with demo:
240
  )
241
  )
242
 
243
- with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=1, elem_classes="subtab"):
244
  leaderboard = overall_leaderboard(
245
  get_model_leaderboard_df(
246
  model_result_path,
@@ -248,7 +266,7 @@ with demo:
248
  AutoEvalColumn.rank_math_geometry.name,
249
  AutoEvalColumn.model.name,
250
  AutoEvalColumn.score_math_geometry.name,
251
- AutoEvalColumn.sd_math_geometry.name,
252
  AutoEvalColumn.license.name,
253
  AutoEvalColumn.organization.name,
254
  AutoEvalColumn.knowledge_cutoff.name,
@@ -257,7 +275,7 @@ with demo:
257
  )
258
  )
259
 
260
- with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=2, elem_classes="subtab"):
261
  leaderboard = overall_leaderboard(
262
  get_model_leaderboard_df(
263
  model_result_path,
@@ -265,7 +283,7 @@ with demo:
265
  AutoEvalColumn.rank_math_probability.name,
266
  AutoEvalColumn.model.name,
267
  AutoEvalColumn.score_math_probability.name,
268
- AutoEvalColumn.sd_math_probability.name,
269
  AutoEvalColumn.license.name,
270
  AutoEvalColumn.organization.name,
271
  AutoEvalColumn.knowledge_cutoff.name,
@@ -299,7 +317,20 @@ with demo:
299
  """
300
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
301
 
302
- with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=0, elem_classes="subtab"):
303
  leaderboard = overall_leaderboard(
304
  get_model_leaderboard_df(
305
  model_result_path,
@@ -307,7 +338,7 @@ with demo:
307
  AutoEvalColumn.rank_reason_logical.name,
308
  AutoEvalColumn.model.name,
309
  AutoEvalColumn.score_reason_logical.name,
310
- AutoEvalColumn.sd_reason_logical.name,
311
  AutoEvalColumn.license.name,
312
  AutoEvalColumn.organization.name,
313
  AutoEvalColumn.knowledge_cutoff.name,
@@ -316,7 +347,7 @@ with demo:
316
  )
317
  )
318
 
319
- with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=1, elem_classes="subtab"):
320
  leaderboard = overall_leaderboard(
321
  get_model_leaderboard_df(
322
  model_result_path,
@@ -324,7 +355,7 @@ with demo:
324
  AutoEvalColumn.rank_reason_social.name,
325
  AutoEvalColumn.model.name,
326
  AutoEvalColumn.score_reason_social.name,
327
- AutoEvalColumn.sd_reason_social.name,
328
  AutoEvalColumn.license.name,
329
  AutoEvalColumn.organization.name,
330
  AutoEvalColumn.knowledge_cutoff.name,
@@ -348,7 +379,19 @@ with demo:
348
  """
349
  gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
350
 
351
- with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
352
  leaderboard = overall_leaderboard(
353
  get_model_leaderboard_df(
354
  model_result_path,
 
105
  # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
106
  # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
107
  # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
108
+ # model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
109
  # model_result_path = "./src/results/models_2024-10-18-14:06:13.588399.json"
110
+ model_result_path = "./src/results/models_2024-10-20-23:34:57.242641.json"
111
  # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
112
 
113
 
 
157
  with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
158
 
159
  DESCRIPTION_TEXT = """
160
+ Total #models: 57 (Last updated: 2024-10-21)
161
 
162
  This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks.
163
  (Missing values are due to slow or problematic model responses and will be fixed soon.)
 
183
  )
184
  )
185
 
186
+ with gr.TabItem("🎯 Mixed", elem_id="llm-benchmark-tab-table", id=1):
187
  DESCRIPTION_TEXT = """
188
  Overall dimension measures the comprehensive performance of LLMs across diverse tasks.
189
  We start with diverse questions from the widely-used [MT-Bench](https://arxiv.org/abs/2306.05685),
 
191
  """
192
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
193
 
194
+ with gr.TabItem("MT-Bench", elem_id="mt-bench_subtab", id=0, elem_classes="subtab"):
195
+ leaderboard = overall_leaderboard(
196
+ get_model_leaderboard_df(
197
+ model_result_path,
198
+ benchmark_cols=[
199
+ AutoEvalColumn.rank_overall.name,
200
+ AutoEvalColumn.model.name,
201
+ AutoEvalColumn.score_overall.name,
202
+ AutoEvalColumn.sd_overall.name,
203
+ AutoEvalColumn.license.name,
204
+ AutoEvalColumn.organization.name,
205
+ AutoEvalColumn.knowledge_cutoff.name,
206
+ ],
207
+ rank_col=[AutoEvalColumn.rank_overall.name],
208
+ ))
209
 
210
+
211
  with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
212
  DESCRIPTION_TEXT="""
213
  Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
 
226
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
227
 
228
  # leaderboard = init_leaderboard(LEADERBOARD_DF)
229
+ with gr.TabItem("Overall", elem_id="math_overall_subtab", id=0, elem_classes="subtab"):
230
+ leaderboard = overall_leaderboard(
231
+ get_model_leaderboard_df(
232
+ model_result_path,
233
+ benchmark_cols=[
234
+ AutoEvalColumn.model.name,
235
+ AutoEvalColumn.rank_math_algebra.name,
236
+ AutoEvalColumn.rank_math_geometry.name,
237
+ AutoEvalColumn.rank_math_probability.name,
238
+ ],
239
+ rank_col=[],
240
+ )
241
+ )
242
+
243
+
244
+ with gr.TabItem("🧮 Algebra", elem_id="algebra_subtab", id=1, elem_classes="subtab"):
245
  leaderboard = overall_leaderboard(
246
  get_model_leaderboard_df(
247
  model_result_path,
 
249
  AutoEvalColumn.rank_math_algebra.name,
250
  AutoEvalColumn.model.name,
251
  AutoEvalColumn.score_math_algebra.name,
252
+ # AutoEvalColumn.sd_math_algebra.name,
253
  AutoEvalColumn.license.name,
254
  AutoEvalColumn.organization.name,
255
  AutoEvalColumn.knowledge_cutoff.name,
 
258
  )
259
  )
260
 
261
+ with gr.TabItem("📐 Geometry", elem_id="geometry_subtab", id=2, elem_classes="subtab"):
262
  leaderboard = overall_leaderboard(
263
  get_model_leaderboard_df(
264
  model_result_path,
 
266
  AutoEvalColumn.rank_math_geometry.name,
267
  AutoEvalColumn.model.name,
268
  AutoEvalColumn.score_math_geometry.name,
269
+ # AutoEvalColumn.sd_math_geometry.name,
270
  AutoEvalColumn.license.name,
271
  AutoEvalColumn.organization.name,
272
  AutoEvalColumn.knowledge_cutoff.name,
 
275
  )
276
  )
277
 
278
+ with gr.TabItem("📊 Probability", elem_id="prob_subtab", id=3, elem_classes="subtab"):
279
  leaderboard = overall_leaderboard(
280
  get_model_leaderboard_df(
281
  model_result_path,
 
283
  AutoEvalColumn.rank_math_probability.name,
284
  AutoEvalColumn.model.name,
285
  AutoEvalColumn.score_math_probability.name,
286
+ # AutoEvalColumn.sd_math_probability.name,
287
  AutoEvalColumn.license.name,
288
  AutoEvalColumn.organization.name,
289
  AutoEvalColumn.knowledge_cutoff.name,
 
317
  """
318
  gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
319
 
320
+ with gr.TabItem("Overall", elem_id="reasoning_overall_subtab", id=0, elem_classes="subtab"):
321
+ leaderboard = overall_leaderboard(
322
+ get_model_leaderboard_df(
323
+ model_result_path,
324
+ benchmark_cols=[
325
+ AutoEvalColumn.model.name,
326
+ AutoEvalColumn.rank_reason_logical.name,
327
+ AutoEvalColumn.rank_reason_social.name,
328
+ ],
329
+ rank_col=[],
330
+ )
331
+ )
332
+
333
+ with gr.TabItem("🧩 Logical", elem_id="logical_subtab", id=1, elem_classes="subtab"):
334
  leaderboard = overall_leaderboard(
335
  get_model_leaderboard_df(
336
  model_result_path,
 
338
  AutoEvalColumn.rank_reason_logical.name,
339
  AutoEvalColumn.model.name,
340
  AutoEvalColumn.score_reason_logical.name,
341
+ # AutoEvalColumn.sd_reason_logical.name,
342
  AutoEvalColumn.license.name,
343
  AutoEvalColumn.organization.name,
344
  AutoEvalColumn.knowledge_cutoff.name,
 
347
  )
348
  )
349
 
350
+ with gr.TabItem("🗣️ Social", elem_id="social_subtab", id=2, elem_classes="subtab"):
351
  leaderboard = overall_leaderboard(
352
  get_model_leaderboard_df(
353
  model_result_path,
 
355
  AutoEvalColumn.rank_reason_social.name,
356
  AutoEvalColumn.model.name,
357
  AutoEvalColumn.score_reason_social.name,
358
+ # AutoEvalColumn.sd_reason_social.name,
359
  AutoEvalColumn.license.name,
360
  AutoEvalColumn.organization.name,
361
  AutoEvalColumn.knowledge_cutoff.name,
 
379
  """
380
  gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
381
 
382
+ with gr.TabItem("Overall", elem_id="science_overall_subtab", id=0, elem_classes="subtab"):
383
+ leaderboard = overall_leaderboard(
384
+ get_model_leaderboard_df(
385
+ model_result_path,
386
+ benchmark_cols=[
387
+ AutoEvalColumn.model.name,
388
+ AutoEvalColumn.rank_chemistry.name,
389
+ ],
390
+ rank_col=[],
391
+ )
392
+ )
393
+
394
+ with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=1, elem_classes="subtab"):
395
  leaderboard = overall_leaderboard(
396
  get_model_leaderboard_df(
397
  model_result_path,
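
Every per-dimension sub-tab added in app.py follows the same call shape: the rank column first, then the model/score/metadata columns in benchmark_cols, and the same rank column repeated in rank_col so the table is sorted by that single dimension. As a hypothetical sketch (not part of this commit), a Physics sub-tab built on the new columns declared in src/display/utils.py would look like the following, assuming the same imports and model_result_path as in app.py:

# Hypothetical sketch: a Physics sub-tab following the pattern of the
# Algebra/Geometry/Probability sub-tabs above; not part of this commit.
with gr.TabItem("Physics", elem_id="physics_subtab", id=2, elem_classes="subtab"):
    leaderboard = overall_leaderboard(
        get_model_leaderboard_df(
            model_result_path,
            benchmark_cols=[
                AutoEvalColumn.rank_physics.name,       # sort key, shown first
                AutoEvalColumn.model.name,
                AutoEvalColumn.score_physics.name,
                AutoEvalColumn.license.name,
                AutoEvalColumn.organization.name,
                AutoEvalColumn.knowledge_cutoff.name,
            ],
            rank_col=[AutoEvalColumn.rank_physics.name],  # single-dimension ranking
        )
    )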
src/display/utils.py CHANGED
@@ -64,35 +64,48 @@ auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=l
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
66
  # fine-grained dimensions
67
- auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
68
- auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
69
- auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
70
- auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Probability)", "number", True))])
71
- auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Logical Reasoning)", "number", True))])
72
- auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Social Reasoning)", "number", True))])
73
 
74
- auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev(Overall)", "number", True))])
75
  auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Algebra)", "number", True))])
76
- auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Geometry)", "number", True))])
77
- auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Probability)", "number", True))])
78
- auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Logical Reasoning)", "number", True))])
79
- auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Social Reasoning)", "number", True))])
80
-
81
- auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Overall)", "number", True))])
82
  auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Algebra)", "number", True))])
 
83
  auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Geometry)", "number", True))])
 
84
  auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Probability)", "number", True))])
 
85
  auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
 
86
  auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])
87
 
88
  auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
89
  auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
90
  auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])
91
 
92
  auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
93
  auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
94
  auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
95
 
 
96
  for task in Tasks:
97
  auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
98
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
 
64
  auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
65
 
66
  # fine-grained dimensions
67
+ auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (MT-Bench)", "number", True))])
68
+ auto_eval_column_dict.append(["sd_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev(MT-Bench)", "number", True))])
69
+ auto_eval_column_dict.append(["rank_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (MT-Bench)", "number", True))])
70
 
71
+ auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
72
  auto_eval_column_dict.append(["sd_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Algebra)", "number", True))])
73
  auto_eval_column_dict.append(["rank_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Algebra)", "number", True))])
74
+
75
+ auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
76
+ auto_eval_column_dict.append(["sd_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Geometry)", "number", True))])
77
  auto_eval_column_dict.append(["rank_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Geometry)", "number", True))])
78
+
79
+ auto_eval_column_dict.append(["score_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Probability)", "number", True))])
80
+ auto_eval_column_dict.append(["sd_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Math Probability)", "number", True))])
81
  auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Math Probability)", "number", True))])
82
+
83
+ auto_eval_column_dict.append(["score_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Logical Reasoning)", "number", True))])
84
+ auto_eval_column_dict.append(["sd_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Logical Reasoning)", "number", True))])
85
  auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
86
+
87
+ auto_eval_column_dict.append(["score_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Social Reasoning)", "number", True))])
88
+ auto_eval_column_dict.append(["sd_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Social Reasoning)", "number", True))])
89
  auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])
90
 
91
  auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
92
  auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
93
  auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])
94
 
95
+ auto_eval_column_dict.append(["score_physics", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Physics)", "number", True))])
96
+ auto_eval_column_dict.append(["sd_physics", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Physics)", "number", True))])
97
+ auto_eval_column_dict.append(["rank_physics", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Physics)", "number", True))])
98
+
99
+ auto_eval_column_dict.append(["score_biology", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Biology)", "number", True))])
100
+ auto_eval_column_dict.append(["sd_biology", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Biology)", "number", True))])
101
+ auto_eval_column_dict.append(["rank_biology", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Biology)", "number", True))])
102
+
103
+
104
  auto_eval_column_dict.append(["score_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Score (C++)", "number", True))])
105
  auto_eval_column_dict.append(["sd_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (C++)", "number", True))])
106
  auto_eval_column_dict.append(["rank_cpp", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (C++)", "number", True))])
107
 
108
+
109
  for task in Tasks:
110
  auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
111
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
src/populate.py CHANGED
@@ -15,14 +15,20 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
15
  """Creates a dataframe from all the individual experiment results"""
16
  raw_data = get_raw_model_results(results_path)
17
  all_data_json = [v.to_dict() for v in raw_data]
 
18
 
19
  df = pd.DataFrame.from_records(all_data_json)
20
 
21
  df = df[benchmark_cols]
22
  # print(df.head())
23
 
24
- if rank_col: # if there is one col in rank_col, sort by that column and remove NaN values
25
- df = df.dropna(subset=benchmark_cols)
26
  df = df.sort_values(by=[rank_col[0]], ascending=True)
27
  # print(rank_col, benchmark_cols)
28
  # print(df.head())
@@ -31,7 +37,7 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
31
  avg_rank = df.iloc[:, 1:].mean(axis=1)
32
  df["Average Rank"] = avg_rank.round(decimals=4)
33
  df = df.sort_values(by=["Average Rank"], ascending=True)
34
- df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
35
 
36
  # we'll skip NaN, instrad of deleting the whole row
37
  df = df.fillna('--')
@@ -41,19 +47,25 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
41
 
42
 
43
  for col in benchmark_cols:
44
- # print(col)
45
- # if 'Std dev' in col or 'Score' in col:
46
  if 'Std dev' in col or 'Score' in col:
47
- # if set(['Chemistry', 'Reasoning']).intersection(set(col.split())):
48
- # df[col] = (df[col]).map('{:.2f}'.format)
49
- # else:
50
- # df[col] = (df[col]*100).map('{:.2f}'.format)
51
- if "Chemistry" in col or "C++" in col:
52
- # if "Chemistry" in col or "C++" in col or "Overall" in col or "Probability" in col or "Logical" in col:
53
- df[col] = (df[col]).map('{:.2f}'.format)
54
- else:
55
- df[col] = (df[col]*100).map('{:.2f}'.format)
56
  df[col] = df[col].round(decimals=2)
57
 
58
  # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
59
  # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
 
15
  """Creates a dataframe from all the individual experiment results"""
16
  raw_data = get_raw_model_results(results_path)
17
  all_data_json = [v.to_dict() for v in raw_data]
18
+ assert len(rank_col) <= 1, "Only one column can be selected for ranking"
19
 
20
  df = pd.DataFrame.from_records(all_data_json)
21
 
22
  df = df[benchmark_cols]
23
  # print(df.head())
24
 
25
+ # if there is one col in rank_col, this is an isolated dimension to rank by
26
+ # sort by that selected column and remove NaN values
27
+ if rank_col:
28
+ # df = df.dropna(subset=benchmark_cols)
29
+ df = df.dropna(subset=rank_col)
30
+ df = df.fillna(0.00)
31
+ # print(df[rank_col[0]])
32
  df = df.sort_values(by=[rank_col[0]], ascending=True)
33
  # print(rank_col, benchmark_cols)
34
  # print(df.head())
 
37
  avg_rank = df.iloc[:, 1:].mean(axis=1)
38
  df["Average Rank"] = avg_rank.round(decimals=4)
39
  df = df.sort_values(by=["Average Rank"], ascending=True)
40
+ df["Average Rank"] = df["Average Rank"].map('{:.2f}'.format)
41
 
42
  # we'll skip NaN, instead of deleting the whole row
43
  df = df.fillna('--')
 
47
 
48
 
49
  for col in benchmark_cols:
 
 
50
  if 'Std dev' in col or 'Score' in col:
51
+ df[col] = (df[col]).map('{:.2f}'.format)
52
  df[col] = df[col].round(decimals=2)
53
+
54
+
55
+ # for col in benchmark_cols:
56
+ # # print(col)
57
+ # # if 'Std dev' in col or 'Score' in col:
58
+ # if 'Std dev' in col or 'Score' in col:
59
+ # # if set(['Chemistry', 'Reasoning']).intersection(set(col.split())):
60
+ # # df[col] = (df[col]).map('{:.2f}'.format)
61
+ # # else:
62
+ # # df[col] = (df[col]*100).map('{:.2f}'.format)
63
+ # # if "Chemistry" in col or "C++" in col:
64
+ # if "Chemistry" in col or "C++" in col or "Overall" in col or "Probability" in col or "Logical" in col:
65
+ # df[col] = (df[col]).map('{:.2f}'.format)
66
+ # else:
67
+ # df[col] = (df[col]*100).map('{:.2f}'.format)
68
+ # df[col] = df[col].round(decimals=2)
69
 
70
  # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
71
  # df[AutoEvalColumn.rank.name] = df[AutoEvalColumn.score.name].rank(ascending=True, method="min")
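
The reworked get_model_leaderboard_df therefore has two modes, matching how app.py calls it: a non-empty rank_col selects a single dimension (rows missing that rank are dropped and the table is sorted by it), while an empty rank_col averages the rank columns into an "Average Rank" column and sorts by that. A brief usage sketch mirroring the calls in app.py (same signature as shown in this diff):

# Single-dimension view: drop models missing this rank, sort ascending by it.
algebra_df = get_model_leaderboard_df(
    model_result_path,
    benchmark_cols=[
        AutoEvalColumn.rank_math_algebra.name,
        AutoEvalColumn.model.name,
        AutoEvalColumn.score_math_algebra.name,
    ],
    rank_col=[AutoEvalColumn.rank_math_algebra.name],
)

# Aggregated view: empty rank_col -> the mean of the rank columns becomes
# "Average Rank", and the table is sorted by it (lower is better).
math_overview_df = get_model_leaderboard_df(
    model_result_path,
    benchmark_cols=[
        AutoEvalColumn.model.name,
        AutoEvalColumn.rank_math_algebra.name,
        AutoEvalColumn.rank_math_geometry.name,
        AutoEvalColumn.rank_math_probability.name,
    ],
    rank_col=[],
)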
src/results/models_2024-10-20-23:34:57.242641.json ADDED
@@ -0,0 +1,2802 @@
1
+ [
2
+ {
3
+ "config": {
4
+ "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
+ "organization": "OpenAI",
6
+ "license": "Proprietary",
7
+ "knowledge_cutoff": "2023/10"
8
+ },
9
+ "results": {
10
+ "OVERALL": {
11
+ "Average Score": 93.51557945652831,
12
+ "Standard Deviation": 3.1900396436407785,
13
+ "Rank": 4
14
+ },
15
+ "Geometry": {
16
+ "Average Score": 81.8536937387725,
17
+ "Standard Deviation": null,
18
+ "Rank": 5
19
+ },
20
+ "Algebra": {
21
+ "Average Score": 89.3642910524324,
22
+ "Standard Deviation": null,
23
+ "Rank": 3
24
+ },
25
+ "Probability": {
26
+ "Average Score": 86.55761073510537,
27
+ "Standard Deviation": null,
28
+ "Rank": 4
29
+ },
30
+ "Logical": {
31
+ "Average Score": 97.39734315785844,
32
+ "Standard Deviation": null,
33
+ "Rank": 2
34
+ },
35
+ "Social": {
36
+ "Average Score": 91.03727530739368,
37
+ "Standard Deviation": null,
38
+ "Rank": 7
39
+ },
40
+ "Chemistry": {
41
+ "Average Score": 100.0,
42
+ "Standard Deviation": null,
43
+ "Rank": 1
44
+ },
45
+ "CPP": {
46
+ "Average Score": 100.0,
47
+ "Standard Deviation": null,
48
+ "Rank": 1
49
+ }
50
+ }
51
+ },
52
+ {
53
+ "config": {
54
+ "model_name": "gpt-4o-2024-08-06",
55
+ "organization": "OpenAI",
56
+ "license": "Proprietary",
57
+ "knowledge_cutoff": "2023/10"
58
+ },
59
+ "results": {
60
+ "OVERALL": {
61
+ "Average Score": 79.7806321863411,
62
+ "Standard Deviation": 0.8302330946013555,
63
+ "Rank": 14
64
+ },
65
+ "Geometry": {
66
+ "Average Score": 86.29041459755453,
67
+ "Standard Deviation": null,
68
+ "Rank": 2
69
+ },
70
+ "Algebra": {
71
+ "Average Score": 88.53373721863113,
72
+ "Standard Deviation": null,
73
+ "Rank": 4
74
+ },
75
+ "Probability": {
76
+ "Average Score": 78.694360721361,
77
+ "Standard Deviation": null,
78
+ "Rank": 7
79
+ },
80
+ "Logical": {
81
+ "Average Score": 78.3116623496895,
82
+ "Standard Deviation": null,
83
+ "Rank": 12
84
+ },
85
+ "Social": {
86
+ "Average Score": 79.90944696263446,
87
+ "Standard Deviation": null,
88
+ "Rank": 11
89
+ },
90
+ "Chemistry": {
91
+ "Average Score": 86.96011263543132,
92
+ "Standard Deviation": null,
93
+ "Rank": 7
94
+ },
95
+ "CPP": {
96
+ "Average Score": 92.43090226400756,
97
+ "Standard Deviation": null,
98
+ "Rank": 2
99
+ }
100
+ }
101
+ },
102
+ {
103
+ "config": {
104
+ "model_name": "gpt-4o-2024-05-13",
105
+ "organization": "OpenAI",
106
+ "license": "Proprietary",
107
+ "knowledge_cutoff": "2023/10"
108
+ },
109
+ "results": {
110
+ "OVERALL": {
111
+ "Average Score": 86.40675398236253,
112
+ "Standard Deviation": 6.473604235710212,
113
+ "Rank": 9
114
+ },
115
+ "Geometry": {
116
+ "Average Score": 82.42032988843268,
117
+ "Standard Deviation": null,
118
+ "Rank": 4
119
+ },
120
+ "Algebra": {
121
+ "Average Score": 83.51580675782952,
122
+ "Standard Deviation": null,
123
+ "Rank": 9
124
+ },
125
+ "Probability": {
126
+ "Average Score": 81.88434691830915,
127
+ "Standard Deviation": null,
128
+ "Rank": 5
129
+ },
130
+ "Logical": {
131
+ "Average Score": 87.92744931984977,
132
+ "Standard Deviation": null,
133
+ "Rank": 9
134
+ },
135
+ "Social": {
136
+ "Average Score": 76.12369632852445,
137
+ "Standard Deviation": null,
138
+ "Rank": 15
139
+ },
140
+ "Chemistry": {
141
+ "Average Score": 90.93459148149344,
142
+ "Standard Deviation": null,
143
+ "Rank": 4
144
+ },
145
+ "CPP": {
146
+ "Average Score": 79.1592634699295,
147
+ "Standard Deviation": null,
148
+ "Rank": 6
149
+ }
150
+ }
151
+ },
152
+ {
153
+ "config": {
154
+ "model_name": "gpt-4-turbo-2024-04-09",
155
+ "organization": "OpenAI",
156
+ "license": "Proprietary",
157
+ "knowledge_cutoff": "2023/12"
158
+ },
159
+ "results": {
160
+ "OVERALL": {
161
+ "Average Score": 87.17581147282237,
162
+ "Standard Deviation": 8.716963621850567,
163
+ "Rank": 8
164
+ },
165
+ "Geometry": {
166
+ "Average Score": 78.76635545274637,
167
+ "Standard Deviation": null,
168
+ "Rank": 7
169
+ },
170
+ "Algebra": {
171
+ "Average Score": 79.96323615621023,
172
+ "Standard Deviation": null,
173
+ "Rank": 11
174
+ },
175
+ "Probability": {
176
+ "Average Score": 77.65333799733705,
177
+ "Standard Deviation": null,
178
+ "Rank": 9
179
+ },
180
+ "Logical": {
181
+ "Average Score": 89.33307138659873,
182
+ "Standard Deviation": null,
183
+ "Rank": 8
184
+ },
185
+ "Social": {
186
+ "Average Score": 76.86597570996584,
187
+ "Standard Deviation": null,
188
+ "Rank": 14
189
+ },
190
+ "Chemistry": {
191
+ "Average Score": 84.02855687506661,
192
+ "Standard Deviation": null,
193
+ "Rank": 9
194
+ },
195
+ "CPP": {
196
+ "Average Score": 70.73143363230263,
197
+ "Standard Deviation": null,
198
+ "Rank": 11
199
+ }
200
+ }
201
+ },
202
+ {
203
+ "config": {
204
+ "model_name": "gemini-1.5-pro-001",
205
+ "organization": "Google",
206
+ "license": "Proprietary",
207
+ "knowledge_cutoff": "2023/11"
208
+ },
209
+ "results": {
210
+ "OVERALL": {
211
+ "Average Score": 80.38345723548734,
212
+ "Standard Deviation": 2.4635699815143584,
213
+ "Rank": 13
214
+ },
215
+ "Geometry": {
216
+ "Average Score": 84.30455076458965,
217
+ "Standard Deviation": null,
218
+ "Rank": 3
219
+ },
220
+ "Algebra": {
221
+ "Average Score": 85.9212061409364,
222
+ "Standard Deviation": null,
223
+ "Rank": 6
224
+ },
225
+ "Probability": {
226
+ "Average Score": 73.11806712394745,
227
+ "Standard Deviation": null,
228
+ "Rank": 13
229
+ },
230
+ "Logical": {
231
+ "Average Score": 78.27369746632996,
232
+ "Standard Deviation": null,
233
+ "Rank": 12
234
+ },
235
+ "Social": {
236
+ "Average Score": 79.57606824531047,
237
+ "Standard Deviation": null,
238
+ "Rank": 13
239
+ }
240
+ }
241
+ },
242
+ {
243
+ "config": {
244
+ "model_name": "qwen2-72b-instruct",
245
+ "organization": "Alibaba",
246
+ "license": "Qianwen LICENSE",
247
+ "knowledge_cutoff": "2024/09"
248
+ },
249
+ "results": {
250
+ "OVERALL": {
251
+ "Average Score": 74.44059692248071,
252
+ "Standard Deviation": 2.3957041566666697,
253
+ "Rank": 16
254
+ },
255
+ "Geometry": {
256
+ "Average Score": 72.58490369919883,
257
+ "Standard Deviation": null,
258
+ "Rank": 11
259
+ },
260
+ "Algebra": {
261
+ "Average Score": 88.53359632761772,
262
+ "Standard Deviation": null,
263
+ "Rank": 4
264
+ },
265
+ "Probability": {
266
+ "Average Score": 80.19789976985243,
267
+ "Standard Deviation": null,
268
+ "Rank": 6
269
+ },
270
+ "Logical": {
271
+ "Average Score": 72.76843081200641,
272
+ "Standard Deviation": null,
273
+ "Rank": 17
274
+ },
275
+ "Social": {
276
+ "Average Score": 57.256064868444426,
277
+ "Standard Deviation": null,
278
+ "Rank": 19
279
+ },
280
+ "Chemistry": {
281
+ "Average Score": 75.47190401351077,
282
+ "Standard Deviation": null,
283
+ "Rank": 12
284
+ },
285
+ "CPP": {
286
+ "Average Score": 73.54037778797029,
287
+ "Standard Deviation": null,
288
+ "Rank": 7
289
+ }
290
+ }
291
+ },
292
+ {
293
+ "config": {
294
+ "model_name": "gpt-4o-mini-2024-07-18",
295
+ "organization": "OpenAI",
296
+ "license": "Proprietary",
297
+ "knowledge_cutoff": "2023/10"
298
+ },
299
+ "results": {
300
+ "OVERALL": {
301
+ "Average Score": 82.82456893277315,
302
+ "Standard Deviation": 7.714840109805867,
303
+ "Rank": 12
304
+ },
305
+ "Geometry": {
306
+ "Average Score": 78.89323869622943,
307
+ "Standard Deviation": null,
308
+ "Rank": 6
309
+ },
310
+ "Algebra": {
311
+ "Average Score": 84.8722603687823,
312
+ "Standard Deviation": null,
313
+ "Rank": 8
314
+ },
315
+ "Probability": {
316
+ "Average Score": 78.6942843346463,
317
+ "Standard Deviation": null,
318
+ "Rank": 7
319
+ },
320
+ "Logical": {
321
+ "Average Score": 85.68921109829361,
322
+ "Standard Deviation": null,
323
+ "Rank": 10
324
+ },
325
+ "Social": {
326
+ "Average Score": 81.79892848722542,
327
+ "Standard Deviation": null,
328
+ "Rank": 10
329
+ },
330
+ "Chemistry": {
331
+ "Average Score": 81.46805623180109,
332
+ "Standard Deviation": null,
333
+ "Rank": 10
334
+ },
335
+ "CPP": {
336
+ "Average Score": 88.3877070580296,
337
+ "Standard Deviation": null,
338
+ "Rank": 3
339
+ }
340
+ }
341
+ },
342
+ {
343
+ "config": {
344
+ "model_name": "claude-3.5-sonnet",
345
+ "organization": "Anthropic",
346
+ "license": "Proprietary",
347
+ "knowledge_cutoff": "2024/04"
348
+ },
349
+ "results": {
350
+ "OVERALL": {
351
+ "Average Score": 88.43557924843628,
352
+ "Standard Deviation": 5.680338106806327,
353
+ "Rank": 7
354
+ },
355
+ "Geometry": {
356
+ "Average Score": 76.26169400931595,
357
+ "Standard Deviation": null,
358
+ "Rank": 10
359
+ },
360
+ "Algebra": {
361
+ "Average Score": 77.15040433072186,
362
+ "Standard Deviation": null,
363
+ "Rank": 13
364
+ },
365
+ "Probability": {
366
+ "Average Score": 73.9942759783754,
367
+ "Standard Deviation": null,
368
+ "Rank": 11
369
+ },
370
+ "Logical": {
371
+ "Average Score": 89.70827617930533,
372
+ "Standard Deviation": null,
373
+ "Rank": 7
374
+ },
375
+ "Social": {
376
+ "Average Score": 97.3810636467068,
377
+ "Standard Deviation": null,
378
+ "Rank": 3
379
+ },
380
+ "Chemistry": {
381
+ "Average Score": 94.92819763202698,
382
+ "Standard Deviation": null,
383
+ "Rank": 3
384
+ },
385
+ "CPP": {
386
+ "Average Score": 82.37734076815008,
387
+ "Standard Deviation": null,
388
+ "Rank": 5
389
+ }
390
+ }
391
+ },
392
+ {
393
+ "config": {
394
+ "model_name": "o1-mini",
395
+ "organization": "OpenAI",
396
+ "license": "Proprietary",
397
+ "knowledge_cutoff": "2023/10"
398
+ },
399
+ "results": {
400
+ "OVERALL": {
401
+ "Average Score": 96.12399889226096,
402
+ "Standard Deviation": 0.5674965705992511,
403
+ "Rank": 2
404
+ },
405
+ "Geometry": {
406
+ "Average Score": 100.0,
407
+ "Standard Deviation": null,
408
+ "Rank": 1
409
+ },
410
+ "Algebra": {
411
+ "Average Score": 100.0,
412
+ "Standard Deviation": null,
413
+ "Rank": 1
414
+ },
415
+ "Probability": {
416
+ "Average Score": 100.0,
417
+ "Standard Deviation": null,
418
+ "Rank": 1
419
+ },
420
+ "Logical": {
421
+ "Average Score": 96.52089445393929,
422
+ "Standard Deviation": null,
423
+ "Rank": 3
424
+ },
425
+ "Social": {
426
+ "Average Score": 95.00695256918654,
427
+ "Standard Deviation": null,
428
+ "Rank": 5
429
+ }
430
+ }
431
+ },
432
+ {
433
+ "config": {
434
+ "model_name": "o1-preview",
435
+ "organization": "OpenAI",
436
+ "license": "Proprietary",
437
+ "knowledge_cutoff": "2023/10"
438
+ },
439
+ "results": {
440
+ "OVERALL": {
441
+ "Average Score": 91.08240629161766,
442
+ "Standard Deviation": 4.83378135710071,
443
+ "Rank": 5
444
+ },
445
+ "Geometry": {
446
+ "Average Score": "N/A",
447
+ "Standard Deviation": "N/A",
448
+ "Rank": "N/A"
449
+ },
450
+ "Algebra": {
451
+ "Average Score": 98.1870991822192,
452
+ "Standard Deviation": null,
453
+ "Rank": 2
454
+ },
455
+ "Probability": {
456
+ "Average Score": 94.12657646584134,
457
+ "Standard Deviation": null,
458
+ "Rank": 2
459
+ },
460
+ "Logical": {
461
+ "Average Score": 100.0,
462
+ "Standard Deviation": null,
463
+ "Rank": 1
464
+ },
465
+ "Social": {
466
+ "Average Score": 96.56802743955569,
467
+ "Standard Deviation": null,
468
+ "Rank": 4
469
+ }
470
+ }
471
+ },
472
+ {
473
+ "config": {
474
+ "model_name": "gemini-1.5-flash-001",
475
+ "organization": "Google",
476
+ "license": "Proprietary",
477
+ "knowledge_cutoff": "2023/11"
478
+ },
479
+ "results": {
480
+ "OVERALL": {
481
+ "Average Score": 66.25275609135964,
482
+ "Standard Deviation": 2.5314573702881438,
483
+ "Rank": 20
484
+ },
485
+ "Geometry": {
486
+ "Average Score": 66.8010242138006,
487
+ "Standard Deviation": null,
488
+ "Rank": 13
489
+ },
490
+ "Algebra": {
491
+ "Average Score": 78.24639082497596,
492
+ "Standard Deviation": null,
493
+ "Rank": 12
494
+ },
495
+ "Probability": {
496
+ "Average Score": 67.84602916736804,
497
+ "Standard Deviation": null,
498
+ "Rank": 15
499
+ },
500
+ "Logical": {
501
+ "Average Score": 72.76845749138818,
502
+ "Standard Deviation": null,
503
+ "Rank": 17
504
+ },
505
+ "Social": {
506
+ "Average Score": 68.57728479711058,
507
+ "Standard Deviation": null,
508
+ "Rank": 16
509
+ },
510
+ "Chemistry": {
511
+ "Average Score": 75.47188329078935,
512
+ "Standard Deviation": null,
513
+ "Rank": 12
514
+ },
515
+ "CPP": {
516
+ "Average Score": 72.1127762005651,
517
+ "Standard Deviation": null,
518
+ "Rank": 10
519
+ }
520
+ }
521
+ },
522
+ {
523
+ "config": {
524
+ "model_name": "gpt4-1106",
525
+ "organization": "OpenAI",
526
+ "license": "Proprietary",
527
+ "knowledge_cutoff": "2024/04"
528
+ },
529
+ "results": {
530
+ "OVERALL": {
531
+ "Average Score": 85.660054434658,
532
+ "Standard Deviation": 7.392502344300497,
533
+ "Rank": 10
534
+ },
535
+ "Geometry": {
536
+ "Average Score": 63.36396165140893,
537
+ "Standard Deviation": null,
538
+ "Rank": 15
539
+ },
540
+ "Algebra": {
541
+ "Average Score": 74.67191687355754,
542
+ "Standard Deviation": null,
543
+ "Rank": 15
544
+ },
545
+ "Probability": {
546
+ "Average Score": 71.35141952665965,
547
+ "Standard Deviation": null,
548
+ "Rank": 14
549
+ },
550
+ "Logical": {
551
+ "Average Score": 76.34506017196868,
552
+ "Standard Deviation": null,
553
+ "Rank": 15
554
+ },
555
+ "Social": {
556
+ "Average Score": 46.00126575332808,
557
+ "Standard Deviation": null,
558
+ "Rank": 25
559
+ },
560
+ "Chemistry": {
561
+ "Average Score": 78.70156756289569,
562
+ "Standard Deviation": null,
563
+ "Rank": 11
564
+ },
565
+ "CPP": {
566
+ "Average Score": 69.11824072252848,
567
+ "Standard Deviation": null,
568
+ "Rank": 12
569
+ }
570
+ }
571
+ },
572
+ {
573
+ "config": {
574
+ "model_name": "gemma-2-27b-it",
575
+ "organization": "Google",
576
+ "license": "Gemma License",
577
+ "knowledge_cutoff": "2024/06"
578
+ },
579
+ "results": {
580
+ "OVERALL": {
581
+ "Average Score": 70.82622192650408,
582
+ "Standard Deviation": 0.18962869075029884,
583
+ "Rank": 18
584
+ },
585
+ "Geometry": {
586
+ "Average Score": 58.25724467150374,
587
+ "Standard Deviation": null,
588
+ "Rank": 16
589
+ },
590
+ "Algebra": {
591
+ "Average Score": 73.71614711121721,
592
+ "Standard Deviation": null,
593
+ "Rank": 16
594
+ },
595
+ "Probability": {
596
+ "Average Score": 66.08200742339983,
597
+ "Standard Deviation": null,
598
+ "Rank": 17
599
+ },
600
+ "Logical": {
601
+ "Average Score": 72.76841354275011,
602
+ "Standard Deviation": null,
603
+ "Rank": 17
604
+ },
605
+ "Social": {
606
+ "Average Score": 53.736358144621576,
607
+ "Standard Deviation": null,
608
+ "Rank": 21
609
+ },
610
+ "Chemistry": {
611
+ "Average Score": 68.1178055540124,
612
+ "Standard Deviation": null,
613
+ "Rank": 17
614
+ },
615
+ "CPP": {
616
+ "Average Score": 63.28920072143611,
617
+ "Standard Deviation": null,
618
+ "Rank": 14
619
+ }
620
+ }
621
+ },
622
+ {
623
+ "config": {
624
+ "model_name": "claude-3-opus",
625
+ "organization": "Anthropic",
626
+ "license": "Proprietary",
627
+ "knowledge_cutoff": "2023/08"
628
+ },
629
+ "results": {
630
+ "OVERALL": {
631
+ "Average Score": 82.28903171580336,
632
+ "Standard Deviation": 10.093273304495547,
633
+ "Rank": 11
634
+ },
635
+ "Geometry": {
636
+ "Average Score": 57.98602891013921,
637
+ "Standard Deviation": null,
638
+ "Rank": 17
639
+ },
640
+ "Algebra": {
641
+ "Average Score": 73.54334730242743,
642
+ "Standard Deviation": null,
643
+ "Rank": 18
644
+ },
645
+ "Probability": {
646
+ "Average Score": 67.8341594991468,
647
+ "Standard Deviation": null,
648
+ "Rank": 15
649
+ },
650
+ "Logical": {
651
+ "Average Score": 78.31155849680502,
652
+ "Standard Deviation": null,
653
+ "Rank": 12
654
+ },
655
+ "Social": {
656
+ "Average Score": 90.45833112761075,
657
+ "Standard Deviation": null,
658
+ "Rank": 8
659
+ },
660
+ "Chemistry": {
661
+ "Average Score": 85.97349470177741,
662
+ "Standard Deviation": null,
663
+ "Rank": 8
664
+ },
665
+ "CPP": {
666
+ "Average Score": 73.5404403567132,
667
+ "Standard Deviation": null,
668
+ "Rank": 8
669
+ }
670
+ }
671
+ },
672
+ {
673
+ "config": {
674
+ "model_name": "gemma-2-9b-it-simpo",
675
+ "organization": "Google",
676
+ "license": "Gemma License",
677
+ "knowledge_cutoff": "2024/07"
678
+ },
679
+ "results": {
680
+ "OVERALL": {
681
+ "Average Score": "N/A",
682
+ "Standard Deviation": "N/A",
683
+ "Rank": "N/A"
684
+ },
685
+ "Geometry": {
686
+ "Average Score": 52.80896798216458,
687
+ "Standard Deviation": null,
688
+ "Rank": 19
689
+ },
690
+ "Algebra": {
691
+ "Average Score": 69.60260038105677,
692
+ "Standard Deviation": null,
693
+ "Rank": 19
694
+ },
695
+ "Probability": {
696
+ "Average Score": 59.52630271491633,
697
+ "Standard Deviation": null,
698
+ "Rank": 21
699
+ },
700
+ "Logical": {
701
+ "Average Score": 63.57920031465781,
702
+ "Standard Deviation": null,
703
+ "Rank": 23
704
+ },
705
+ "Social": {
706
+ "Average Score": 79.90950201631269,
707
+ "Standard Deviation": null,
708
+ "Rank": 11
709
+ },
710
+ "Chemistry": {
711
+ "Average Score": 90.36508196626548,
712
+ "Standard Deviation": null,
713
+ "Rank": 5
714
+ },
715
+ "CPP": {
716
+ "Average Score": 73.43757596214863,
717
+ "Standard Deviation": null,
718
+ "Rank": 9
719
+ }
720
+ }
721
+ },
722
+ {
723
+ "config": {
724
+ "model_name": "qwen1.5-72b-chat",
725
+ "organization": "Alibaba",
726
+ "license": "Qianwen LICENSE",
727
+ "knowledge_cutoff": "2024/03"
728
+ },
729
+ "results": {
730
+ "OVERALL": {
731
+ "Average Score": 65.26710370586439,
732
+ "Standard Deviation": 9.198700753743012,
733
+ "Rank": 19
734
+ },
735
+ "Geometry": {
736
+ "Average Score": 48.52417714351894,
737
+ "Standard Deviation": null,
738
+ "Rank": 24
739
+ },
740
+ "Algebra": {
741
+ "Average Score": 68.55765479604507,
742
+ "Standard Deviation": null,
743
+ "Rank": 20
744
+ },
745
+ "Probability": {
746
+ "Average Score": 49.52382148131357,
747
+ "Standard Deviation": null,
748
+ "Rank": 26
749
+ },
750
+ "Logical": {
751
+ "Average Score": 37.33563924001827,
752
+ "Standard Deviation": null,
753
+ "Rank": 35
754
+ },
755
+ "Social": {
756
+ "Average Score": 46.00141195402727,
757
+ "Standard Deviation": null,
758
+ "Rank": 25
759
+ },
760
+ "Chemistry": {
761
+ "Average Score": 52.625823960166215,
762
+ "Standard Deviation": null,
763
+ "Rank": 23
764
+ },
765
+ "CPP": {
766
+ "Average Score": 48.69302376665551,
767
+ "Standard Deviation": null,
768
+ "Rank": 20
769
+ }
770
+ }
771
+ },
772
+ {
773
+ "config": {
774
+ "model_name": "qwen1.5-32b-chat",
775
+ "organization": "Alibaba",
776
+ "license": "Qianwen LICENSE",
777
+ "knowledge_cutoff": "2024/03"
778
+ },
779
+ "results": {
780
+ "OVERALL": {
781
+ "Average Score": 46.74335731441104,
782
+ "Standard Deviation": 4.096227849530709,
783
+ "Rank": 28
784
+ },
785
+ "Geometry": {
786
+ "Average Score": 44.96670224519297,
787
+ "Standard Deviation": null,
788
+ "Rank": 26
789
+ },
790
+ "Algebra": {
791
+ "Average Score": 63.19715848628476,
792
+ "Standard Deviation": null,
793
+ "Rank": 23
794
+ },
795
+ "Probability": {
796
+ "Average Score": 48.59873650270336,
797
+ "Standard Deviation": null,
798
+ "Rank": 27
799
+ },
800
+ "Logical": {
801
+ "Average Score": 42.028753105249216,
802
+ "Standard Deviation": null,
803
+ "Rank": 33
804
+ },
805
+ "Social": {
806
+ "Average Score": 43.183938768454986,
807
+ "Standard Deviation": null,
808
+ "Rank": 28
809
+ },
810
+ "Chemistry": {
811
+ "Average Score": 47.84488021045937,
812
+ "Standard Deviation": null,
813
+ "Rank": 26
814
+ },
815
+ "CPP": {
816
+ "Average Score": 45.14284028264288,
817
+ "Standard Deviation": null,
818
+ "Rank": 24
819
+ }
820
+ }
821
+ },
822
+ {
823
+ "config": {
824
+ "model_name": "google-gemma-2-9b-it",
825
+ "organization": "Google",
826
+ "license": "Proprietary",
827
+ "knowledge_cutoff": "2024/06"
828
+ },
829
+ "results": {
830
+ "OVERALL": {
831
+ "Average Score": 60.71065949101693,
832
+ "Standard Deviation": 0.12283018509137462,
833
+ "Rank": 23
834
+ },
835
+ "Geometry": {
836
+ "Average Score": 52.49270527783856,
837
+ "Standard Deviation": null,
838
+ "Rank": 20
839
+ },
840
+ "Algebra": {
841
+ "Average Score": 63.446032975128176,
842
+ "Standard Deviation": null,
843
+ "Rank": 21
844
+ },
845
+ "Probability": {
846
+ "Average Score": 63.95287475488081,
847
+ "Standard Deviation": null,
848
+ "Rank": 20
849
+ },
850
+ "Logical": {
851
+ "Average Score": 70.18644584116615,
852
+ "Standard Deviation": null,
853
+ "Rank": 20
854
+ },
855
+ "Social": {
856
+ "Average Score": 86.45401862572464,
857
+ "Standard Deviation": null,
858
+ "Rank": 9
859
+ },
860
+ "Chemistry": {
861
+ "Average Score": 57.56342217758078,
862
+ "Standard Deviation": null,
863
+ "Rank": 20
864
+ },
865
+ "CPP": {
866
+ "Average Score": 54.03167523687635,
867
+ "Standard Deviation": null,
868
+ "Rank": 17
869
+ }
870
+ }
871
+ },
872
+ {
873
+ "config": {
874
+ "model_name": "yi-1.5-34b-chat",
875
+ "organization": "01 AI",
876
+ "license": "Proprietary",
877
+ "knowledge_cutoff": "2024/05"
878
+ },
879
+ "results": {
880
+ "OVERALL": {
881
+ "Average Score": 71.53811567931923,
882
+ "Standard Deviation": 0.4838075734512934,
883
+ "Rank": 17
884
+ },
885
+ "Geometry": {
886
+ "Average Score": 53.98343904373819,
887
+ "Standard Deviation": null,
888
+ "Rank": 18
889
+ },
890
+ "Algebra": {
891
+ "Average Score": 63.317896075817885,
892
+ "Standard Deviation": null,
893
+ "Rank": 22
894
+ },
895
+ "Probability": {
896
+ "Average Score": 64.73492918491159,
897
+ "Standard Deviation": null,
898
+ "Rank": 19
899
+ },
900
+ "Logical": {
901
+ "Average Score": 66.39420245024361,
902
+ "Standard Deviation": null,
903
+ "Rank": 21
904
+ },
905
+ "Social": {
906
+ "Average Score": 53.73650350964252,
907
+ "Standard Deviation": null,
908
+ "Rank": 21
909
+ },
910
+ "Chemistry": {
911
+ "Average Score": 56.722360677914686,
912
+ "Standard Deviation": null,
913
+ "Rank": 21
914
+ },
915
+ "CPP": {
916
+ "Average Score": 52.148798061768964,
917
+ "Standard Deviation": null,
918
+ "Rank": 18
919
+ }
920
+ }
921
+ },
922
+ {
923
+ "config": {
924
+ "model_name": "meta-llama-3.1-70b-instruct",
925
+ "organization": "Meta",
926
+ "license": "Llama 3.1 Community",
927
+ "knowledge_cutoff": "2023/12"
928
+ },
929
+ "results": {
930
+ "OVERALL": {
931
+ "Average Score": 74.01502078434305,
932
+ "Standard Deviation": 0.24116839515156926,
933
+ "Rank": 15
934
+ },
935
+ "Geometry": {
936
+ "Average Score": 66.80097850274383,
937
+ "Standard Deviation": null,
938
+ "Rank": 13
939
+ },
940
+ "Algebra": {
941
+ "Average Score": 74.7667367179752,
942
+ "Standard Deviation": null,
943
+ "Rank": 14
944
+ },
945
+ "Probability": {
946
+ "Average Score": 66.0819470113051,
947
+ "Standard Deviation": null,
948
+ "Rank": 17
949
+ },
950
+ "Logical": {
951
+ "Average Score": 73.68238947162197,
952
+ "Standard Deviation": null,
953
+ "Rank": 16
954
+ },
955
+ "Social": {
956
+ "Average Score": 68.577541438994,
957
+ "Standard Deviation": null,
958
+ "Rank": 16
959
+ },
960
+ "Chemistry": {
961
+ "Average Score": 70.4019514562452,
962
+ "Standard Deviation": null,
963
+ "Rank": 15
964
+ },
965
+ "CPP": {
966
+ "Average Score": 84.36815192532764,
967
+ "Standard Deviation": null,
968
+ "Rank": 4
969
+ }
970
+ }
971
+ },
972
+ {
973
+ "config": {
974
+ "model_name": "meta-llama-3.1-8b-instruct",
975
+ "organization": "Meta",
976
+ "license": "Llama 3.1 Community",
977
+ "knowledge_cutoff": "2023/12"
978
+ },
979
+ "results": {
980
+ "OVERALL": {
981
+ "Average Score": 55.268736955905695,
982
+ "Standard Deviation": 7.060517225126177,
983
+ "Rank": 26
984
+ },
985
+ "Geometry": {
986
+ "Average Score": 42.44262022417502,
987
+ "Standard Deviation": null,
988
+ "Rank": 28
989
+ },
990
+ "Algebra": {
991
+ "Average Score": 60.632347391080486,
992
+ "Standard Deviation": null,
993
+ "Rank": 25
994
+ },
995
+ "Probability": {
996
+ "Average Score": 52.372362507453694,
997
+ "Standard Deviation": null,
998
+ "Rank": 24
999
+ },
1000
+ "Logical": {
1001
+ "Average Score": 54.17571378414435,
1002
+ "Standard Deviation": null,
1003
+ "Rank": 28
1004
+ },
1005
+ "Social": {
1006
+ "Average Score": 39.07966801070027,
1007
+ "Standard Deviation": null,
1008
+ "Rank": 31
1009
+ },
1010
+ "Chemistry": {
1011
+ "Average Score": 45.0170262190059,
1012
+ "Standard Deviation": null,
1013
+ "Rank": 29
1014
+ },
1015
+ "CPP": {
1016
+ "Average Score": 44.41846841004584,
1017
+ "Standard Deviation": null,
1018
+ "Rank": 26
1019
+ }
1020
+ }
1021
+ },
1022
+ {
1023
+ "config": {
1024
+ "model_name": "gpt3.5-turbo-0125",
1025
+ "organization": "OpenAI",
1026
+ "license": "Proprietary",
1027
+ "knowledge_cutoff": "2021/09"
1028
+ },
1029
+ "results": {
1030
+ "OVERALL": {
1031
+ "Average Score": 29.17379433602279,
1032
+ "Standard Deviation": 2.6813415847393878,
1033
+ "Rank": 44
1034
+ },
1035
+ "Geometry": {
1036
+ "Average Score": 51.47279337094397,
1037
+ "Standard Deviation": null,
1038
+ "Rank": 21
1039
+ },
1040
+ "Algebra": {
1041
+ "Average Score": 59.03601450977881,
1042
+ "Standard Deviation": null,
1043
+ "Rank": 26
1044
+ },
1045
+ "Probability": {
1046
+ "Average Score": 46.71541304474977,
1047
+ "Standard Deviation": null,
1048
+ "Rank": 28
1049
+ },
1050
+ "Logical": {
1051
+ "Average Score": 20.82026871015984,
1052
+ "Standard Deviation": null,
1053
+ "Rank": 46
1054
+ },
1055
+ "Social": {
1056
+ "Average Score": 28.31096293069848,
1057
+ "Standard Deviation": null,
1058
+ "Rank": 41
1059
+ },
1060
+ "Chemistry": {
1061
+ "Average Score": 42.899594571904004,
1062
+ "Standard Deviation": null,
1063
+ "Rank": 31
1064
+ },
1065
+ "CPP": {
1066
+ "Average Score": 40.46958736582551,
1067
+ "Standard Deviation": null,
1068
+ "Rank": 29
1069
+ }
1070
+ }
1071
+ },
1072
+   {
+     "config": {
+       "model_name": "llama-3-70b-instruct",
+       "organization": "Meta",
+       "license": "Llama 3 Community",
+       "knowledge_cutoff": "2023/12"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 65.90407336557487, "Standard Deviation": 66.63940143516267, "Rank": 24 },
+       "Geometry": { "Average Score": 46.40555349958932, "Standard Deviation": null, "Rank": 25 },
+       "Algebra": { "Average Score": 60.86276607976933, "Standard Deviation": null, "Rank": 24 },
+       "Probability": { "Average Score": 55.0233135868055, "Standard Deviation": null, "Rank": 22 },
+       "Logical": { "Average Score": 83.99546392889077, "Standard Deviation": null, "Rank": 11 },
+       "Social": { "Average Score": 47.90189246663785, "Standard Deviation": null, "Rank": 23 },
+       "Chemistry": { "Average Score": 70.40198909396582, "Standard Deviation": null, "Rank": 15 },
+       "CPP": { "Average Score": 65.32140697218945, "Standard Deviation": null, "Rank": 13 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "claude-3-sonnet",
+       "organization": "Anthropic",
+       "license": "Proprietary",
+       "knowledge_cutoff": "2023/08"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 64.4278622266347, "Standard Deviation": 3.089828107392469, "Rank": 21 },
+       "Geometry": { "Average Score": 51.4677627365698, "Standard Deviation": null, "Rank": 21 },
+       "Algebra": { "Average Score": 57.157810499255426, "Standard Deviation": null, "Rank": 27 },
+       "Probability": { "Average Score": 54.68761427070592, "Standard Deviation": null, "Rank": 23 },
+       "Logical": { "Average Score": 65.8346271849297, "Standard Deviation": null, "Rank": 22 },
+       "Social": { "Average Score": 62.842721798877186, "Standard Deviation": null, "Rank": 18 },
+       "Chemistry": { "Average Score": 66.1914400411681, "Standard Deviation": null, "Rank": 18 },
+       "CPP": { "Average Score": 61.33538592327427, "Standard Deviation": null, "Rank": 15 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "qwen1.5-14b-chat",
+       "organization": "Alibaba",
+       "license": "Qianwen LICENSE",
+       "knowledge_cutoff": "2024/02"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 44.920016997055804, "Standard Deviation": 0.3041914765974254, "Rank": 30 },
+       "Geometry": { "Average Score": 36.40735570120079, "Standard Deviation": null, "Rank": 30 },
+       "Algebra": { "Average Score": 56.004717588310726, "Standard Deviation": null, "Rank": 28 },
+       "Probability": { "Average Score": 39.24866255465088, "Standard Deviation": null, "Rank": 33 },
+       "Logical": { "Average Score": 35.15462916949486, "Standard Deviation": null, "Rank": 38 },
+       "Social": { "Average Score": 35.236185321936766, "Standard Deviation": null, "Rank": 34 },
+       "Chemistry": { "Average Score": 40.803706763362605, "Standard Deviation": null, "Rank": 34 },
+       "CPP": { "Average Score": 38.552779976347026, "Standard Deviation": null, "Rank": 31 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "claude-3-haiku",
+       "organization": "Anthropic",
+       "license": "Proprietary",
+       "knowledge_cutoff": "2023/08"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 53.46814061793852, "Standard Deviation": 10.143567097006747, "Rank": 25 },
+       "Geometry": { "Average Score": 42.87542087805953, "Standard Deviation": null, "Rank": 27 },
+       "Algebra": { "Average Score": 53.706856083803686, "Standard Deviation": null, "Rank": 30 },
+       "Probability": { "Average Score": 49.80372052799326, "Standard Deviation": null, "Rank": 25 },
+       "Logical": { "Average Score": 62.585349577709394, "Standard Deviation": null, "Rank": 24 },
+       "Social": { "Average Score": 57.25601125762336, "Standard Deviation": null, "Rank": 19 },
+       "Chemistry": { "Average Score": 60.48921113945562, "Standard Deviation": null, "Rank": 19 },
+       "CPP": { "Average Score": 56.40200048817984, "Standard Deviation": null, "Rank": 16 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "claude-2.1",
+       "organization": "Anthropic",
+       "license": "Proprietary",
+       "knowledge_cutoff": "Unknown"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 39.855928282633364, "Standard Deviation": 8.396129652430814, "Rank": 35 },
+       "Geometry": { "Average Score": 51.1749207092159, "Standard Deviation": null, "Rank": 23 },
+       "Algebra": { "Average Score": 53.05386216145516, "Standard Deviation": null, "Rank": 31 },
+       "Probability": { "Average Score": 44.42150447611455, "Standard Deviation": null, "Rank": 30 },
+       "Logical": { "Average Score": 60.51381867118053, "Standard Deviation": null, "Rank": 25 },
+       "Social": { "Average Score": 38.492280755756035, "Standard Deviation": null, "Rank": 32 },
+       "Chemistry": { "Average Score": 50.66182745698702, "Standard Deviation": null, "Rank": 24 },
+       "CPP": { "Average Score": 47.23672563994903, "Standard Deviation": null, "Rank": 21 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "mistral-8x7b-instruct-v0.1",
+       "organization": "Mistral",
+       "license": "Apache 2.0",
+       "knowledge_cutoff": "2023/12"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 42.70451051343715, "Standard Deviation": 9.965602920103015, "Rank": 31 },
+       "Geometry": { "Average Score": 33.473933494899164, "Standard Deviation": null, "Rank": 34 },
+       "Algebra": { "Average Score": 48.99207852115047, "Standard Deviation": null, "Rank": 34 },
+       "Probability": { "Average Score": 44.46936520340586, "Standard Deviation": null, "Rank": 30 },
+       "Logical": { "Average Score": 42.656238987207246, "Standard Deviation": null, "Rank": 31 },
+       "Social": { "Average Score": 30.32900110312259, "Standard Deviation": null, "Rank": 40 },
+       "Chemistry": { "Average Score": 47.047104057571026, "Standard Deviation": null, "Rank": 27 },
+       "CPP": { "Average Score": 44.533118241976666, "Standard Deviation": null, "Rank": 25 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "claude-2.0",
+       "organization": "Anthropic",
+       "license": "Proprietary",
+       "knowledge_cutoff": "Unknown"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 33.53990717968659, "Standard Deviation": 7.640386327990536, "Rank": 41 },
+       "Geometry": { "Average Score": 38.40953902052666, "Standard Deviation": null, "Rank": 29 },
+       "Algebra": { "Average Score": 49.07235259762855, "Standard Deviation": null, "Rank": 33 },
+       "Probability": { "Average Score": 46.71546649299419, "Standard Deviation": null, "Rank": 28 },
+       "Logical": { "Average Score": 56.26908965013192, "Standard Deviation": null, "Rank": 27 },
+       "Social": { "Average Score": 47.84034165469707, "Standard Deviation": null, "Rank": 23 },
+       "Chemistry": { "Average Score": 55.20362543510563, "Standard Deviation": null, "Rank": 22 },
+       "CPP": { "Average Score": 50.773143448036464, "Standard Deviation": null, "Rank": 19 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "starling-lm-7b-beta",
+       "organization": "Nexusflow",
+       "license": "Apache-2.0",
+       "knowledge_cutoff": "2024/03"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 50.90398580969381, "Standard Deviation": 0.2839403187065694, "Rank": 27 },
+       "Geometry": { "Average Score": 34.653904247826965, "Standard Deviation": null, "Rank": 33 },
+       "Algebra": { "Average Score": 49.66265150940668, "Standard Deviation": null, "Rank": 32 },
+       "Probability": { "Average Score": 40.04695085773174, "Standard Deviation": null, "Rank": 32 },
+       "Logical": { "Average Score": 48.02284849364292, "Standard Deviation": null, "Rank": 29 },
+       "Social": { "Average Score": 42.82322308642107, "Standard Deviation": null, "Rank": 29 },
+       "Chemistry": { "Average Score": 40.54467030566931, "Standard Deviation": null, "Rank": 35 },
+       "CPP": { "Average Score": 38.27587102395908, "Standard Deviation": null, "Rank": 32 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "gemini-1.0-pro-001",
+       "organization": "Google",
+       "license": "Proprietary",
+       "knowledge_cutoff": "2023/04"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 37.91102687366529, "Standard Deviation": 15.15111885239772, "Rank": 38 },
+       "Geometry": { "Average Score": 35.480853719259684, "Standard Deviation": null, "Rank": 32 },
+       "Algebra": { "Average Score": 48.08542847805497, "Standard Deviation": null, "Rank": 35 },
+       "Probability": { "Average Score": 29.862669786973395, "Standard Deviation": null, "Rank": 42 },
+       "Logical": { "Average Score": 24.141794297157134, "Standard Deviation": null, "Rank": 43 },
+       "Social": { "Average Score": 15.062345665891504, "Standard Deviation": null, "Rank": 51 },
+       "Chemistry": { "Average Score": 46.52522766257804, "Standard Deviation": null, "Rank": 28 },
+       "CPP": { "Average Score": 45.22204471452975, "Standard Deviation": null, "Rank": 23 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "openchat-3.5-0106",
+       "organization": "OpenChat",
+       "license": "Apache-2.0",
+       "knowledge_cutoff": "2024/01"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 41.34314082389491, "Standard Deviation": 4.394481877390224, "Rank": 32 },
+       "Geometry": { "Average Score": 29.859015723426758, "Standard Deviation": null, "Rank": 36 },
+       "Algebra": { "Average Score": 45.79428201943078, "Standard Deviation": null, "Rank": 36 },
+       "Probability": { "Average Score": 38.766888608782956, "Standard Deviation": null, "Rank": 34 },
+       "Logical": { "Average Score": 42.1345774485532, "Standard Deviation": null, "Rank": 32 },
+       "Social": { "Average Score": 32.07155544930587, "Standard Deviation": null, "Rank": 39 },
+       "Chemistry": { "Average Score": 35.28601797606463, "Standard Deviation": null, "Rank": 37 },
+       "CPP": { "Average Score": 33.70639271807677, "Standard Deviation": null, "Rank": 33 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "openchat-3.5",
+       "organization": "OpenChat",
+       "license": "Apache-2.0",
+       "knowledge_cutoff": "2023/11"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 39.60454188051808, "Standard Deviation": 0.8232501722386516, "Rank": 36 },
+       "Geometry": { "Average Score": 30.77657388742533, "Standard Deviation": null, "Rank": 35 },
+       "Algebra": { "Average Score": 42.13028451761782, "Standard Deviation": null, "Rank": 38 },
+       "Probability": { "Average Score": 34.817635171077754, "Standard Deviation": null, "Rank": 37 },
+       "Logical": { "Average Score": 36.21944706732088, "Standard Deviation": null, "Rank": 36 },
+       "Social": { "Average Score": 37.59265084241427, "Standard Deviation": null, "Rank": 33 },
+       "Chemistry": { "Average Score": 37.21911183748652, "Standard Deviation": null, "Rank": 36 },
+       "CPP": { "Average Score": 33.020911255646965, "Standard Deviation": null, "Rank": 34 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "command-r-(08-2024)",
+       "organization": "Cohere",
+       "license": "CC-BY-NC-4.0",
+       "knowledge_cutoff": "2024/08"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 45.84310421663912, "Standard Deviation": 0.14535750785421472, "Rank": 29 },
+       "Geometry": { "Average Score": 36.33550343578038, "Standard Deviation": null, "Rank": 31 },
+       "Algebra": { "Average Score": 41.87079446639028, "Standard Deviation": null, "Rank": 39 },
+       "Probability": { "Average Score": 36.87662939858684, "Standard Deviation": null, "Rank": 36 },
+       "Logical": { "Average Score": 26.22482921268266, "Standard Deviation": null, "Rank": 41 },
+       "Social": { "Average Score": 35.11019761697373, "Standard Deviation": null, "Rank": 35 },
+       "Chemistry": { "Average Score": 41.81772722027254, "Standard Deviation": null, "Rank": 33 },
+       "CPP": { "Average Score": 39.61492485677676, "Standard Deviation": null, "Rank": 30 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "gemma-1.1-7b-it",
+       "organization": "Google",
+       "license": "Gemma License",
+       "knowledge_cutoff": "2024/02"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 35.873210924652795, "Standard Deviation": 6.462625645064649, "Rank": 37 },
+       "Geometry": { "Average Score": 25.79207201693066, "Standard Deviation": null, "Rank": 40 },
+       "Algebra": { "Average Score": 40.58046616460041, "Standard Deviation": null, "Rank": 40 },
+       "Probability": { "Average Score": 29.581773053230897, "Standard Deviation": null, "Rank": 43 },
+       "Logical": { "Average Score": 41.99821650962693, "Standard Deviation": null, "Rank": 33 },
+       "Social": { "Average Score": 24.39015213949678, "Standard Deviation": null, "Rank": 43 },
+       "Chemistry": { "Average Score": 45.01706482033765, "Standard Deviation": null, "Rank": 29 },
+       "CPP": { "Average Score": 42.666504105798204, "Standard Deviation": null, "Rank": 27 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "llama3-8b-instruct",
+       "organization": "Meta",
+       "license": "Llama 3 Community",
+       "knowledge_cutoff": "2023/03"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 39.00917270775336, "Standard Deviation": 3.999506140299149, "Rank": 39 },
+       "Geometry": { "Average Score": 29.224089668837465, "Standard Deviation": null, "Rank": 38 },
+       "Algebra": { "Average Score": 42.90961619082775, "Standard Deviation": null, "Rank": 37 },
+       "Probability": { "Average Score": 34.15721355738147, "Standard Deviation": null, "Rank": 38 },
+       "Logical": { "Average Score": 58.39773915370141, "Standard Deviation": null, "Rank": 26 },
+       "Social": { "Average Score": 40.88535401371015, "Standard Deviation": null, "Rank": 30 },
+       "Chemistry": { "Average Score": 49.70839372661025, "Standard Deviation": null, "Rank": 25 },
+       "CPP": { "Average Score": 45.35392139264795, "Standard Deviation": null, "Rank": 22 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "gemma-2-2b-it",
+       "organization": "Google",
+       "license": "Gemma License",
+       "knowledge_cutoff": "2024/07"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 57.45780847204313, "Standard Deviation": 16.310023687014333, "Rank": 22 },
+       "Geometry": { "Average Score": 29.820233374501843, "Standard Deviation": null, "Rank": 36 },
+       "Algebra": { "Average Score": 39.873024674507214, "Standard Deviation": null, "Rank": 41 },
+       "Probability": { "Average Score": 31.85692359301203, "Standard Deviation": null, "Rank": 40 },
+       "Logical": { "Average Score": 43.93437465788311, "Standard Deviation": null, "Rank": 30 },
+       "Social": { "Average Score": 44.689420554662476, "Standard Deviation": null, "Rank": 27 },
+       "Chemistry": { "Average Score": 32.05704364512495, "Standard Deviation": null, "Rank": 40 },
+       "CPP": { "Average Score": 30.53406933106768, "Standard Deviation": null, "Rank": 36 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "starling-lm-7b-alpha",
+       "organization": "Nexusflow",
+       "license": "Apache-2.0",
+       "knowledge_cutoff": "2023/11"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 40.625443347641045, "Standard Deviation": 3.0544259540377268, "Rank": 34 },
+       "Geometry": { "Average Score": 26.171147508308422, "Standard Deviation": null, "Rank": 39 },
+       "Algebra": { "Average Score": 39.149463007523856, "Standard Deviation": null, "Rank": 42 },
+       "Probability": { "Average Score": 32.36862021879827, "Standard Deviation": null, "Rank": 39 },
+       "Logical": { "Average Score": 34.17344938419256, "Standard Deviation": null, "Rank": 39 },
+       "Social": { "Average Score": 35.06966333212518, "Standard Deviation": null, "Rank": 35 },
+       "Chemistry": { "Average Score": 32.15932739848045, "Standard Deviation": null, "Rank": 39 },
+       "CPP": { "Average Score": 30.07926487356878, "Standard Deviation": null, "Rank": 37 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "qwen1.5-4b-chat",
+       "organization": "Alibaba",
+       "license": "Qianwen LICENSE",
+       "knowledge_cutoff": "2024/02"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 11.723779019126527, "Standard Deviation": 0.856230353584155, "Rank": 53 },
+       "Geometry": { "Average Score": 16.072772563608115, "Standard Deviation": null, "Rank": 45 },
+       "Algebra": { "Average Score": 32.22626131587612, "Standard Deviation": null, "Rank": 44 },
+       "Probability": { "Average Score": 13.98282712349133, "Standard Deviation": null, "Rank": 48 },
+       "Logical": { "Average Score": 13.993097991375581, "Standard Deviation": null, "Rank": 51 },
+       "Social": { "Average Score": 22.955898106386442, "Standard Deviation": null, "Rank": 45 },
+       "Chemistry": { "Average Score": 13.907481529463642, "Standard Deviation": null, "Rank": 51 },
+       "CPP": { "Average Score": 13.21208067122554, "Standard Deviation": null, "Rank": 47 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "command-r-(04-2024)",
+       "organization": "Cohere",
+       "license": "CC-BY-NC-4.0",
+       "knowledge_cutoff": "2024/04"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 43.08187135994592, "Standard Deviation": 0.7654553730614279, "Rank": 33 },
+       "Geometry": { "Average Score": 24.037084801508428, "Standard Deviation": null, "Rank": 41 },
+       "Algebra": { "Average Score": 32.37474440275246, "Standard Deviation": null, "Rank": 43 },
+       "Probability": { "Average Score": 31.014039425232298, "Standard Deviation": null, "Rank": 41 },
+       "Logical": { "Average Score": 35.49507014348235, "Standard Deviation": null, "Rank": 37 },
+       "Social": { "Average Score": 34.782695172510856, "Standard Deviation": null, "Rank": 37 },
+       "Chemistry": { "Average Score": 42.46395478814961, "Standard Deviation": null, "Rank": 32 },
+       "CPP": { "Average Score": 41.346336503003236, "Standard Deviation": null, "Rank": 28 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "vicuna-33b",
+       "organization": "LMSYS",
+       "license": "Non-commercial",
+       "knowledge_cutoff": "2023/08"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 30.8582386682731, "Standard Deviation": 2.3851186735858945, "Rank": 42 },
+       "Geometry": { "Average Score": 17.058968577112452, "Standard Deviation": null, "Rank": 44 },
+       "Algebra": { "Average Score": 25.22004544023738, "Standard Deviation": null, "Rank": 45 },
+       "Probability": { "Average Score": 21.097169680647767, "Standard Deviation": null, "Rank": 46 },
+       "Logical": { "Average Score": 23.212667585279515, "Standard Deviation": null, "Rank": 45 },
+       "Social": { "Average Score": 32.357116321848025, "Standard Deviation": null, "Rank": 38 },
+       "Chemistry": { "Average Score": 29.376389899632898, "Standard Deviation": null, "Rank": 42 },
+       "CPP": { "Average Score": 28.01838653090379, "Standard Deviation": null, "Rank": 38 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "gemma-7b-it",
+       "organization": "Google",
+       "license": "Gemma License",
+       "knowledge_cutoff": "2024/02"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 27.609692676933715, "Standard Deviation": 5.8350892031427435, "Rank": 45 },
+       "Geometry": { "Average Score": 20.127802528542947, "Standard Deviation": null, "Rank": 42 },
+       "Algebra": { "Average Score": 23.46400816161807, "Standard Deviation": null, "Rank": 47 },
+       "Probability": { "Average Score": 17.139514453170445, "Standard Deviation": null, "Rank": 47 },
+       "Logical": { "Average Score": 24.625290351028372, "Standard Deviation": null, "Rank": 42 },
+       "Social": { "Average Score": 26.715025606557614, "Standard Deviation": null, "Rank": 42 },
+       "Chemistry": { "Average Score": 29.383105099269972, "Standard Deviation": null, "Rank": 41 },
+       "CPP": { "Average Score": 28.014658234926813, "Standard Deviation": null, "Rank": 39 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "mistral-7b-instruct-2",
+       "organization": "Mistral",
+       "license": "Apache 2.0",
+       "knowledge_cutoff": "2023/12"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 32.583755237895794, "Standard Deviation": 1.6860156811686553, "Rank": 40 },
+       "Geometry": { "Average Score": 17.27716649229315, "Standard Deviation": null, "Rank": 43 },
+       "Algebra": { "Average Score": 23.58916877939791, "Standard Deviation": null, "Rank": 46 },
+       "Probability": { "Average Score": 25.1012270940144, "Standard Deviation": null, "Rank": 44 },
+       "Logical": { "Average Score": 29.07002036532878, "Standard Deviation": null, "Rank": 40 },
+       "Social": { "Average Score": 24.39006275978174, "Standard Deviation": null, "Rank": 43 },
+       "Chemistry": { "Average Score": 32.76096708662236, "Standard Deviation": null, "Rank": 38 },
+       "CPP": { "Average Score": 31.382959631870822, "Standard Deviation": null, "Rank": 35 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "mistral-7b-instruct-1",
+       "organization": "Mistral",
+       "license": "Apache 2.0",
+       "knowledge_cutoff": "2023/12"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 22.167930858422395, "Standard Deviation": 3.328543828571604, "Rank": 50 },
+       "Geometry": { "Average Score": 11.300762460776488, "Standard Deviation": null, "Rank": 49 },
+       "Algebra": { "Average Score": 21.016466430115493, "Standard Deviation": null, "Rank": 48 },
+       "Probability": { "Average Score": 24.506863192031716, "Standard Deviation": null, "Rank": 45 },
+       "Logical": { "Average Score": 17.0066100312336, "Standard Deviation": null, "Rank": 49 },
+       "Social": { "Average Score": 14.049392081101905, "Standard Deviation": null, "Rank": 52 },
+       "Chemistry": { "Average Score": 20.796521445473058, "Standard Deviation": null, "Rank": 45 },
+       "CPP": { "Average Score": 18.929093202755805, "Standard Deviation": null, "Rank": 42 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "vicuna-13b",
+       "organization": "LMSYS",
+       "license": "Non-commercial",
+       "knowledge_cutoff": "2023/07"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 20.105123059326157, "Standard Deviation": 4.100609090750239, "Rank": 51 },
+       "Geometry": { "Average Score": 13.080654946737525, "Standard Deviation": null, "Rank": 48 },
+       "Algebra": { "Average Score": 20.125194674408167, "Standard Deviation": null, "Rank": 49 },
+       "Probability": { "Average Score": 13.125942598704368, "Standard Deviation": null, "Rank": 49 },
+       "Logical": { "Average Score": 17.182300978389822, "Standard Deviation": null, "Rank": 48 },
+       "Social": { "Average Score": 16.258399348520832, "Standard Deviation": null, "Rank": 50 },
+       "Chemistry": { "Average Score": 23.79065696739089, "Standard Deviation": null, "Rank": 44 },
+       "CPP": { "Average Score": 21.840013221590294, "Standard Deviation": null, "Rank": 40 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "zephyr-7b-beta",
+       "organization": "HuggingFace",
+       "license": "MIT",
+       "knowledge_cutoff": "2023/10"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 11.581258432641418, "Standard Deviation": 1.677081510212375, "Rank": 54 },
+       "Geometry": { "Average Score": 8.432624521698594, "Standard Deviation": null, "Rank": 50 },
+       "Algebra": { "Average Score": 12.912859660357217, "Standard Deviation": null, "Rank": 51 },
+       "Probability": { "Average Score": 7.643552619113196, "Standard Deviation": null, "Rank": 54 },
+       "Logical": { "Average Score": 7.444095116649809, "Standard Deviation": null, "Rank": 55 },
+       "Social": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 },
+       "Chemistry": { "Average Score": 16.150157007299235, "Standard Deviation": null, "Rank": 49 },
+       "CPP": { "Average Score": 18.92902220864132, "Standard Deviation": null, "Rank": 43 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "gemma-1.1-2b-it",
+       "organization": "Google",
+       "license": "Gemma License",
+       "knowledge_cutoff": "2024/02"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 25.06653151900311, "Standard Deviation": 5.340973431345662, "Rank": 48 },
+       "Geometry": { "Average Score": 13.161686218568628, "Standard Deviation": null, "Rank": 47 },
+       "Algebra": { "Average Score": 15.592205919293873, "Standard Deviation": null, "Rank": 50 },
+       "Probability": { "Average Score": 8.305764696120711, "Standard Deviation": null, "Rank": 51 },
+       "Logical": { "Average Score": 10.940766703849592, "Standard Deviation": null, "Rank": 53 },
+       "Social": { "Average Score": 21.925546766366356, "Standard Deviation": null, "Rank": 46 },
+       "Chemistry": { "Average Score": 18.700936936742952, "Standard Deviation": null, "Rank": 46 },
+       "CPP": { "Average Score": 20.724691953843916, "Standard Deviation": null, "Rank": 41 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "llama2-7b-chat",
+       "organization": "Meta",
+       "license": "Llama 2 Community",
+       "knowledge_cutoff": "2023/07"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 25.633612357313762, "Standard Deviation": 2.805639153654191, "Rank": 46 },
+       "Geometry": { "Average Score": 5.825877827672446, "Standard Deviation": null, "Rank": 51 },
+       "Algebra": { "Average Score": 8.58657284915635, "Standard Deviation": null, "Rank": 53 },
+       "Probability": { "Average Score": 8.164826137672431, "Standard Deviation": null, "Rank": 53 },
+       "Logical": { "Average Score": 20.697630462723275, "Standard Deviation": null, "Rank": 47 },
+       "Social": { "Average Score": 18.13821609304045, "Standard Deviation": null, "Rank": 47 },
+       "Chemistry": { "Average Score": 17.065363968846427, "Standard Deviation": null, "Rank": 47 },
+       "CPP": { "Average Score": 15.730513733660898, "Standard Deviation": null, "Rank": 45 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "gemma-2b-it",
+       "organization": "Google",
+       "license": "Gemma License",
+       "knowledge_cutoff": "2024/02"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 22.935122315202772, "Standard Deviation": 1.9451357494738446, "Rank": 49 },
+       "Geometry": { "Average Score": 15.523844579555126, "Standard Deviation": null, "Rank": 46 },
+       "Algebra": { "Average Score": 8.997563653883809, "Standard Deviation": null, "Rank": 52 },
+       "Probability": { "Average Score": 6.750305898269558, "Standard Deviation": null, "Rank": 55 },
+       "Logical": { "Average Score": 5.354222904092569, "Standard Deviation": null, "Rank": 56 },
+       "Social": { "Average Score": 10.938132042877358, "Standard Deviation": null, "Rank": 54 },
+       "Chemistry": { "Average Score": 17.06532733699507, "Standard Deviation": null, "Rank": 47 },
+       "CPP": { "Average Score": 17.2715657115764, "Standard Deviation": null, "Rank": 44 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "llama2-13b-chat",
+       "organization": "Meta",
+       "license": "Llama 2 Community",
+       "knowledge_cutoff": "2023/07"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 25.828530292775856, "Standard Deviation": 3.2503558704879296, "Rank": 47 },
+       "Geometry": { "Average Score": 4.119943280135397, "Standard Deviation": null, "Rank": 53 },
+       "Algebra": { "Average Score": 6.355347828676415, "Standard Deviation": null, "Rank": 54 },
+       "Probability": { "Average Score": 11.5585998384148, "Standard Deviation": null, "Rank": 50 },
+       "Logical": { "Average Score": 24.172674067890938, "Standard Deviation": null, "Rank": 43 },
+       "Social": { "Average Score": 17.850287642446094, "Standard Deviation": null, "Rank": 49 },
+       "Chemistry": { "Average Score": 13.887442704655687, "Standard Deviation": null, "Rank": 52 },
+       "CPP": { "Average Score": 13.17258252933903, "Standard Deviation": null, "Rank": 48 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "vicuna-7b",
+       "organization": "LMSYS",
+       "license": "Non-commercial",
+       "knowledge_cutoff": "2023/07"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 19.78471384913738, "Standard Deviation": 3.7936645273402276, "Rank": 52 },
+       "Geometry": { "Average Score": 5.434763675792798, "Standard Deviation": null, "Rank": 52 },
+       "Algebra": { "Average Score": 5.925959137419872, "Standard Deviation": null, "Rank": 55 },
+       "Probability": { "Average Score": 8.30566475354697, "Standard Deviation": null, "Rank": 51 },
+       "Logical": { "Average Score": 11.881223740003346, "Standard Deviation": null, "Rank": 52 },
+       "Social": { "Average Score": 12.864677350128595, "Standard Deviation": null, "Rank": 53 },
+       "Chemistry": { "Average Score": 14.187574975522333, "Standard Deviation": null, "Rank": 50 },
+       "CPP": { "Average Score": 14.255194156624162, "Standard Deviation": null, "Rank": 46 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "koala-13b",
+       "organization": "UC Berkeley",
+       "license": "Non-commercial",
+       "knowledge_cutoff": "2023/04"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 10.216910767982592, "Standard Deviation": 2.0597606260293655, "Rank": 55 },
+       "Geometry": { "Average Score": 0.1600118163292883, "Standard Deviation": null, "Rank": 54 },
+       "Algebra": { "Average Score": 2.2219841274068948, "Standard Deviation": null, "Rank": 56 },
+       "Probability": { "Average Score": 3.353938470588142, "Standard Deviation": null, "Rank": 56 },
+       "Logical": { "Average Score": 8.24436273551765, "Standard Deviation": null, "Rank": 54 },
+       "Social": { "Average Score": 10.96000067573448, "Standard Deviation": null, "Rank": 54 },
+       "Chemistry": { "Average Score": 6.272570799004611, "Standard Deviation": null, "Rank": 53 },
+       "CPP": { "Average Score": 6.36433272373514, "Standard Deviation": null, "Rank": 49 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "openassistant-pythia-12b",
+       "organization": "OpenAssistant",
+       "license": "Non-commercial",
+       "knowledge_cutoff": "2023/04"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 56 },
+       "Geometry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 55 },
+       "Algebra": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 },
+       "Probability": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 },
+       "Logical": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 },
+       "Social": { "Average Score": 1.859688217710296, "Standard Deviation": null, "Rank": 56 },
+       "Chemistry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 54 },
+       "CPP": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 50 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "nemotron-70b",
+       "organization": "NVIDIA",
+       "license": "Unknown",
+       "knowledge_cutoff": "Unknown"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 100.0, "Standard Deviation": 0.0, "Rank": 1 },
+       "Geometry": { "Average Score": 68.72757963233221, "Standard Deviation": null, "Rank": 12 },
+       "Algebra": { "Average Score": 73.71625129267943, "Standard Deviation": null, "Rank": 16 },
+       "Chemistry": { "Average Score": 72.48678626772566, "Standard Deviation": null, "Rank": 14 },
+       "Logical": { "Average Score": 92.57864400540329, "Standard Deviation": null, "Rank": 5 },
+       "Social": { "Average Score": 99.63342284899149, "Standard Deviation": null, "Rank": 2 },
+       "Probability": { "Average Score": 75.30735899300154, "Standard Deviation": null, "Rank": 10 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "llama-3.2-3b-it",
+       "organization": "Meta",
+       "license": "Llama 3 Community",
+       "knowledge_cutoff": "Unknown"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 29.47099904114387, "Standard Deviation": 1.6836027650802912, "Rank": 43 },
+       "Geometry": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 50 },
+       "Algebra": { "Average Score": 55.31592410564261, "Standard Deviation": null, "Rank": 29 },
+       "Chemistry": { "Average Score": 28.667640602193643, "Standard Deviation": null, "Rank": 43 },
+       "Logical": { "Average Score": 15.35430947415723, "Standard Deviation": null, "Rank": 49 },
+       "Social": { "Average Score": 18.087938295545133, "Standard Deviation": null, "Rank": 48 },
+       "Probability": { "Average Score": 37.84631410688676, "Standard Deviation": null, "Rank": 35 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "yi-lightning",
+       "organization": "01 AI",
+       "license": "Proprietary",
+       "knowledge_cutoff": "Unknown"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 96.10303362688546, "Standard Deviation": 0.5365246195716372, "Rank": 3 },
+       "Geometry": { "Average Score": 77.09570683128703, "Standard Deviation": null, "Rank": 8 },
+       "Algebra": { "Average Score": 85.92132293392635, "Standard Deviation": null, "Rank": 6 },
+       "Chemistry": { "Average Score": 95.7205664118507, "Standard Deviation": null, "Rank": 2 },
+       "Logical": { "Average Score": 94.60171867702756, "Standard Deviation": null, "Rank": 4 },
+       "Social": { "Average Score": 93.93680225135506, "Standard Deviation": null, "Rank": 6 },
+       "Probability": { "Average Score": 90.23858748317501, "Standard Deviation": null, "Rank": 3 }
+     }
+   },
+   {
+     "config": {
+       "model_name": "glm-4-plus",
+       "organization": "Zhipu AI",
+       "license": "Proprietary",
+       "knowledge_cutoff": "Unknown"
+     },
+     "results": {
+       "OVERALL": { "Average Score": 90.50303579501356, "Standard Deviation": 5.202472970969946, "Rank": 6 },
+       "Geometry": { "Average Score": 76.37543021571776, "Standard Deviation": null, "Rank": 9 },
+       "Algebra": { "Average Score": 81.39859078752944, "Standard Deviation": null, "Rank": 10 },
+       "Chemistry": { "Average Score": 90.15506569759444, "Standard Deviation": null, "Rank": 6 },
+       "Logical": { "Average Score": 92.26403821208403, "Standard Deviation": null, "Rank": 6 },
+       "Social": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 },
+       "Probability": { "Average Score": 73.99418447190348, "Standard Deviation": null, "Rank": 11 }
+     }
+   },
+ ]
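For reference, a minimal standalone sketch of how a results file with this schema (a JSON array of `config` / `results` entries, each dimension carrying "Average Score", "Standard Deviation", and "Rank") could be flattened into rows for a leaderboard table. The file path, function name, and column names below are illustrative assumptions, not the Space's own loader.

```python
# Illustrative sketch only: flattens a results JSON with the schema shown above
# into one row per (model, dimension). Path and names are hypothetical.
import json

import pandas as pd

RESULTS_PATH = "./src/results/models_example.json"  # hypothetical path


def load_results(path: str) -> pd.DataFrame:
    """Read the JSON array and emit one row per model and dimension."""
    with open(path) as f:
        entries = json.load(f)

    rows = []
    for entry in entries:
        cfg = entry["config"]
        for dimension, res in entry["results"].items():
            rows.append(
                {
                    "model": cfg["model_name"],
                    "organization": cfg["organization"],
                    "license": cfg["license"],
                    "knowledge_cutoff": cfg["knowledge_cutoff"],
                    "dimension": dimension,
                    "score": res["Average Score"],
                    "sd": res["Standard Deviation"],  # JSON null becomes None
                    "rank": res["Rank"],
                }
            )
    return pd.DataFrame(rows)


if __name__ == "__main__":
    df = load_results(RESULTS_PATH)
    # Example: the overall ranking, sorted by rank
    print(df[df["dimension"] == "OVERALL"].sort_values("rank").head())
```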