Spaces:

ricdomolm
/

caselawqa_leaderboard

Restarting

App Files Files Community

Nathan Habib committed on Nov 9, 2023

Commit

e3aaf53

•

1 Parent(s): 26286b2

add new evals to the leaderboard

Browse files

Files changed (5) hide show

app.py +22 -19
src/assets/hardcoded_evals.py +3 -0
src/assets/text_content.py +53 -1
src/get_model_info/utils.py +3 -0
src/plots/read_results.py +6 -3

app.py CHANGED Viewed

@@ -88,6 +88,9 @@ BENCHMARK_COLS = [
         AutoEvalColumn.hellaswag,
         AutoEvalColumn.mmlu,
         AutoEvalColumn.truthfulqa,
     ]
 ]
@@ -107,7 +110,7 @@ update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
-plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 to_be_dumped = f"models = {repr(models)}\n"
 (
@@ -516,24 +519,24 @@ with demo:
                 queue=True,
             )
-        with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
-            with gr.Row():
-                with gr.Column():
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        ["Average ⬆️"],
-                        HUMAN_BASELINES,
-                        title="Average of Top Scores and Human Baseline Over Time",
-                    )
-                    gr.Plot(value=chart, interactive=False, width=500, height=500)
-                with gr.Column():
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        ["ARC", "HellaSwag", "MMLU", "TruthfulQA"],
-                        HUMAN_BASELINES,
-                        title="Top Scores and Human Baseline Over Time",
-                    )
-                    gr.Plot(value=chart, interactive=False, width=500, height=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

         AutoEvalColumn.hellaswag,
         AutoEvalColumn.mmlu,
         AutoEvalColumn.truthfulqa,
+        AutoEvalColumn.winogrande,
+        AutoEvalColumn.gsm8k,
+        AutoEvalColumn.drop
     ]
 ]
 leaderboard_df = original_df.copy()
 models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
+#plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 to_be_dumped = f"models = {repr(models)}\n"
 (
                 queue=True,
             )
+        # with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
+        #     with gr.Row():
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["Average ⬆️"],
+        #                 HUMAN_BASELINES,
+        #                 title="Average of Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K", "DROP"],
+        #                 HUMAN_BASELINES,
+        #                 title="Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

src/assets/hardcoded_evals.py CHANGED Viewed

@@ -35,6 +35,9 @@ baseline = {
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }

     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
+    AutoEvalColumn.winogrande.name: 50.0,
+    AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }

src/assets/text_content.py CHANGED Viewed

@@ -31,7 +31,10 @@ If there is no icon, we have not uploaded the information on the model yet, feel
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank">  TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
 For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
@@ -55,6 +58,14 @@ The tasks and few shots parameters are:
 - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
 ## Quantization
 To get more information about quantization, see:
@@ -166,4 +177,45 @@ CITATION_BUTTON_TEXT = r"""
       eprint={2109.07958},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
 }"""

 - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank">  TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually at minimum a 6-shot task, as 6 examples are systematically prepended to it, even when launched using 0 for the number of few-shot examples.
+- <a href="https://arxiv.org/abs/1907.10641" target="_blank">  Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
+- <a href="https://arxiv.org/abs/2110.14168" target="_blank">  GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
+- <a href="https://arxiv.org/abs/1903.00161" target="_blank">  DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
 For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
+- Winogrande: 5-shot, *winogrande* (`acc`)
+- GSM8k: 5-shot, *gsm8k* (`acc`)
+- DROP: 3-shot, *drop* (`f1`)
+Side note on the baseline scores:
+- for log-likelihood evaluation, we select the random baseline
+- for DROP, we select the best submission score according to [their leaderboard](https://leaderboard.allenai.org/drop/submissions/public) when the paper came out (NAQANet score)
+- for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
 ## Quantization
 To get more information about quantization, see:
       eprint={2109.07958},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+      title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+      author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+      year={2019},
+      eprint={1907.10641},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+      title={Training Verifiers to Solve Math Word Problems},
+      author={Karl Cobbe and
+                  Vineet Kosaraju and
+                  Mohammad Bavarian and
+                  Mark Chen and
+                  Heewoo Jun and
+                  Lukasz Kaiser and
+                  Matthias Plappert and
+                  Jerry Tworek and
+                  Jacob Hilton and
+                  Reiichiro Nakano and
+                  Christopher Hesse and
+                  John Schulman},
+      year={2021},
+      eprint={2110.14168},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1903-00161,
+      title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
+                  Over Paragraphs},
+      author={Dheeru Dua and
+                  Yizhong Wang and
+                  Pradeep Dasigi and
+                  Gabriel Stanovsky and
+                  Sameer Singh and
+                  Matt Gardner},
+      year={2019},
+      eprinttype={arXiv},
+      eprint={1903.00161},
+      primaryClass={cs.CL}
 }"""

src/get_model_info/utils.py CHANGED Viewed

@@ -29,6 +29,9 @@ class AutoEvalColumn:  # Auto evals column
     hellaswag = ColumnContent("HellaSwag", "number", True)
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA", "number", True)
     model_type = ColumnContent("Type", "str", False)
     precision = ColumnContent("Precision", "str", False)  # , True)
     license = ColumnContent("Hub License", "str", False)

     hellaswag = ColumnContent("HellaSwag", "number", True)
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA", "number", True)
+    winogrande = ColumnContent("Winogrande", "number", True)
+    gsm8k = ColumnContent("GSM8K", "number", True)
+    drop = ColumnContent("DROP", "number", True)
     model_type = ColumnContent("Type", "str", False)
     precision = ColumnContent("Precision", "str", False)  # , True)
     license = ColumnContent("Hub License", "str", False)

src/plots/read_results.py CHANGED Viewed

@@ -8,13 +8,16 @@ import numpy as np
 from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
-METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
-BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
 BENCH_TO_NAME = {
     "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
     "hendrycksTest": AutoEvalColumn.mmlu.name,
     "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
 }
@@ -46,7 +49,7 @@ class EvalResult:
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
-        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
         data_dict[AutoEvalColumn.still_on_hub.name] = (
             is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
         )

 from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
+METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
+BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "winogrande", "gsm8k", "drop"]
 BENCH_TO_NAME = {
     "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
     "hendrycksTest": AutoEvalColumn.mmlu.name,
     "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
+    "winogrande": AutoEvalColumn.winogrande.name,
+    "gsm8k": AutoEvalColumn.gsm8k.name,
+    "drop": AutoEvalColumn.drop.name,
 }
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
+        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 7.0
         data_dict[AutoEvalColumn.still_on_hub.name] = (
             is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
         )