Adding Evaluation Results

This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr

The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions

Files changed (1)

README.md +137 -27

README.md CHANGED

@@ -1,34 +1,13 @@
 ---
-language:
 - en
 - fr
 - ro
 - de
 - multilingual
 tags:
 - text2text-generation
-widget:
-- text: "Translate to German:  My name is Arthur"
-  example_title: "Translation"
-- text: "Please answer to the following question. Who is going to be the next Ballon d'or?"
-  example_title: "Question Answering"
-- text: "Q: Can Geoffrey Hinton have a conversation with George Washington? Give the rationale before answering."
-  example_title: "Logical reasoning"
-- text: "Please answer the following question. What is the boiling point of Nitrogen?"
-  example_title: "Scientific knowledge"
-- text: "Answer the following yes/no question. Can you write a whole Haiku in a single tweet?"
-  example_title: "Yes/no question"
-- text: "Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?"
-  example_title: "Reasoning task"
-- text: "Q: ( False or not False or False ) is? A: Let's think step by step"
-  example_title: "Boolean Expressions"
-- text: "The square root of x is the cube root of y. What is y to the power of 2, if x = 4?"
-  example_title: "Math reasoning"
-- text: "Premise:  At my age you will probably have learnt one lesson. Hypothesis:  It's not certain how many lessons you'll learn by your thirties. Does the premise entail the hypothesis?"
-  example_title: "Premise and hypothesis"
 datasets:
 - svakulenk0/qrecc
 - taskmaster2
@@ -40,9 +19,127 @@ datasets:
 - esnli
 - quasc
 - qed
-license: apache-2.0
 ---
 # Model Card for FLAN-T5 small
@@ -273,4 +370,17 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
   copyright = {Creative Commons Attribution 4.0 International}
 }
-```

 ---
+language:
 - en
 - fr
 - ro
 - de
 - multilingual
+license: apache-2.0
 tags:
 - text2text-generation
 datasets:
 - svakulenk0/qrecc
 - taskmaster2
 - esnli
 - quasc
 - qed
+widget:
+- text: 'Translate to German:  My name is Arthur'
+  example_title: Translation
+- text: Please answer to the following question. Who is going to be the next Ballon
+    d'or?
+  example_title: Question Answering
+- text: 'Q: Can Geoffrey Hinton have a conversation with George Washington? Give the
+    rationale before answering.'
+  example_title: Logical reasoning
+- text: Please answer the following question. What is the boiling point of Nitrogen?
+  example_title: Scientific knowledge
+- text: Answer the following yes/no question. Can you write a whole Haiku in a single
+    tweet?
+  example_title: Yes/no question
+- text: Answer the following yes/no question by reasoning step-by-step. Can you write
+    a whole Haiku in a single tweet?
+  example_title: Reasoning task
+- text: 'Q: ( False or not False or False ) is? A: Let''s think step by step'
+  example_title: Boolean Expressions
+- text: The square root of x is the cube root of y. What is y to the power of 2, if
+    x = 4?
+  example_title: Math reasoning
+- text: 'Premise:  At my age you will probably have learnt one lesson. Hypothesis:  It''s
+    not certain how many lessons you''ll learn by your thirties. Does the premise
+    entail the hypothesis?'
+  example_title: Premise and hypothesis
+model-index:
+- name: flan-t5-small
+  results:
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: IFEval (0-Shot)
+      type: HuggingFaceH4/ifeval
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: inst_level_strict_acc and prompt_level_strict_acc
+      value: 15.24
+      name: strict accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=google/flan-t5-small
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: BBH (3-Shot)
+      type: BBH
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc_norm
+      value: 6.36
+      name: normalized accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=google/flan-t5-small
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MATH Lvl 5 (4-Shot)
+      type: hendrycks/competition_math
+      args:
+        num_few_shot: 4
+    metrics:
+    - type: exact_match
+      value: 0.0
+      name: exact match
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=google/flan-t5-small
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: GPQA (0-shot)
+      type: Idavidrein/gpqa
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: acc_norm
+      value: 1.45
+      name: acc_norm
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=google/flan-t5-small
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MuSR (0-shot)
+      type: TAUR-Lab/MuSR
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: acc_norm
+      value: 10.37
+      name: acc_norm
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=google/flan-t5-small
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MMLU-PRO (5-shot)
+      type: TIGER-Lab/MMLU-Pro
+      config: main
+      split: test
+      args:
+        num_few_shot: 5
+    metrics:
+    - type: acc
+      value: 2.59
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=google/flan-t5-small
+      name: Open LLM Leaderboard
 ---
 # Model Card for FLAN-T5 small
   copyright = {Creative Commons Attribution 4.0 International}
 }
+```
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_google__flan-t5-small)
+|      Metric       |Value|
+|-------------------|----:|
+|Avg.               | 6.00|
+|IFEval (0-Shot)    |15.24|
+|BBH (3-Shot)       | 6.36|
+|MATH Lvl 5 (4-Shot)| 0.00|
+|GPQA (0-shot)      | 1.45|
+|MuSR (0-shot)      |10.37|
+|MMLU-PRO (5-shot)  | 2.59|