Adding Evaluation Results

This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr

The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions

Files changed (1)

README.md +64 -47

README.md CHANGED

@@ -1,4 +1,6 @@
 ---
 license: mit
 library_name: peft
 tags:
@@ -6,12 +8,14 @@ tags:
 - sft
 - generated_from_trainer
 - pytorch
 base_model: microsoft/phi-2
 model-index:
 - name: phi-2-basic-maths
   results:
-  # AI2 Reasoning Challenge (25-Shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -22,15 +26,13 @@ model-index:
       args:
         num_few_shot: 25
     metrics:
-       - type: acc_norm
-         name: normalized accuracy
-         value: 55.80
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
-  # HellaSwag (10-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -40,15 +42,13 @@ model-index:
       args:
         num_few_shot: 10
     metrics:
-       - type: acc_norm
-         name: normalized accuracy
-         value: 71.15
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
-  # MMLU (5-Shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -59,15 +59,13 @@ model-index:
       args:
         num_few_shot: 5
     metrics:
-       - type: acc
-         name: accuracy
-         value: 47.27
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
-  # Winogrande (5-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -78,15 +76,13 @@ model-index:
       args:
         num_few_shot: 5
     metrics:
-       - type: acc
-         name: accuracy
-         value: 75.3
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
-  # truthfulqa (0-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -97,15 +93,13 @@ model-index:
       args:
         num_few_shot: 0
     metrics:
-       - type: mc2
-         name: mc2
-         value: 41.40
     source:
-      name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
-  # GSM8k (5-shot)
-  - task:
       type: text-generation
       name: Text Generation
     dataset:
@@ -116,18 +110,28 @@ model-index:
       args:
         num_few_shot: 5
     metrics:
-       - type: acc
-         name: accuracy
-         value: 30.70
     source:
       name: Open LLM Leaderboard
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
-datasets:
-- gsm8k
-language:
-- en
-pipeline_tag: text-generation
 ---
 # phi-2-basic-maths
@@ -196,4 +200,17 @@ Unclear answers: 7.81%
 - Transformers 4.38.0.dev0
 - Pytorch 2.1.0+cu121
 - Datasets 2.16.1
-- Tokenizers 0.15.1

 ---
+language:
+- en
 license: mit
 library_name: peft
 tags:
 - sft
 - generated_from_trainer
 - pytorch
+datasets:
+- gsm8k
 base_model: microsoft/phi-2
+pipeline_tag: text-generation
 model-index:
 - name: phi-2-basic-maths
   results:
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 25
     metrics:
+    - type: acc_norm
+      value: 55.8
+      name: normalized accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 10
     metrics:
+    - type: acc_norm
+      value: 71.15
+      name: normalized accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 5
     metrics:
+    - type: acc
+      value: 47.27
+      name: accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 5
     metrics:
+    - type: acc
+      value: 75.3
+      name: accuracy
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 0
     metrics:
+    - type: mc2
+      value: 41.4
+      name: mc2
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
+      name: Open LLM Leaderboard
+  - task:
       type: text-generation
       name: Text Generation
     dataset:
       args:
         num_few_shot: 5
     metrics:
+    - type: acc
+      value: 30.7
+      name: accuracy
     source:
+      url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
       name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: TruthfulQA (0-shot)
+      type: truthful_qa
+      config: multiple_choice
+      split: validation
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: mc2
+      value: 41.4
+    source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Menouar/phi-2-basic-maths
+      name: Open LLM Leaderboard
 ---
 # phi-2-basic-maths
 - Transformers 4.38.0.dev0
 - Pytorch 2.1.0+cu121
 - Datasets 2.16.1
+- Tokenizers 0.15.1
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_Menouar__phi-2-basic-maths)
+|             Metric              |Value|
+|---------------------------------|----:|
+|Avg.                             |53.60|
+|AI2 Reasoning Challenge (25-Shot)|55.80|
+|HellaSwag (10-Shot)              |71.15|
+|MMLU (5-Shot)                    |47.27|
+|TruthfulQA (0-shot)              |41.40|
+|Winogrande (5-shot)              |75.30|
+|GSM8k (5-shot)                   |30.71|