MaziyarPanahi's picture
Adding Evaluation Results (#2)
d80326d verified
---
license: apache-2.0
library_name: transformers
tags:
- merge
pipeline_tag: text-generation
model-index:
- name: TheTop-5x7B-Instruct-S4-v0.1
results:
- task:
type: text-generation
name: Text Generation
dataset:
name: AI2 Reasoning Challenge (25-Shot)
type: ai2_arc
config: ARC-Challenge
split: test
args:
num_few_shot: 25
metrics:
- type: acc_norm
value: 72.18
name: normalized accuracy
source:
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S4-v0.1
name: Open LLM Leaderboard
- task:
type: text-generation
name: Text Generation
dataset:
name: HellaSwag (10-Shot)
type: hellaswag
split: validation
args:
num_few_shot: 10
metrics:
- type: acc_norm
value: 88.29
name: normalized accuracy
source:
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S4-v0.1
name: Open LLM Leaderboard
- task:
type: text-generation
name: Text Generation
dataset:
name: MMLU (5-Shot)
type: cais/mmlu
config: all
split: test
args:
num_few_shot: 5
metrics:
- type: acc
value: 65.03
name: accuracy
source:
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S4-v0.1
name: Open LLM Leaderboard
- task:
type: text-generation
name: Text Generation
dataset:
name: TruthfulQA (0-shot)
type: truthful_qa
config: multiple_choice
split: validation
args:
num_few_shot: 0
metrics:
- type: mc2
value: 65.56
source:
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S4-v0.1
name: Open LLM Leaderboard
- task:
type: text-generation
name: Text Generation
dataset:
name: Winogrande (5-shot)
type: winogrande
config: winogrande_xl
split: validation
args:
num_few_shot: 5
metrics:
- type: acc
value: 85.16
name: accuracy
source:
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S4-v0.1
name: Open LLM Leaderboard
- task:
type: text-generation
name: Text Generation
dataset:
name: GSM8k (5-shot)
type: gsm8k
config: main
split: test
args:
num_few_shot: 5
metrics:
- type: acc
value: 73.39
name: accuracy
source:
url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=MaziyarPanahi/TheTop-5x7B-Instruct-S4-v0.1
name: Open LLM Leaderboard
---
Merge of top 7B models and the SLERP of other 7B models
> mergekit is a toolkit for merging pre-trained language models. mergekit uses an out-of-core approach to perform unreasonably elaborate merges in resource-constrained situations. Merges can be run entirely on CPU or accelerated with as little as 8 GB of VRAM. Many merging algorithms are supported, with more coming as they catch my attention.
>
> ## Eval
![image/png](https://cdn-uploads.huggingface.co/production/uploads/5fd5e18a90b6dc4633f6d292/3a2An3rpaLMusQrtQ74Up.png)
```python
{
"all": {
"acc": 0.6568351479800627,
"acc_stderr": 0.03199600851869088,
"acc_norm": 0.6554901222242155,
"acc_norm_stderr": 0.03267670432184765,
"mc1": 0.5104039167686658,
"mc1_stderr": 0.017499711430249268,
"mc2": 0.6556430108444109,
"mc2_stderr": 0.015519025079862213
},
"harness|arc:challenge|25": {
"acc": 0.6919795221843004,
"acc_stderr": 0.013491429517292038,
"acc_norm": 0.7218430034129693,
"acc_norm_stderr": 0.013094469919538812
},
"harness|hellaswag|10": {
"acc": 0.7202748456482773,
"acc_stderr": 0.0044794676194648,
"acc_norm": 0.8828918542123083,
"acc_norm_stderr": 0.003208919510309931
},
"harness|hendrycksTest-abstract_algebra|5": {
"acc": 0.33,
"acc_stderr": 0.047258156262526045,
"acc_norm": 0.33,
"acc_norm_stderr": 0.047258156262526045
},
"harness|hendrycksTest-anatomy|5": {
"acc": 0.6518518518518519,
"acc_stderr": 0.041153246103369526,
"acc_norm": 0.6518518518518519,
"acc_norm_stderr": 0.041153246103369526
},
"harness|hendrycksTest-astronomy|5": {
"acc": 0.7039473684210527,
"acc_stderr": 0.03715062154998904,
"acc_norm": 0.7039473684210527,
"acc_norm_stderr": 0.03715062154998904
},
"harness|hendrycksTest-business_ethics|5": {
"acc": 0.66,
"acc_stderr": 0.04760952285695238,
"acc_norm": 0.66,
"acc_norm_stderr": 0.04760952285695238
},
"harness|hendrycksTest-clinical_knowledge|5": {
"acc": 0.6981132075471698,
"acc_stderr": 0.02825420034443866,
"acc_norm": 0.6981132075471698,
"acc_norm_stderr": 0.02825420034443866
},
"harness|hendrycksTest-college_biology|5": {
"acc": 0.7708333333333334,
"acc_stderr": 0.03514697467862388,
"acc_norm": 0.7708333333333334,
"acc_norm_stderr": 0.03514697467862388
},
"harness|hendrycksTest-college_chemistry|5": {
"acc": 0.48,
"acc_stderr": 0.050211673156867795,
"acc_norm": 0.48,
"acc_norm_stderr": 0.050211673156867795
},
"harness|hendrycksTest-college_computer_science|5": {
"acc": 0.52,
"acc_stderr": 0.050211673156867795,
"acc_norm": 0.52,
"acc_norm_stderr": 0.050211673156867795
},
"harness|hendrycksTest-college_mathematics|5": {
"acc": 0.27,
"acc_stderr": 0.044619604333847394,
"acc_norm": 0.27,
"acc_norm_stderr": 0.044619604333847394
},
"harness|hendrycksTest-college_medicine|5": {
"acc": 0.6705202312138728,
"acc_stderr": 0.03583901754736412,
"acc_norm": 0.6705202312138728,
"acc_norm_stderr": 0.03583901754736412
},
"harness|hendrycksTest-college_physics|5": {
"acc": 0.4019607843137255,
"acc_stderr": 0.04878608714466996,
"acc_norm": 0.4019607843137255,
"acc_norm_stderr": 0.04878608714466996
},
"harness|hendrycksTest-computer_security|5": {
"acc": 0.75,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.75,
"acc_norm_stderr": 0.04351941398892446
},
"harness|hendrycksTest-conceptual_physics|5": {
"acc": 0.5914893617021276,
"acc_stderr": 0.032134180267015755,
"acc_norm": 0.5914893617021276,
"acc_norm_stderr": 0.032134180267015755
},
"harness|hendrycksTest-econometrics|5": {
"acc": 0.5087719298245614,
"acc_stderr": 0.04702880432049615,
"acc_norm": 0.5087719298245614,
"acc_norm_stderr": 0.04702880432049615
},
"harness|hendrycksTest-electrical_engineering|5": {
"acc": 0.5724137931034483,
"acc_stderr": 0.04122737111370332,
"acc_norm": 0.5724137931034483,
"acc_norm_stderr": 0.04122737111370332
},
"harness|hendrycksTest-elementary_mathematics|5": {
"acc": 0.42592592592592593,
"acc_stderr": 0.02546714904546955,
"acc_norm": 0.42592592592592593,
"acc_norm_stderr": 0.02546714904546955
},
"harness|hendrycksTest-formal_logic|5": {
"acc": 0.49206349206349204,
"acc_stderr": 0.044715725362943486,
"acc_norm": 0.49206349206349204,
"acc_norm_stderr": 0.044715725362943486
},
"harness|hendrycksTest-global_facts|5": {
"acc": 0.37,
"acc_stderr": 0.04852365870939099,
"acc_norm": 0.37,
"acc_norm_stderr": 0.04852365870939099
},
"harness|hendrycksTest-high_school_biology|5": {
"acc": 0.7903225806451613,
"acc_stderr": 0.023157879349083525,
"acc_norm": 0.7903225806451613,
"acc_norm_stderr": 0.023157879349083525
},
"harness|hendrycksTest-high_school_chemistry|5": {
"acc": 0.5073891625615764,
"acc_stderr": 0.035176035403610105,
"acc_norm": 0.5073891625615764,
"acc_norm_stderr": 0.035176035403610105
},
"harness|hendrycksTest-high_school_computer_science|5": {
"acc": 0.66,
"acc_stderr": 0.04760952285695237,
"acc_norm": 0.66,
"acc_norm_stderr": 0.04760952285695237
},
"harness|hendrycksTest-high_school_european_history|5": {
"acc": 0.7757575757575758,
"acc_stderr": 0.03256866661681102,
"acc_norm": 0.7757575757575758,
"acc_norm_stderr": 0.03256866661681102
},
"harness|hendrycksTest-high_school_geography|5": {
"acc": 0.7929292929292929,
"acc_stderr": 0.028869778460267045,
"acc_norm": 0.7929292929292929,
"acc_norm_stderr": 0.028869778460267045
},
"harness|hendrycksTest-high_school_government_and_politics|5": {
"acc": 0.9067357512953368,
"acc_stderr": 0.020986854593289733,
"acc_norm": 0.9067357512953368,
"acc_norm_stderr": 0.020986854593289733
},
"harness|hendrycksTest-high_school_macroeconomics|5": {
"acc": 0.6666666666666666,
"acc_stderr": 0.023901157979402534,
"acc_norm": 0.6666666666666666,
"acc_norm_stderr": 0.023901157979402534
},
"harness|hendrycksTest-high_school_mathematics|5": {
"acc": 0.34814814814814815,
"acc_stderr": 0.02904560029061625,
"acc_norm": 0.34814814814814815,
"acc_norm_stderr": 0.02904560029061625
},
"harness|hendrycksTest-high_school_microeconomics|5": {
"acc": 0.6764705882352942,
"acc_stderr": 0.030388353551886793,
"acc_norm": 0.6764705882352942,
"acc_norm_stderr": 0.030388353551886793
},
"harness|hendrycksTest-high_school_physics|5": {
"acc": 0.36423841059602646,
"acc_stderr": 0.03929111781242742,
"acc_norm": 0.36423841059602646,
"acc_norm_stderr": 0.03929111781242742
},
"harness|hendrycksTest-high_school_psychology|5": {
"acc": 0.8366972477064221,
"acc_stderr": 0.01584825580650155,
"acc_norm": 0.8366972477064221,
"acc_norm_stderr": 0.01584825580650155
},
"harness|hendrycksTest-high_school_statistics|5": {
"acc": 0.5046296296296297,
"acc_stderr": 0.03409825519163572,
"acc_norm": 0.5046296296296297,
"acc_norm_stderr": 0.03409825519163572
},
"harness|hendrycksTest-high_school_us_history|5": {
"acc": 0.8529411764705882,
"acc_stderr": 0.024857478080250447,
"acc_norm": 0.8529411764705882,
"acc_norm_stderr": 0.024857478080250447
},
"harness|hendrycksTest-high_school_world_history|5": {
"acc": 0.8143459915611815,
"acc_stderr": 0.025310495376944856,
"acc_norm": 0.8143459915611815,
"acc_norm_stderr": 0.025310495376944856
},
"harness|hendrycksTest-human_aging|5": {
"acc": 0.6816143497757847,
"acc_stderr": 0.03126580522513713,
"acc_norm": 0.6816143497757847,
"acc_norm_stderr": 0.03126580522513713
},
"harness|hendrycksTest-human_sexuality|5": {
"acc": 0.7862595419847328,
"acc_stderr": 0.0359546161177469,
"acc_norm": 0.7862595419847328,
"acc_norm_stderr": 0.0359546161177469
},
"harness|hendrycksTest-international_law|5": {
"acc": 0.7933884297520661,
"acc_stderr": 0.03695980128098824,
"acc_norm": 0.7933884297520661,
"acc_norm_stderr": 0.03695980128098824
},
"harness|hendrycksTest-jurisprudence|5": {
"acc": 0.7870370370370371,
"acc_stderr": 0.0395783547198098,
"acc_norm": 0.7870370370370371,
"acc_norm_stderr": 0.0395783547198098
},
"harness|hendrycksTest-logical_fallacies|5": {
"acc": 0.7730061349693251,
"acc_stderr": 0.03291099578615769,
"acc_norm": 0.7730061349693251,
"acc_norm_stderr": 0.03291099578615769
},
"harness|hendrycksTest-machine_learning|5": {
"acc": 0.48214285714285715,
"acc_stderr": 0.047427623612430116,
"acc_norm": 0.48214285714285715,
"acc_norm_stderr": 0.047427623612430116
},
"harness|hendrycksTest-management|5": {
"acc": 0.7864077669902912,
"acc_stderr": 0.040580420156460344,
"acc_norm": 0.7864077669902912,
"acc_norm_stderr": 0.040580420156460344
},
"harness|hendrycksTest-marketing|5": {
"acc": 0.8803418803418803,
"acc_stderr": 0.021262719400406974,
"acc_norm": 0.8803418803418803,
"acc_norm_stderr": 0.021262719400406974
},
"harness|hendrycksTest-medical_genetics|5": {
"acc": 0.73,
"acc_stderr": 0.0446196043338474,
"acc_norm": 0.73,
"acc_norm_stderr": 0.0446196043338474
},
"harness|hendrycksTest-miscellaneous|5": {
"acc": 0.8275862068965517,
"acc_stderr": 0.013507943909371802,
"acc_norm": 0.8275862068965517,
"acc_norm_stderr": 0.013507943909371802
},
"harness|hendrycksTest-moral_disputes|5": {
"acc": 0.7543352601156069,
"acc_stderr": 0.023176298203992005,
"acc_norm": 0.7543352601156069,
"acc_norm_stderr": 0.023176298203992005
},
"harness|hendrycksTest-moral_scenarios|5": {
"acc": 0.45027932960893857,
"acc_stderr": 0.01663961523684581,
"acc_norm": 0.45027932960893857,
"acc_norm_stderr": 0.01663961523684581
},
"harness|hendrycksTest-nutrition|5": {
"acc": 0.7254901960784313,
"acc_stderr": 0.02555316999182652,
"acc_norm": 0.7254901960784313,
"acc_norm_stderr": 0.02555316999182652
},
"harness|hendrycksTest-philosophy|5": {
"acc": 0.7138263665594855,
"acc_stderr": 0.025670259242188933,
"acc_norm": 0.7138263665594855,
"acc_norm_stderr": 0.025670259242188933
},
"harness|hendrycksTest-prehistory|5": {
"acc": 0.7561728395061729,
"acc_stderr": 0.02389187954195961,
"acc_norm": 0.7561728395061729,
"acc_norm_stderr": 0.02389187954195961
},
"harness|hendrycksTest-professional_accounting|5": {
"acc": 0.46808510638297873,
"acc_stderr": 0.029766675075873866,
"acc_norm": 0.46808510638297873,
"acc_norm_stderr": 0.029766675075873866
},
"harness|hendrycksTest-professional_law|5": {
"acc": 0.4745762711864407,
"acc_stderr": 0.012753716929101004,
"acc_norm": 0.4745762711864407,
"acc_norm_stderr": 0.012753716929101004
},
"harness|hendrycksTest-professional_medicine|5": {
"acc": 0.6911764705882353,
"acc_stderr": 0.02806499816704009,
"acc_norm": 0.6911764705882353,
"acc_norm_stderr": 0.02806499816704009
},
"harness|hendrycksTest-professional_psychology|5": {
"acc": 0.6748366013071896,
"acc_stderr": 0.01895088677080631,
"acc_norm": 0.6748366013071896,
"acc_norm_stderr": 0.01895088677080631
},
"harness|hendrycksTest-public_relations|5": {
"acc": 0.6545454545454545,
"acc_stderr": 0.04554619617541054,
"acc_norm": 0.6545454545454545,
"acc_norm_stderr": 0.04554619617541054
},
"harness|hendrycksTest-security_studies|5": {
"acc": 0.7346938775510204,
"acc_stderr": 0.028263889943784603,
"acc_norm": 0.7346938775510204,
"acc_norm_stderr": 0.028263889943784603
},
"harness|hendrycksTest-sociology|5": {
"acc": 0.8258706467661692,
"acc_stderr": 0.026814951200421603,
"acc_norm": 0.8258706467661692,
"acc_norm_stderr": 0.026814951200421603
},
"harness|hendrycksTest-us_foreign_policy|5": {
"acc": 0.85,
"acc_stderr": 0.03588702812826371,
"acc_norm": 0.85,
"acc_norm_stderr": 0.03588702812826371
},
"harness|hendrycksTest-virology|5": {
"acc": 0.5602409638554217,
"acc_stderr": 0.03864139923699122,
"acc_norm": 0.5602409638554217,
"acc_norm_stderr": 0.03864139923699122
},
"harness|hendrycksTest-world_religions|5": {
"acc": 0.8421052631578947,
"acc_stderr": 0.027966785859160893,
"acc_norm": 0.8421052631578947,
"acc_norm_stderr": 0.027966785859160893
},
"harness|truthfulqa:mc|0": {
"mc1": 0.5104039167686658,
"mc1_stderr": 0.017499711430249268,
"mc2": 0.6556430108444109,
"mc2_stderr": 0.015519025079862213
},
"harness|winogrande|5": {
"acc": 0.8516179952644041,
"acc_stderr": 0.009990706005184136
},
"harness|gsm8k|5": {
"acc": 0.7338893100833965,
"acc_stderr": 0.012172750939040328
}
}
```
# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_MaziyarPanahi__TheTop-5x7B-Instruct-S4-v0.1)
| Metric |Value|
|---------------------------------|----:|
|Avg. |74.94|
|AI2 Reasoning Challenge (25-Shot)|72.18|
|HellaSwag (10-Shot) |88.29|
|MMLU (5-Shot) |65.03|
|TruthfulQA (0-shot) |65.56|
|Winogrande (5-shot) |85.16|
|GSM8k (5-shot) |73.39|