---
license: apache-2.0
pipeline_tag: text-generation
library_name: transformers
language:
- en
- am
- ar
- as
- az
- be
- bg
- bn
- br
- bs
- ca
- cs
- cy
- da
- de
- el
- eo
- es
- et
- eu
- fa
- ff
- fi
- fr
- fy
- ga
- gd
- gl
- gn
- gu
- ha
- he
- hi
- hr
- ht
- hu
- hy
- id
- ig
- is
- it
- ja
- jv
- ka
- kk
- km
- kn
- ko
- ku
- ky
- la
- lg
- li
- ln
- lo
- lt
- lv
- mg
- mk
- ml
- mn
- mr
- ms
- my
- ne
- nl
- 'no'
- ns
- om
- or
- pa
- pl
- ps
- pt
- qu
- rm
- ro
- ru
- sa
- si
- sc
- sd
- sk
- sl
- so
- sq
- sr
- ss
- su
- sv
- sw
- ta
- te
- th
- tl
- tn
- tr
- ug
- uk
- ur
- uz
- vi
- wo
- xh
- yi
- yo
- zu
datasets:
- yahma/alpaca-cleaned
- gbharti/wealth-alpaca_lora
- saillab/taco-datasets
- xu-song/cc100-samples
- ontocord/fineweb-permissive-multilingual-2m
- MuskumPillerum/General-Knowledge
- yirenc/general_knowledge_boolean
- nampdn-ai/tiny-textbooks
- nampdn-ai/tiny-codes
- bigcode/the-stack-smol-xs
- m-a-p/CodeFeedback-Filtered-Instruction
- jtatman/python-code-dataset-500k
- iamtarun/python_code_instructions_18k_alpaca
- HuggingFaceH4/CodeAlpaca_20K
- gair-prox/open-web-math-pro
- rvv-karma/Math-QA
- ajibawa-2023/Maths-College
- microsoft/orca-math-word-problems-200k
- fblgit/simple-math
- SkunkworksAI/reasoning-0.01
- badrex/llm-emoji-dataset
tags:
- litgpt
- litdata
---

# tangled-llama-154m-32k-base-v0.1
A pretrained language model based on the Llama architecture with about 154M parameters. It was trained on 11.4B (11,422,750,857) tokens drawn from more than 0.8M (796,399) dataset rows.

This model isn't intended for immediate use, but rather as a base for continued pretraining and finetuning on a downstream task. While it can handle a context length of up to 128K (131,072) tokens, it was pretrained with sequences of 2K (2,048) tokens.

The objective is to keep a streamlined cognitive/reasoning core while eliminating redundant memorized knowledge from the model.
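As a minimal sketch of how the checkpoint can be loaded for a quick generation smoke test (assuming the standard `transformers` text-generation workflow; the repository id below is a placeholder, and for real use you would first run continued pretraining or finetuning):

```python
# Hypothetical usage sketch -- the Hub repository id is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "<org-or-user>/tangled-llama-154m-32k-base-v0.1"  # placeholder Hub path

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)

# The base model is not instruction-tuned, so prompt it as a plain language model.
inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```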
[Training plots: loss / val_loss, val_ppl, epoch, learning_rate]
## lm-evaluation-harness
```bash
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| arc_challenge | 1 | none | 0 | acc | ↑ | 0.1877 | ± | 0.0114 |
|   |   | none | 0 | acc_norm | ↑ | 0.2389 | ± | 0.0125 |
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.0136 | ± | 0.0032 |
|   |   | strict-match | 5 | exact_match | ↑ | 0.0008 | ± | 0.0008 |
| hellaswag | 1 | none | 0 | acc | ↑ | 0.2660 | ± | 0.0044 |
|   |   | none | 0 | acc_norm | ↑ | 0.2697 | ± | 0.0044 |
| mmlu | 2 | none |   | acc | ↑ | 0.2377 | ± | 0.0036 |
| - humanities | 2 | none |   | acc | ↑ | 0.2372 | ± | 0.0062 |
| - formal_logic | 1 | none | 0 | acc | ↑ | 0.2619 | ± | 0.0393 |
| - high_school_european_history | 1 | none | 0 | acc | ↑ | 0.2182 | ± | 0.0323 |
| - high_school_us_history | 1 | none | 0 | acc | ↑ | 0.2500 | ± | 0.0304 |
| - high_school_world_history | 1 | none | 0 | acc | ↑ | 0.2447 | ± | 0.0280 |
| - international_law | 1 | none | 0 | acc | ↑ | 0.2066 | ± | 0.0370 |
| - jurisprudence | 1 | none | 0 | acc | ↑ | 0.2778 | ± | 0.0433 |
| - logical_fallacies | 1 | none | 0 | acc | ↑ | 0.2025 | ± | 0.0316 |
| - moral_disputes | 1 | none | 0 | acc | ↑ | 0.2514 | ± | 0.0234 |
| - moral_scenarios | 1 | none | 0 | acc | ↑ | 0.2425 | ± | 0.0143 |
| - philosophy | 1 | none | 0 | acc | ↑ | 0.1768 | ± | 0.0217 |
| - prehistory | 1 | none | 0 | acc | ↑ | 0.2562 | ± | 0.0243 |
| - professional_law | 1 | none | 0 | acc | ↑ | 0.2379 | ± | 0.0109 |
| - world_religions | 1 | none | 0 | acc | ↑ | 0.2515 | ± | 0.0333 |
| - other | 2 | none |   | acc | ↑ | 0.2462 | ± | 0.0077 |
| - business_ethics | 1 | none | 0 | acc | ↑ | 0.2800 | ± | 0.0451 |
| - clinical_knowledge | 1 | none | 0 | acc | ↑ | 0.2377 | ± | 0.0262 |
| - college_medicine | 1 | none | 0 | acc | ↑ | 0.2370 | ± | 0.0324 |
| - global_facts | 1 | none | 0 | acc | ↑ | 0.2500 | ± | 0.0435 |
| - human_aging | 1 | none | 0 | acc | ↑ | 0.2108 | ± | 0.0274 |
| - management | 1 | none | 0 | acc | ↑ | 0.1942 | ± | 0.0392 |
| - marketing | 1 | none | 0 | acc | ↑ | 0.2436 | ± | 0.0281 |
| - medical_genetics | 1 | none | 0 | acc | ↑ | 0.2700 | ± | 0.0446 |
| - miscellaneous | 1 | none | 0 | acc | ↑ | 0.2554 | ± | 0.0156 |
| - nutrition | 1 | none | 0 | acc | ↑ | 0.2778 | ± | 0.0256 |
| - professional_accounting | 1 | none | 0 | acc | ↑ | 0.2411 | ± | 0.0255 |
| - professional_medicine | 1 | none | 0 | acc | ↑ | 0.2279 | ± | 0.0255 |
| - virology | 1 | none | 0 | acc | ↑ | 0.2530 | ± | 0.0338 |
| - social sciences | 2 | none |   | acc | ↑ | 0.2525 | ± | 0.0078 |
| - econometrics | 1 | none | 0 | acc | ↑ | 0.2281 | ± | 0.0395 |
| - high_school_geography | 1 | none | 0 | acc | ↑ | 0.1465 | ± | 0.0252 |
| - high_school_government_and_politics | 1 | none | 0 | acc | ↑ | 0.2539 | ± | 0.0314 |
| - high_school_macroeconomics | 1 | none | 0 | acc | ↑ | 0.2333 | ± | 0.0214 |
| - high_school_microeconomics | 1 | none | 0 | acc | ↑ | 0.2269 | ± | 0.0272 |
| - high_school_psychology | 1 | none | 0 | acc | ↑ | 0.2330 | ± | 0.0181 |
| - human_sexuality | 1 | none | 0 | acc | ↑ | 0.2824 | ± | 0.0395 |
| - professional_psychology | 1 | none | 0 | acc | ↑ | 0.2859 | ± | 0.0183 |
| - public_relations | 1 | none | 0 | acc | ↑ | 0.2364 | ± | 0.0407 |
| - security_studies | 1 | none | 0 | acc | ↑ | 0.3388 | ± | 0.0303 |
| - sociology | 1 | none | 0 | acc | ↑ | 0.2637 | ± | 0.0312 |
| - us_foreign_policy | 1 | none | 0 | acc | ↑ | 0.2700 | ± | 0.0446 |
| - stem | 2 | none |   | acc | ↑ | 0.2157 | ± | 0.0073 |
| - abstract_algebra | 1 | none | 0 | acc | ↑ | 0.2000 | ± | 0.0402 |
| - anatomy | 1 | none | 0 | acc | ↑ | 0.2222 | ± | 0.0359 |
| - astronomy | 1 | none | 0 | acc | ↑ | 0.1842 | ± | 0.0315 |
| - college_biology | 1 | none | 0 | acc | ↑ | 0.2639 | ± | 0.0369 |
| - college_chemistry | 1 | none | 0 | acc | ↑ | 0.2100 | ± | 0.0409 |
| - college_computer_science | 1 | none | 0 | acc | ↑ | 0.2400 | ± | 0.0429 |
| - college_mathematics | 1 | none | 0 | acc | ↑ | 0.2200 | ± | 0.0416 |
| - college_physics | 1 | none | 0 | acc | ↑ | 0.2059 | ± | 0.0402 |
| - computer_security | 1 | none | 0 | acc | ↑ | 0.2400 | ± | 0.0429 |
| - conceptual_physics | 1 | none | 0 | acc | ↑ | 0.2553 | ± | 0.0285 |
| - electrical_engineering | 1 | none | 0 | acc | ↑ | 0.2414 | ± | 0.0357 |
| - elementary_mathematics | 1 | none | 0 | acc | ↑ | 0.2328 | ± | 0.0218 |
| - high_school_biology | 1 | none | 0 | acc | ↑ | 0.1839 | ± | 0.0220 |
| - high_school_chemistry | 1 | none | 0 | acc | ↑ | 0.1626 | ± | 0.0260 |
| - high_school_computer_science | 1 | none | 0 | acc | ↑ | 0.2300 | ± | 0.0423 |
| - high_school_mathematics | 1 | none | 0 | acc | ↑ | 0.2037 | ± | 0.0246 |
| - high_school_physics | 1 | none | 0 | acc | ↑ | 0.1921 | ± | 0.0322 |
| - high_school_statistics | 1 | none | 0 | acc | ↑ | 0.1852 | ± | 0.0265 |
| - machine_learning | 1 | none | 0 | acc | ↑ | 0.2857 | ± | 0.0429 |
| truthfulqa_mc2 | 2 | none | 0 | acc | ↑ | 0.4650 | ± | 0.0161 |
| winogrande | 1 | none | 0 | acc | ↑ | 0.4957 | ± | 0.0141 |
| Groups | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| mmlu | 2 | none |   | acc | ↑ | 0.2377 | ± | 0.0036 |
| - humanities | 2 | none |   | acc | ↑ | 0.2372 | ± | 0.0062 |
| - other | 2 | none |   | acc | ↑ | 0.2462 | ± | 0.0077 |
| - social sciences | 2 | none |   | acc | ↑ | 0.2525 | ± | 0.0078 |
| - stem | 2 | none |   | acc | ↑ | 0.2157 | ± | 0.0073 |
```bash
litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| leaderboard | N/A |   |   |   |   |   |   |   |
| - leaderboard_bbh | N/A |   |   |   |   |   |   |   |
| - leaderboard_bbh_boolean_expressions | 1 | none | 3 | acc_norm | ↑ | 0.4720 | ± | 0.0316 |
| - leaderboard_bbh_causal_judgement | 1 | none | 3 | acc_norm | ↑ | 0.5187 | ± | 0.0366 |
| - leaderboard_bbh_date_understanding | 1 | none | 3 | acc_norm | ↑ | 0.2000 | ± | 0.0253 |
| - leaderboard_bbh_disambiguation_qa | 1 | none | 3 | acc_norm | ↑ | 0.3560 | ± | 0.0303 |
| - leaderboard_bbh_formal_fallacies | 1 | none | 3 | acc_norm | ↑ | 0.4640 | ± | 0.0316 |
| - leaderboard_bbh_geometric_shapes | 1 | none | 3 | acc_norm | ↑ | 0.0800 | ± | 0.0172 |
| - leaderboard_bbh_hyperbaton | 1 | none | 3 | acc_norm | ↑ | 0.5160 | ± | 0.0317 |
| - leaderboard_bbh_logical_deduction_five_objects | 1 | none | 3 | acc_norm | ↑ | 0.1760 | ± | 0.0241 |
| - leaderboard_bbh_logical_deduction_seven_objects | 1 | none | 3 | acc_norm | ↑ | 0.1600 | ± | 0.0232 |
| - leaderboard_bbh_logical_deduction_three_objects | 1 | none | 3 | acc_norm | ↑ | 0.3320 | ± | 0.0298 |
| - leaderboard_bbh_movie_recommendation | 1 | none | 3 | acc_norm | ↑ | 0.2640 | ± | 0.0279 |
| - leaderboard_bbh_navigate | 1 | none | 3 | acc_norm | ↑ | 0.5840 | ± | 0.0312 |
| - leaderboard_bbh_object_counting | 1 | none | 3 | acc_norm | ↑ | 0.0840 | ± | 0.0176 |
| - leaderboard_bbh_penguins_in_a_table | 1 | none | 3 | acc_norm | ↑ | 0.1986 | ± | 0.0331 |
| - leaderboard_bbh_reasoning_about_colored_objects | 1 | none | 3 | acc_norm | ↑ | 0.1720 | ± | 0.0239 |
| - leaderboard_bbh_ruin_names | 1 | none | 3 | acc_norm | ↑ | 0.2360 | ± | 0.0269 |
| - leaderboard_bbh_salient_translation_error_detection | 1 | none | 3 | acc_norm | ↑ | 0.1560 | ± | 0.0230 |
| - leaderboard_bbh_snarks | 1 | none | 3 | acc_norm | ↑ | 0.4663 | ± | 0.0375 |
| - leaderboard_bbh_sports_understanding | 1 | none | 3 | acc_norm | ↑ | 0.4520 | ± | 0.0315 |
| - leaderboard_bbh_temporal_sequences | 1 | none | 3 | acc_norm | ↑ | 0.2160 | ± | 0.0261 |
| - leaderboard_bbh_tracking_shuffled_objects_five_objects | 1 | none | 3 | acc_norm | ↑ | 0.2200 | ± | 0.0263 |
| - leaderboard_bbh_tracking_shuffled_objects_seven_objects | 1 | none | 3 | acc_norm | ↑ | 0.1480 | ± | 0.0225 |
| - leaderboard_bbh_tracking_shuffled_objects_three_objects | 1 | none | 3 | acc_norm | ↑ | 0.3200 | ± | 0.0296 |
| - leaderboard_bbh_web_of_lies | 1 | none | 3 | acc_norm | ↑ | 0.4880 | ± | 0.0317 |
| - leaderboard_gpqa | N/A |   |   |   |   |   |   |   |
| - leaderboard_gpqa_diamond | 1 | none | 0 | acc_norm | ↑ | 0.2020 | ± | 0.0286 |
| - leaderboard_gpqa_extended | 1 | none | 0 | acc_norm | ↑ | 0.2656 | ± | 0.0189 |
| - leaderboard_gpqa_main | 1 | none | 0 | acc_norm | ↑ | 0.2656 | ± | 0.0209 |
| - leaderboard_ifeval | 3 | none | 0 | inst_level_loose_acc | ↑ | 0.2290 | ± | N/A |
|   |   | none | 0 | inst_level_strict_acc | ↑ | 0.1990 | ± | N/A |
|   |   | none | 0 | prompt_level_loose_acc | ↑ | 0.1128 | ± | 0.0136 |
|   |   | none | 0 | prompt_level_strict_acc | ↑ | 0.1017 | ± | 0.0130 |
| - leaderboard_math_hard | N/A |   |   |   |   |   |   |   |
| - leaderboard_math_algebra_hard | 1 | none | 4 | exact_match | ↑ | 0.0000 | ± | 0 |
| - leaderboard_math_counting_and_prob_hard | 1 | none | 4 | exact_match | ↑ | 0.0000 | ± | 0 |
| - leaderboard_math_geometry_hard | 1 | none | 4 | exact_match | ↑ | 0.0000 | ± | 0 |
| - leaderboard_math_intermediate_algebra_hard | 1 | none | 4 | exact_match | ↑ | 0.0000 | ± | 0 |
| - leaderboard_math_num_theory_hard | 1 | none | 4 | exact_match | ↑ | 0.0000 | ± | 0 |
| - leaderboard_math_prealgebra_hard | 1 | none | 4 | exact_match | ↑ | 0.0000 | ± | 0 |
| - leaderboard_math_precalculus_hard | 1 | none | 4 | exact_match | ↑ | 0.0000 | ± | 0 |
| - leaderboard_mmlu_pro | 0.1 | none | 5 | acc | ↑ | 0.1104 | ± | 0.0029 |
| - leaderboard_musr | N/A |   |   |   |   |   |   |   |
| - leaderboard_musr_murder_mysteries | 1 | none | 0 | acc_norm | ↑ | 0.4920 | ± | 0.0317 |
| - leaderboard_musr_object_placements | 1 | none | 0 | acc_norm | ↑ | 0.2891 | ± | 0.0284 |
| - leaderboard_musr_team_allocation | 1 | none | 0 | acc_norm | ↑ | 0.3440 | ± | 0.0301 |
```bash
litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
None
```bash
litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| mmlu | 2 | none |   | acc | ↑ | 0.2377 | ± | 0.0036 |
| - humanities | 2 | none |   | acc | ↑ | 0.2372 | ± | 0.0062 |
| - formal_logic | 1 | none | 0 | acc | ↑ | 0.2619 | ± | 0.0393 |
| - high_school_european_history | 1 | none | 0 | acc | ↑ | 0.2182 | ± | 0.0323 |
| - high_school_us_history | 1 | none | 0 | acc | ↑ | 0.2500 | ± | 0.0304 |
| - high_school_world_history | 1 | none | 0 | acc | ↑ | 0.2447 | ± | 0.0280 |
| - international_law | 1 | none | 0 | acc | ↑ | 0.2066 | ± | 0.0370 |
| - jurisprudence | 1 | none | 0 | acc | ↑ | 0.2778 | ± | 0.0433 |
| - logical_fallacies | 1 | none | 0 | acc | ↑ | 0.2025 | ± | 0.0316 |
| - moral_disputes | 1 | none | 0 | acc | ↑ | 0.2514 | ± | 0.0234 |
| - moral_scenarios | 1 | none | 0 | acc | ↑ | 0.2425 | ± | 0.0143 |
| - philosophy | 1 | none | 0 | acc | ↑ | 0.1768 | ± | 0.0217 |
| - prehistory | 1 | none | 0 | acc | ↑ | 0.2562 | ± | 0.0243 |
| - professional_law | 1 | none | 0 | acc | ↑ | 0.2379 | ± | 0.0109 |
| - world_religions | 1 | none | 0 | acc | ↑ | 0.2515 | ± | 0.0333 |
| - other | 2 | none |   | acc | ↑ | 0.2462 | ± | 0.0077 |
| - business_ethics | 1 | none | 0 | acc | ↑ | 0.2800 | ± | 0.0451 |
| - clinical_knowledge | 1 | none | 0 | acc | ↑ | 0.2377 | ± | 0.0262 |
| - college_medicine | 1 | none | 0 | acc | ↑ | 0.2370 | ± | 0.0324 |
| - global_facts | 1 | none | 0 | acc | ↑ | 0.2500 | ± | 0.0435 |
| - human_aging | 1 | none | 0 | acc | ↑ | 0.2108 | ± | 0.0274 |
| - management | 1 | none | 0 | acc | ↑ | 0.1942 | ± | 0.0392 |
| - marketing | 1 | none | 0 | acc | ↑ | 0.2436 | ± | 0.0281 |
| - medical_genetics | 1 | none | 0 | acc | ↑ | 0.2700 | ± | 0.0446 |
| - miscellaneous | 1 | none | 0 | acc | ↑ | 0.2554 | ± | 0.0156 |
| - nutrition | 1 | none | 0 | acc | ↑ | 0.2778 | ± | 0.0256 |
| - professional_accounting | 1 | none | 0 | acc | ↑ | 0.2411 | ± | 0.0255 |
| - professional_medicine | 1 | none | 0 | acc | ↑ | 0.2279 | ± | 0.0255 |
| - virology | 1 | none | 0 | acc | ↑ | 0.2530 | ± | 0.0338 |
| - social sciences | 2 | none |   | acc | ↑ | 0.2525 | ± | 0.0078 |
| - econometrics | 1 | none | 0 | acc | ↑ | 0.2281 | ± | 0.0395 |
| - high_school_geography | 1 | none | 0 | acc | ↑ | 0.1465 | ± | 0.0252 |
| - high_school_government_and_politics | 1 | none | 0 | acc | ↑ | 0.2539 | ± | 0.0314 |
| - high_school_macroeconomics | 1 | none | 0 | acc | ↑ | 0.2333 | ± | 0.0214 |
| - high_school_microeconomics | 1 | none | 0 | acc | ↑ | 0.2269 | ± | 0.0272 |
| - high_school_psychology | 1 | none | 0 | acc | ↑ | 0.2330 | ± | 0.0181 |
| - human_sexuality | 1 | none | 0 | acc | ↑ | 0.2824 | ± | 0.0395 |
| - professional_psychology | 1 | none | 0 | acc | ↑ | 0.2859 | ± | 0.0183 |
| - public_relations | 1 | none | 0 | acc | ↑ | 0.2364 | ± | 0.0407 |
| - security_studies | 1 | none | 0 | acc | ↑ | 0.3388 | ± | 0.0303 |
| - sociology | 1 | none | 0 | acc | ↑ | 0.2637 | ± | 0.0312 |
| - us_foreign_policy | 1 | none | 0 | acc | ↑ | 0.2700 | ± | 0.0446 |
| - stem | 2 | none |   | acc | ↑ | 0.2157 | ± | 0.0073 |
| - abstract_algebra | 1 | none | 0 | acc | ↑ | 0.2000 | ± | 0.0402 |
| - anatomy | 1 | none | 0 | acc | ↑ | 0.2222 | ± | 0.0359 |
| - astronomy | 1 | none | 0 | acc | ↑ | 0.1842 | ± | 0.0315 |
| - college_biology | 1 | none | 0 | acc | ↑ | 0.2639 | ± | 0.0369 |
| - college_chemistry | 1 | none | 0 | acc | ↑ | 0.2100 | ± | 0.0409 |
| - college_computer_science | 1 | none | 0 | acc | ↑ | 0.2400 | ± | 0.0429 |
| - college_mathematics | 1 | none | 0 | acc | ↑ | 0.2200 | ± | 0.0416 |
| - college_physics | 1 | none | 0 | acc | ↑ | 0.2059 | ± | 0.0402 |
| - computer_security | 1 | none | 0 | acc | ↑ | 0.2400 | ± | 0.0429 |
| - conceptual_physics | 1 | none | 0 | acc | ↑ | 0.2553 | ± | 0.0285 |
| - electrical_engineering | 1 | none | 0 | acc | ↑ | 0.2414 | ± | 0.0357 |
| - elementary_mathematics | 1 | none | 0 | acc | ↑ | 0.2328 | ± | 0.0218 |
| - high_school_biology | 1 | none | 0 | acc | ↑ | 0.1839 | ± | 0.0220 |
| - high_school_chemistry | 1 | none | 0 | acc | ↑ | 0.1626 | ± | 0.0260 |
| - high_school_computer_science | 1 | none | 0 | acc | ↑ | 0.2300 | ± | 0.0423 |
| - high_school_mathematics | 1 | none | 0 | acc | ↑ | 0.2037 | ± | 0.0246 |
| - high_school_physics | 1 | none | 0 | acc | ↑ | 0.1921 | ± | 0.0322 |
| - high_school_statistics | 1 | none | 0 | acc | ↑ | 0.1852 | ± | 0.0265 |
| - machine_learning | 1 | none | 0 | acc | ↑ | 0.2857 | ± | 0.0429 |
| mmlu_pro | 2 | custom-extract |   | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - biology | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - business | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - chemistry | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - computer_science | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - economics | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - engineering | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - health | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - history | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - law | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - math | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - other | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - philosophy | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - physics | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| - psychology | 1 | custom-extract | 5 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| Groups | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| mmlu | 2 | none |   | acc | ↑ | 0.2377 | ± | 0.0036 |
| - humanities | 2 | none |   | acc | ↑ | 0.2372 | ± | 0.0062 |
| - other | 2 | none |   | acc | ↑ | 0.2462 | ± | 0.0077 |
| - social sciences | 2 | none |   | acc | ↑ | 0.2525 | ± | 0.0078 |
| - stem | 2 | none |   | acc | ↑ | 0.2157 | ± | 0.0073 |
| mmlu_pro | 2 | custom-extract |   | exact_match | ↑ | 0.0000 | ± | 0.0000 |
```bash
litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| arc_challenge | 1 | none | 0 | acc | ↑ | 0.1903 | ± | 0.0115 |
|   |   | none | 0 | acc_norm | ↑ | 0.2406 | ± | 0.0125 |
| boolq | 2 | none | 0 | acc | ↑ | 0.5838 | ± | 0.0086 |
| gpqa_diamond_cot_n_shot | 2 | flexible-extract | 0 | exact_match | ↑ | 0.1212 | ± | 0.0233 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_diamond_cot_zeroshot | 1 | flexible-extract | 0 | exact_match | ↑ | 0.1465 | ± | 0.0252 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_diamond_generative_n_shot | 2 | flexible-extract | 0 | exact_match | ↑ | 0.2273 | ± | 0.0299 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_diamond_n_shot | 2 | none | 0 | acc | ↑ | 0.2475 | ± | 0.0307 |
|   |   | none | 0 | acc_norm | ↑ | 0.2475 | ± | 0.0307 |
| gpqa_diamond_zeroshot | 1 | none | 0 | acc | ↑ | 0.2273 | ± | 0.0299 |
|   |   | none | 0 | acc_norm | ↑ | 0.2273 | ± | 0.0299 |
| gpqa_extended_cot_n_shot | 2 | flexible-extract | 0 | exact_match | ↑ | 0.1392 | ± | 0.0148 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_extended_cot_zeroshot | 1 | flexible-extract | 0 | exact_match | ↑ | 0.1502 | ± | 0.0153 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_extended_generative_n_shot | 2 | flexible-extract | 0 | exact_match | ↑ | 0.2289 | ± | 0.0180 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_extended_n_shot | 2 | none | 0 | acc | ↑ | 0.2344 | ± | 0.0181 |
|   |   | none | 0 | acc_norm | ↑ | 0.2344 | ± | 0.0181 |
| gpqa_extended_zeroshot | 1 | none | 0 | acc | ↑ | 0.2582 | ± | 0.0187 |
|   |   | none | 0 | acc_norm | ↑ | 0.2582 | ± | 0.0187 |
| gpqa_main_cot_n_shot | 2 | flexible-extract | 0 | exact_match | ↑ | 0.1429 | ± | 0.0166 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_main_cot_zeroshot | 1 | flexible-extract | 0 | exact_match | ↑ | 0.1629 | ± | 0.0175 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_main_generative_n_shot | 2 | flexible-extract | 0 | exact_match | ↑ | 0.2366 | ± | 0.0201 |
|   |   | strict-match | 0 | exact_match | ↑ | 0.0000 | ± | 0.0000 |
| gpqa_main_n_shot | 2 | none | 0 | acc | ↑ | 0.2500 | ± | 0.0205 |
|   |   | none | 0 | acc_norm | ↑ | 0.2500 | ± | 0.0205 |
| gpqa_main_zeroshot | 1 | none | 0 | acc | ↑ | 0.2746 | ± | 0.0211 |
|   |   | none | 0 | acc_norm | ↑ | 0.2746 | ± | 0.0211 |
| hellaswag | 1 | none | 0 | acc | ↑ | 0.2658 | ± | 0.0044 |
|   |   | none | 0 | acc_norm | ↑ | 0.2690 | ± | 0.0044 |
| openbookqa | 1 | none | 0 | acc | ↑ | 0.1380 | ± | 0.0154 |
|   |   | none | 0 | acc_norm | ↑ | 0.2740 | ± | 0.0200 |
| piqa | 1 | none | 0 | acc | ↑ | 0.5555 | ± | 0.0116 |
|   |   | none | 0 | acc_norm | ↑ | 0.5571 | ± | 0.0116 |
| truthfulqa_mc2 | 2 | none | 0 | acc | ↑ | 0.4650 | ± | 0.0160 |
| winogrande | 1 | none | 0 | acc | ↑ | 0.4949 | ± | 0.0141 |
```bash
litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
None
```bash
litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.0136 | ± | 0.0032 |
|   |   | strict-match | 5 | exact_match | ↑ | 0.0008 | ± | 0.0008 |
| mathqa | 1 | none | 0 | acc | ↑ | 0.2191 | ± | 0.0076 |
|   |   | none | 0 | acc_norm | ↑ | 0.2181 | ± | 0.0076 |
```bash
litgpt evaluate --tasks 'wikitext,qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|---|---:|---|---:|---|:-:|---:|:-:|---:|
| qasper_bool | 1 | none | 0 | f1 | ↑ | 0.8215 | ± | 0.0222 |
| qasper_freeform | 2 | none | 0 | f1_abstractive | ↑ | 0.0390 | ± | 0.0045 |
| wikitext | 2 | none | 0 | bits_per_byte | ↓ | 2.6525 | ± | N/A |
|   |   | none | 0 | byte_perplexity | ↓ | 6.2874 | ± | N/A |
|   |   | none | 0 | word_perplexity | ↓ | 18611.9448 | ± | N/A |