Train
Environment
cd scripts
python -m venv venv
source venv/bin/activate
pip install -U -r requirements.in
Tokenizer
python -B train_tokenizer.py
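A minimal sanity check for the trained tokenizer, assuming train_tokenizer.py saves a Hugging Face tokenizers-format tokenizer.json (the path below is only a placeholder; point it at the script's actual output):
from tokenizers import Tokenizer

# Placeholder path: adjust to wherever train_tokenizer.py writes its output.
tokenizer = Tokenizer.from_file('../tokenizer/tokenizer.json')

encoding = tokenizer.encode('Hello, world!')
print(encoding.ids)                  # token ids produced by the trained tokenizer
print(tokenizer.decode(encoding.ids))  # should round-trip back to the input text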
Dataset
python -B prepare_pretrain_dataset.py
A quick sanity check: stream the prepared dataset back with litdata and count the available blocks.
from litdata import StreamingDataset, StreamingDataLoader, TokensLoader

# Each item is a contiguous block of 2049 tokens: a 2048-token context plus one
# extra token so inputs and targets can be shifted by one position.
dataset = StreamingDataset(
    input_dir='../pretrain-data/',
    item_loader=TokensLoader(block_size=2048 + 1),
)

print(len(dataset))
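To confirm batching works end to end, the same dataset can be wrapped in a StreamingDataLoader; this is a sketch only, and the batch size here is illustrative rather than a value from the training config:
dataloader = StreamingDataLoader(dataset, batch_size=4)

batch = next(iter(dataloader))
# batch is a tensor of token ids with shape (4, 2049); during training the
# first 2048 tokens of each block serve as inputs and the last 2048 as targets.
print(batch.shape)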
Model
Pretrain
litgpt pretrain --config ./pretrain-model.yaml
litgpt convert_from_litgpt out/pretrain/final/ out/converted_model
cp config.json out/pretrain/final/
cp config.json out/converted_model/
The converted checkpoint is a plain PyTorch state dict; re-save it in safetensors format alongside the copied config.json:
import torch
from safetensors.torch import save_file

# Load the state dict produced by `litgpt convert_from_litgpt` and write it
# back out as model.safetensors.
state_dict = torch.load('out/converted_model/model.pth', map_location='cpu')
save_file(state_dict, 'out/converted_model/model.safetensors')
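As a sketch of how the converted checkpoint can be consumed (assuming the copied config.json describes an architecture supported by transformers, e.g. a Llama-style config), it should now load directly with from_pretrained:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('out/converted_model/')
print(sum(p.numel() for p in model.parameters()))  # total parameter count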
Evaluate
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| arc_challenge | 1 | none | 0 | acc | ↑ | 0.1962 | ± | 0.0116 |
| | | none | 0 | acc_norm | ↑ | 0.2304 | ± | 0.0123 |
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.0144 | ± | 0.0033 |
| | | strict-match | 5 | exact_match | ↑ | 0.0015 | ± | 0.0011 |
| hellaswag | 1 | none | 0 | acc | ↑ | 0.2631 | ± | 0.0044 |
| | | none | 0 | acc_norm | ↑ | 0.2758 | ± | 0.0045 |
| mmlu | 2 | none | | acc | ↑ | 0.2473 | ± | 0.0036 |
| - humanities | 2 | none | | acc | ↑ | 0.2351 | ± | 0.0062 |
| - formal_logic | 1 | none | 0 | acc | ↑ | 0.2857 | ± | 0.0404 |
| - high_school_european_history | 1 | none | 0 | acc | ↑ | 0.2667 | ± | 0.0345 |
| - high_school_us_history | 1 | none | 0 | acc | ↑ | 0.2696 | ± | 0.0311 |
| - high_school_world_history | 1 | none | 0 | acc | ↑ | 0.2110 | ± | 0.0266 |
| - international_law | 1 | none | 0 | acc | ↑ | 0.1653 | ± | 0.0339 |
| - jurisprudence | 1 | none | 0 | acc | ↑ | 0.2870 | ± | 0.0437 |
| - logical_fallacies | 1 | none | 0 | acc | ↑ | 0.2331 | ± | 0.0332 |
| - moral_disputes | 1 | none | 0 | acc | ↑ | 0.2283 | ± | 0.0226 |
| - moral_scenarios | 1 | none | 0 | acc | ↑ | 0.2425 | ± | 0.0143 |
| - philosophy | 1 | none | 0 | acc | ↑ | 0.2186 | ± | 0.0235 |
| - prehistory | 1 | none | 0 | acc | ↑ | 0.2099 | ± | 0.0227 |
| - professional_law | 1 | none | 0 | acc | ↑ | 0.2314 | ± | 0.0108 |
| - world_religions | 1 | none | 0 | acc | ↑ | 0.2632 | ± | 0.0338 |
| - other | 2 | none | | acc | ↑ | 0.2485 | ± | 0.0078 |
| - business_ethics | 1 | none | 0 | acc | ↑ | 0.2600 | ± | 0.0441 |
| - clinical_knowledge | 1 | none | 0 | acc | ↑ | 0.2528 | ± | 0.0267 |
| - college_medicine | 1 | none | 0 | acc | ↑ | 0.2254 | ± | 0.0319 |
| - global_facts | 1 | none | 0 | acc | ↑ | 0.2700 | ± | 0.0446 |
| - human_aging | 1 | none | 0 | acc | ↑ | 0.2377 | ± | 0.0286 |
| - management | 1 | none | 0 | acc | ↑ | 0.2816 | ± | 0.0445 |
| - marketing | 1 | none | 0 | acc | ↑ | 0.2692 | ± | 0.0291 |
| - medical_genetics | 1 | none | 0 | acc | ↑ | 0.2600 | ± | 0.0441 |
| - miscellaneous | 1 | none | 0 | acc | ↑ | 0.2350 | ± | 0.0152 |
| - nutrition | 1 | none | 0 | acc | ↑ | 0.2549 | ± | 0.0250 |
| - professional_accounting | 1 | none | 0 | acc | ↑ | 0.2801 | ± | 0.0268 |
| - professional_medicine | 1 | none | 0 | acc | ↑ | 0.2610 | ± | 0.0267 |
| - virology | 1 | none | 0 | acc | ↑ | 0.1807 | ± | 0.0300 |
| - social sciences | 2 | none | | acc | ↑ | 0.2658 | ± | 0.0080 |
| - econometrics | 1 | none | 0 | acc | ↑ | 0.1930 | ± | 0.0371 |
| - high_school_geography | 1 | none | 0 | acc | ↑ | 0.2172 | ± | 0.0294 |
| - high_school_government_and_politics | 1 | none | 0 | acc | ↑ | 0.3212 | ± | 0.0337 |
| - high_school_macroeconomics | 1 | none | 0 | acc | ↑ | 0.2923 | ± | 0.0231 |
| - high_school_microeconomics | 1 | none | 0 | acc | ↑ | 0.3025 | ± | 0.0298 |
| - high_school_psychology | 1 | none | 0 | acc | ↑ | 0.2752 | ± | 0.0191 |
| - human_sexuality | 1 | none | 0 | acc | ↑ | 0.2290 | ± | 0.0369 |
| - professional_psychology | 1 | none | 0 | acc | ↑ | 0.2386 | ± | 0.0172 |
| - public_relations | 1 | none | 0 | acc | ↑ | 0.2636 | ± | 0.0422 |
| - security_studies | 1 | none | 0 | acc | ↑ | 0.3143 | ± | 0.0297 |
| - sociology | 1 | none | 0 | acc | ↑ | 0.2338 | ± | 0.0299 |
| - us_foreign_policy | 1 | none | 0 | acc | ↑ | 0.2600 | ± | 0.0441 |
| - stem | 2 | none | | acc | ↑ | 0.2464 | ± | 0.0077 |
| - abstract_algebra | 1 | none | 0 | acc | ↑ | 0.2500 | ± | 0.0435 |
| - anatomy | 1 | none | 0 | acc | ↑ | 0.2148 | ± | 0.0355 |
| - astronomy | 1 | none | 0 | acc | ↑ | 0.1908 | ± | 0.0320 |
| - college_biology | 1 | none | 0 | acc | ↑ | 0.2569 | ± | 0.0365 |
| - college_chemistry | 1 | none | 0 | acc | ↑ | 0.2700 | ± | 0.0446 |
| - college_computer_science | 1 | none | 0 | acc | ↑ | 0.3500 | ± | 0.0479 |
| - college_mathematics | 1 | none | 0 | acc | ↑ | 0.2700 | ± | 0.0446 |
| - college_physics | 1 | none | 0 | acc | ↑ | 0.2745 | ± | 0.0444 |
| - computer_security | 1 | none | 0 | acc | ↑ | 0.3000 | ± | 0.0461 |
| - conceptual_physics | 1 | none | 0 | acc | ↑ | 0.2766 | ± | 0.0292 |
| - electrical_engineering | 1 | none | 0 | acc | ↑ | 0.2345 | ± | 0.0353 |
| - elementary_mathematics | 1 | none | 0 | acc | ↑ | 0.2566 | ± | 0.0225 |
| - high_school_biology | 1 | none | 0 | acc | ↑ | 0.2226 | ± | 0.0237 |
| - high_school_chemistry | 1 | none | 0 | acc | ↑ | 0.2217 | ± | 0.0292 |
| - high_school_computer_science | 1 | none | 0 | acc | ↑ | 0.2000 | ± | 0.0402 |
| - high_school_mathematics | 1 | none | 0 | acc | ↑ | 0.2370 | ± | 0.0259 |
| - high_school_physics | 1 | none | 0 | acc | ↑ | 0.2517 | ± | 0.0354 |
| - high_school_statistics | 1 | none | 0 | acc | ↑ | 0.2685 | ± | 0.0302 |
| - machine_learning | 1 | none | 0 | acc | ↑ | 0.1786 | ± | 0.0364 |
| truthfulqa_mc2 | 2 | none | 0 | acc | ↑ | 0.4668 | ± | 0.0161 |
| winogrande | 1 | none | 0 | acc | ↑ | 0.5012 | ± | 0.0141 |
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.2473 | ± | 0.0036 |
| - humanities | 2 | none | | acc | ↑ | 0.2351 | ± | 0.0062 |
| - other | 2 | none | | acc | ↑ | 0.2485 | ± | 0.0078 |
| - social sciences | 2 | none | | acc | ↑ | 0.2658 | ± | 0.0080 |
| - stem | 2 | none | | acc | ↑ | 0.2464 | ± | 0.0077 |
litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
litgpt evaluate --tasks 'qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/