{
    "n_layers": 1,
    "d_model": 128,
    "d_mlp": 512,
    "d_head": 32,
    "n_heads": 4,
    "lr_hidden": 0.002,
    "lr_vector": 0.001,
    "batch_size_per_device": 32,
    "batches_per_step": 1,
    "seed": 1297,
    "save_checkpoints": true,
    "debug": false,
    "debug_batch": false,
    "normalization": "LN",
    "max_tokens": 10000000000,
    "version": 426,
    "use_bfloat16_matmul": true,
    "n_ctx": 1024,
    "d_vocab": 48262,
    "tokenizer_name": "NeelNanda/gpt-neox-tokenizer-digits",
    "betas": [
        0.9,
        0.99
    ],
    "weight_decay": 0.05,
    "dataset_name": "c4",
    "grad_norm_clip": 1.0,
    "n_devices": 8,
    "act_fn": "solu_ln",
    "shortformer_pos": false,
    "attn_only": false,
    "ln_eps": 1e-05,
    "lr_schedule": "cosine_warmup",
    "warmup_tokens": 300000000,
    "train_loss_ewma_beta": 0.99,
    "truncate_tokens": 1000000000000,
    "log_interval": 50,
    "initializer_scale_global": 1.0,
    "initializer_scale_hidden": 0.02,
    "initializer_scale_embed": 0.1,
    "initializer_scale_unembed": 0.02,
    "neuron_scale": 1.0,
    "neuron_temp": 1.0,
    "use_acc": false,
    "weight_init_scheme": "gpt2",
    "fixed_init": "",
    "store_init": false,
    "control": 1.0,
    "tokens_per_step": 262144,
    "batch_size": 256,
    "max_steps": 38146,
    "warmup_steps": 1144,
    "n_params": 196608
}
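
For reference, here is a minimal sketch of loading this config and confirming that its derived fields are mutually consistent. The filename `config.json` and the use of Python's standard `json` module are assumptions for illustration, not part of the source; every checked relationship uses only values present in the config itself.

```python
import json

# Load the training config above (the path is an assumption for illustration).
with open("config.json") as f:
    cfg = json.load(f)

# Head dimensions multiply out to the residual stream width: 4 * 32 = 128.
assert cfg["n_heads"] * cfg["d_head"] == cfg["d_model"]

# Global batch = per-device batch * devices * gradient-accumulation steps:
# 32 * 8 * 1 = 256.
assert (cfg["batch_size_per_device"] * cfg["n_devices"]
        * cfg["batches_per_step"] == cfg["batch_size"])

# Each optimizer step consumes batch_size full-length sequences:
# 256 * 1024 = 262144 tokens.
assert cfg["batch_size"] * cfg["n_ctx"] == cfg["tokens_per_step"]

# Step counts are the token budgets divided by tokens per step, floored:
# 10000000000 // 262144 = 38146 and 300000000 // 262144 = 1144.
assert cfg["max_tokens"] // cfg["tokens_per_step"] == cfg["max_steps"]
assert cfg["warmup_tokens"] // cfg["tokens_per_step"] == cfg["warmup_steps"]

# n_params appears to count non-embedding weights only:
# 12 * n_layers * d_model^2 = 12 * 1 * 128^2 = 196608.
assert 12 * cfg["n_layers"] * cfg["d_model"] ** 2 == cfg["n_params"]
```

The last check suggests `n_params` follows the common convention of quoting parameter counts without the embedding and unembedding matrices; with `d_vocab` of 48262 and `d_model` of 128, the embeddings alone would add roughly 6.2M parameters on top of the 196608 listed.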