{
  "n_layers": 2,
  "d_model": 256,
  "n_ctx": 2048,
  "d_head": 32,
  "model_name": "custom",
  "n_heads": 8,
  "d_vocab": 50259,
  "eps": 1e-05,
  "use_attn_result": true,
  "use_attn_scale": true,
  "use_split_qkv_input": false,
  "use_local_attn": false,
  "from_checkpoint": false,
  "init_mode": "gpt2",
  "normalization_type": "LN",
  "device": "cuda",
  "attention_dir": "causal",
  "attn_only": true,
  "initializer_range": 0.05,
  "init_weights": true,
  "scale_attn_by_inverse_layer_idx": false,
  "positional_embedding_type": "shortformer",
  "final_rms": false,
  "d_vocab_out": 50259,
  "parallel_attn_mlp": false,
  "n_params": 524288,
  "use_hook_tokens": false
}