{
  "act_fn": "gelu_new",
  "n_layers": 2,
  "d_model": 256,
  "n_ctx": 2048,
  "d_head": 32,
  "d_mlp": -1,
  "model_name": "custom",
  "n_heads": 8,
  "d_vocab": 50259,
  "eps": 1e-05,
  "use_attn_result": true,
  "use_attn_scale": true,
  "use_split_qkv_input": false,
  "use_local_attn": false,
  "from_checkpoint": false,
  "init_mode": "gpt2",
  "normalization_type": "LN",
  "device": "cuda",
  "attention_dir": "causal",
  "attn_only": true,
  "initializer_range": 0.05,
  "init_weights": false,
  "scale_attn_by_inverse_layer_idx": false,
  "positional_embedding_type": "shortformer",
  "shortformer_pos": true,
  "final_rms": false,
  "d_vocab_out": 50259,
  "parallel_attn_mlp": false,
  "n_params": 524288,
  "use_hook_tokens": false,
  "tokenizer_name": "ArthurConmy/redwood_tokenizer"
}