{
  "activation": "relu_squared",
  "bias": false,
  "d_model": 512,
  "dropout": 0.2,
  "hidden_dim": 2048,
  "mlp": "GLU",
  "num_heads": 32,
  "num_kv_heads": 0,
  "num_layers": 32,
  "seq_len": 256,
  "vocab_size": 50257,
  "weight_tying": true
}
{
  "activation": "relu_squared",
  "bias": false,
  "d_model": 512,
  "dropout": 0.2,
  "hidden_dim": 2048,
  "mlp": "GLU",
  "num_heads": 32,
  "num_kv_heads": 0,
  "num_layers": 32,
  "seq_len": 256,
  "vocab_size": 50257,
  "weight_tying": true
}