arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 512
                n_vocab: 50304
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first
        decoder_config:
          type: ParallelTransformerDecoderBlock
          args:
            attn_config:
              type: GPTNeoXAttention
              args:
                n_embed: 512
                n_pos: 2048
                n_head: 8
                n_key_value_head: 8
                head_size: 64
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
                rope_config:
                  type: MistralRotaryEmbedding
                  args:
                    rotary_head_size: 16
                    n_pos: 2048
                    base: 10000
                    scaling_type: null
                    scaling_factor: null
                perform_bloom_split_head: true
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 512
                n_inner: 2048
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                p_drop_mlp: 0.0
            ln_config:
              type: LayerNorm
              args:
                n_embed: 512
                ln_eps: 1.0e-05
            n_embed: 512
            post_norm: false
            add_cross_attn: false
            share_layer_norm: false
        n_embed: 512
        n_layer: 6
        n_head: 8
        ln_config:
          type: LayerNorm
          args:
            n_embed: 512
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 50304
        n_embed: 512
        bias_lm_head: false
        perform_transform: false
        act_fn_config: null
        ln_config: null