{
  "_name_or_path": "/home/dinalt/ai_assets/models/walsh_instruct",
  "activation_args": {},
  "activation_cls": "torch.nn.GELU",
  "architectures": [
    "HFCausalModel"
  ],
  "attention_args": {
    "beta": 0.25,
    "dropout": 0.1
  },
  "attention_cls": ".CausalSelfAttention",
  "auto_map": {
    "AutoConfig": "modelling_walsh.Config",
    "AutoModelForCausalLM": "modelling_walsh.HFCausalModel"
  },
  "d_embed": 2048,
  "dim_feedforward": 8192,
  "dropout": 0.1,
  "embdding_cls": "torch.nn.Embedding",
  "embedding_args": {},
  "feedforward_args": {
    "beta": 0.25,
    "bias": true
  },
  "feedforward_cls": ".FeedforwardLayer",
  "head_args": {},
  "head_cls": ".Transformer",
  "init_gain": 1.0,
  "layer_args": {
    "alpha": 2.828427124746
  },
  "layer_cls": ".DeepnetLayer",
  "layer_stack_args": {},
  "layer_stack_cls": ".TransformerLayerStack",
  "loss_function": ".causal_loss",
  "max_sequence_length": 16384,
  "model_type": "walsh-causal-v1",
  "norm_args": {
    "normalized_shape": 2048
  },
  "norm_cls": "torch.nn.LayerNorm",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "output_proj_args": {},
  "output_proj_cls": "torch.nn.Linear",
  "pad_index": null,
  "positional_encoder_args": {
    "d_embed": 2048,
    "gain": 0.3333,
    "max_seq": 16384
  },
  "positional_encoder_cls": ".RSWalshPositionalEncoder",
  "torch_dtype": "bfloat16",
  "transformer_args": {},
  "transformer_cls": ".Transformer",
  "transformers_version": "4.37.2",
  "vocab_size": 32000
}