{
  "_name_or_path": "./MoLM-700M-4B",
  "activation_function": "gelu_new",
  "architectures": [
    "ModuleFormerForCausalLM"
  ],
  "att_func": "stickbreaking",
  "att_hidden": 1024,
  "attn_pdrop": 0,
  "aux_loss_type": "mi",
  "aux_loss_weight": 0,
  "block_size": 512,
  "bos_token_id": 50256,
  "embd_pdrop": 0,
  "eos_token_id": 50256,
  "ffd_hidden": 2048,
  "gate_type": "mlp",
  "gating_size": 256,
  "history_length": 512,
  "initializer_range": 0.02,
  "k_att": 4,
  "k_mlp": 4,
  "layer_norm_epsilon": 1e-05,
  "local_size": 1,
  "model_type": "moduleformer",
  "moe_pdrop": 0,
  "moe_type": "moe",
  "n_att_experts": 16,
  "n_ctx": 12288,
  "n_embd": 1024,
  "n_head": 1,
  "n_layer": 24,
  "n_mlp_experts": 32,
  "pre_norm": true,
  "resid_pdrop": 0,
  "sample_topk": 0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.28.1",
  "universal": false,
  "use_cache": true,
  "vocab_size": 50295,
  "world_size": null
}