{ "_name_or_path": "./MoLM-700M-4B", "activation_function": "gelu_new", "architectures": [ "ModuleFormerForCausalLM" ], "att_func": "stickbreaking", "att_hidden": 1024, "attn_pdrop": 0, "aux_loss_type": "mi", "aux_loss_weight": 0, "block_size": 512, "bos_token_id": 50256, "embd_pdrop": 0, "eos_token_id": 50256, "ffd_hidden": 2048, "gate_type": "mlp", "gating_size": 256, "history_length": 512, "initializer_range": 0.02, "k_att": 4, "k_mlp": 4, "layer_norm_epsilon": 1e-05, "local_size": 1, "model_type": "moduleformer", "moe_pdrop": 0, "moe_type": "moe", "n_att_experts": 16, "n_ctx": 12288, "n_embd": 1024, "n_head": 1, "n_layer": 24, "n_mlp_experts": 32, "pre_norm": true, "resid_pdrop": 0, "sample_topk": 0, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.28.1", "universal": false, "use_cache": true, "vocab_size": 50295, "world_size": null }