imfinethx's picture
for cnn_dm
2e144f7
{
"_name_or_path": "checkpoints/microsoft/phi-1_5",
"anyprec": {
"arch_config": {
"layers_name": "layers",
"model_name": "model",
"module_names": [
"self_attn.q_proj",
"self_attn.k_proj",
"self_attn.v_proj",
"self_attn.dense",
"mlp.fc1",
"mlp.fc2"
]
},
"group_count": 1,
"parent_precision": 4,
"seed_precision": 2,
"sparse_numvals": {
"model.layers.0.mlp.fc1": 675311,
"model.layers.0.mlp.fc2": 624542,
"model.layers.0.self_attn.dense": 82454,
"model.layers.0.self_attn.k_proj": 180985,
"model.layers.0.self_attn.q_proj": 156889,
"model.layers.0.self_attn.v_proj": 94332,
"model.layers.1.mlp.fc1": 201243,
"model.layers.1.mlp.fc2": 325345,
"model.layers.1.self_attn.dense": 73540,
"model.layers.1.self_attn.k_proj": 103462,
"model.layers.1.self_attn.q_proj": 99058,
"model.layers.1.self_attn.v_proj": 86333,
"model.layers.10.mlp.fc1": 295445,
"model.layers.10.mlp.fc2": 323451,
"model.layers.10.self_attn.dense": 72926,
"model.layers.10.self_attn.k_proj": 100535,
"model.layers.10.self_attn.q_proj": 94643,
"model.layers.10.self_attn.v_proj": 85469,
"model.layers.11.mlp.fc1": 291450,
"model.layers.11.mlp.fc2": 321914,
"model.layers.11.self_attn.dense": 71540,
"model.layers.11.self_attn.k_proj": 97013,
"model.layers.11.self_attn.q_proj": 89427,
"model.layers.11.self_attn.v_proj": 82468,
"model.layers.12.mlp.fc1": 285509,
"model.layers.12.mlp.fc2": 328599,
"model.layers.12.self_attn.dense": 69830,
"model.layers.12.self_attn.k_proj": 101851,
"model.layers.12.self_attn.q_proj": 94202,
"model.layers.12.self_attn.v_proj": 84071,
"model.layers.13.mlp.fc1": 277413,
"model.layers.13.mlp.fc2": 308466,
"model.layers.13.self_attn.dense": 74257,
"model.layers.13.self_attn.k_proj": 101329,
"model.layers.13.self_attn.q_proj": 94394,
"model.layers.13.self_attn.v_proj": 83090,
"model.layers.14.mlp.fc1": 272080,
"model.layers.14.mlp.fc2": 347434,
"model.layers.14.self_attn.dense": 77486,
"model.layers.14.self_attn.k_proj": 99568,
"model.layers.14.self_attn.q_proj": 97367,
"model.layers.14.self_attn.v_proj": 85949,
"model.layers.15.mlp.fc1": 262687,
"model.layers.15.mlp.fc2": 326298,
"model.layers.15.self_attn.dense": 71091,
"model.layers.15.self_attn.k_proj": 99666,
"model.layers.15.self_attn.q_proj": 114009,
"model.layers.15.self_attn.v_proj": 79472,
"model.layers.16.mlp.fc1": 255098,
"model.layers.16.mlp.fc2": 365511,
"model.layers.16.self_attn.dense": 69991,
"model.layers.16.self_attn.k_proj": 97043,
"model.layers.16.self_attn.q_proj": 101120,
"model.layers.16.self_attn.v_proj": 75770,
"model.layers.17.mlp.fc1": 245724,
"model.layers.17.mlp.fc2": 337114,
"model.layers.17.self_attn.dense": 70591,
"model.layers.17.self_attn.k_proj": 92819,
"model.layers.17.self_attn.q_proj": 91892,
"model.layers.17.self_attn.v_proj": 74253,
"model.layers.18.mlp.fc1": 240816,
"model.layers.18.mlp.fc2": 335322,
"model.layers.18.self_attn.dense": 79137,
"model.layers.18.self_attn.k_proj": 98409,
"model.layers.18.self_attn.q_proj": 123879,
"model.layers.18.self_attn.v_proj": 85293,
"model.layers.19.mlp.fc1": 234256,
"model.layers.19.mlp.fc2": 317669,
"model.layers.19.self_attn.dense": 80277,
"model.layers.19.self_attn.k_proj": 97895,
"model.layers.19.self_attn.q_proj": 121723,
"model.layers.19.self_attn.v_proj": 82971,
"model.layers.2.mlp.fc1": 233518,
"model.layers.2.mlp.fc2": 315355,
"model.layers.2.self_attn.dense": 68242,
"model.layers.2.self_attn.k_proj": 102966,
"model.layers.2.self_attn.q_proj": 98281,
"model.layers.2.self_attn.v_proj": 83855,
"model.layers.20.mlp.fc1": 230157,
"model.layers.20.mlp.fc2": 317412,
"model.layers.20.self_attn.dense": 70557,
"model.layers.20.self_attn.k_proj": 96874,
"model.layers.20.self_attn.q_proj": 117460,
"model.layers.20.self_attn.v_proj": 75849,
"model.layers.21.mlp.fc1": 227363,
"model.layers.21.mlp.fc2": 323600,
"model.layers.21.self_attn.dense": 73035,
"model.layers.21.self_attn.k_proj": 93176,
"model.layers.21.self_attn.q_proj": 124248,
"model.layers.21.self_attn.v_proj": 75505,
"model.layers.22.mlp.fc1": 233020,
"model.layers.22.mlp.fc2": 395456,
"model.layers.22.self_attn.dense": 71502,
"model.layers.22.self_attn.k_proj": 88462,
"model.layers.22.self_attn.q_proj": 162865,
"model.layers.22.self_attn.v_proj": 73909,
"model.layers.23.mlp.fc1": 285355,
"model.layers.23.mlp.fc2": 631745,
"model.layers.23.self_attn.dense": 101963,
"model.layers.23.self_attn.k_proj": 107304,
"model.layers.23.self_attn.q_proj": 260586,
"model.layers.23.self_attn.v_proj": 107005,
"model.layers.3.mlp.fc1": 269841,
"model.layers.3.mlp.fc2": 330081,
"model.layers.3.self_attn.dense": 76738,
"model.layers.3.self_attn.k_proj": 114282,
"model.layers.3.self_attn.q_proj": 110068,
"model.layers.3.self_attn.v_proj": 96670,
"model.layers.4.mlp.fc1": 305604,
"model.layers.4.mlp.fc2": 333478,
"model.layers.4.self_attn.dense": 73404,
"model.layers.4.self_attn.k_proj": 105649,
"model.layers.4.self_attn.q_proj": 102666,
"model.layers.4.self_attn.v_proj": 92391,
"model.layers.5.mlp.fc1": 293406,
"model.layers.5.mlp.fc2": 337582,
"model.layers.5.self_attn.dense": 71678,
"model.layers.5.self_attn.k_proj": 120017,
"model.layers.5.self_attn.q_proj": 121205,
"model.layers.5.self_attn.v_proj": 92099,
"model.layers.6.mlp.fc1": 291972,
"model.layers.6.mlp.fc2": 329924,
"model.layers.6.self_attn.dense": 81259,
"model.layers.6.self_attn.k_proj": 104051,
"model.layers.6.self_attn.q_proj": 100833,
"model.layers.6.self_attn.v_proj": 93397,
"model.layers.7.mlp.fc1": 293548,
"model.layers.7.mlp.fc2": 331966,
"model.layers.7.self_attn.dense": 68519,
"model.layers.7.self_attn.k_proj": 108909,
"model.layers.7.self_attn.q_proj": 103642,
"model.layers.7.self_attn.v_proj": 84278,
"model.layers.8.mlp.fc1": 304480,
"model.layers.8.mlp.fc2": 318568,
"model.layers.8.self_attn.dense": 76294,
"model.layers.8.self_attn.k_proj": 110748,
"model.layers.8.self_attn.q_proj": 103303,
"model.layers.8.self_attn.v_proj": 91497,
"model.layers.9.mlp.fc1": 298086,
"model.layers.9.mlp.fc2": 319091,
"model.layers.9.self_attn.dense": 68561,
"model.layers.9.self_attn.k_proj": 109187,
"model.layers.9.self_attn.q_proj": 103326,
"model.layers.9.self_attn.v_proj": 83167
}
},
"architectures": [
"PhiForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": null,
"embd_pdrop": 0.0,
"eos_token_id": null,
"hidden_act": "gelu_new",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 8192,
"layer_norm_eps": 1e-05,
"max_position_embeddings": 2048,
"model_type": "phi",
"num_attention_heads": 32,
"num_hidden_layers": 24,
"num_key_value_heads": 32,
"partial_rotary_factor": 0.5,
"qk_layernorm": false,
"resid_pdrop": 0.0,
"rope_scaling": null,
"rope_theta": 10000.0,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.39.3",
"use_cache": true,
"vocab_size": 51200
}