medardif committed
Commit: a0855c5
Parent: 9cd37af

Upload LlamaForCausalLM

config.json CHANGED
@@ -1,12 +1,12 @@
 {
-  "_name_or_path": "meta-llama/Meta-Llama-3-8B",
+  "_name_or_path": "meta-llama/Meta-Llama-3-8B-instruct",
   "architectures": [
     "LlamaForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 128000,
-  "eos_token_id": 128001,
+  "eos_token_id": 128009,
   "hidden_act": "silu",
   "hidden_size": 4096,
   "initializer_range": 0.02,
generation_config.json CHANGED
@@ -1,7 +1,10 @@
 {
   "bos_token_id": 128000,
   "do_sample": true,
-  "eos_token_id": 128001,
+  "eos_token_id": [
+    128001,
+    128009
+  ],
   "max_length": 4096,
   "temperature": 0.6,
   "top_p": 0.9,
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7d41473ac2c2bf03fee50a52fc202e433fabb9c9c2f32a66f7d290634d59a0fa
-size 4994513900
+oid sha256:2f4d36ebcda7dafe986b657b34d47958ecdd37b6ae2c997a140afaddee73dd29
+size 4983490991
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2afe22afc39607165c7dc0c62dc135f8558d0a0406f9ea25133fd445c0690b11
-size 2894068673
+oid sha256:1ab23e701e017b606c350c3ec215e11c37ca4ed81ab22086f1f377075b536c52
+size 2988978828
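
The two entries above are Git LFS pointers, not the weights themselves: `oid` is the SHA-256 of the actual shard and `size` its byte length. A minimal sketch to verify a downloaded shard against its pointer (file name and expected values copied from the new pointer for shard 1):

```python
# Minimal sketch: check that a local shard matches the oid/size recorded
# in its Git LFS pointer.
import hashlib
from pathlib import Path

path = Path("model-00001-of-00002.safetensors")
expected_oid = "2f4d36ebcda7dafe986b657b34d47958ecdd37b6ae2c997a140afaddee73dd29"
expected_size = 4983490991

assert path.stat().st_size == expected_size, "size mismatch"

sha = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
        sha.update(chunk)
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("shard 1 matches its LFS pointer")
```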
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 7888342613
+    "total_size": 7972228691
   },
   "weight_map": {
     "lm_head.modules_to_save.default.weight": "model-00002-of-00002.safetensors",
@@ -993,15 +993,15 @@
     "model.layers.23.self_attn.v_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00001-of-00002.safetensors",
     "model.layers.23.self_attn.v_proj.lora_A.default.weight": "model-00001-of-00002.safetensors",
     "model.layers.23.self_attn.v_proj.lora_B.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.base_layer.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.base_layer.weight.absmax": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.base_layer.weight.nested_absmax": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.base_layer.weight.nested_quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.base_layer.weight.quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.lora_A.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.down_proj.lora_B.default.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.base_layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.base_layer.weight.absmax": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.base_layer.weight.nested_absmax": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.base_layer.weight.nested_quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.base_layer.weight.quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.lora_A.default.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.lora_B.default.weight": "model-00002-of-00002.safetensors",
     "model.layers.24.mlp.gate_proj.base_layer.weight": "model-00001-of-00002.safetensors",
     "model.layers.24.mlp.gate_proj.base_layer.weight.absmax": "model-00001-of-00002.safetensors",
     "model.layers.24.mlp.gate_proj.base_layer.weight.nested_absmax": "model-00001-of-00002.safetensors",
@@ -1010,15 +1010,15 @@
     "model.layers.24.mlp.gate_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00001-of-00002.safetensors",
     "model.layers.24.mlp.gate_proj.lora_A.default.weight": "model-00001-of-00002.safetensors",
     "model.layers.24.mlp.gate_proj.lora_B.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.base_layer.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.base_layer.weight.absmax": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.base_layer.weight.nested_absmax": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.base_layer.weight.nested_quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.base_layer.weight.quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.lora_A.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.mlp.up_proj.lora_B.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.base_layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.base_layer.weight.absmax": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.base_layer.weight.nested_absmax": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.base_layer.weight.nested_quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.base_layer.weight.quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.lora_A.default.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.lora_B.default.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.24.self_attn.k_proj.base_layer.weight": "model-00001-of-00002.safetensors",
     "model.layers.24.self_attn.k_proj.base_layer.weight.absmax": "model-00001-of-00002.safetensors",
     "model.layers.24.self_attn.k_proj.base_layer.weight.nested_absmax": "model-00001-of-00002.safetensors",
@@ -1077,14 +1077,14 @@
     "model.layers.25.mlp.up_proj.lora_A.default.weight": "model-00002-of-00002.safetensors",
     "model.layers.25.mlp.up_proj.lora_B.default.weight": "model-00002-of-00002.safetensors",
     "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.base_layer.weight": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.base_layer.weight.absmax": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.base_layer.weight.nested_absmax": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.base_layer.weight.nested_quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.base_layer.weight.quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.lora_A.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.k_proj.lora_B.default.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.base_layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.base_layer.weight.absmax": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.base_layer.weight.nested_absmax": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.base_layer.weight.nested_quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.base_layer.weight.quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.lora_A.default.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.lora_B.default.weight": "model-00002-of-00002.safetensors",
     "model.layers.25.self_attn.o_proj.base_layer.weight": "model-00002-of-00002.safetensors",
     "model.layers.25.self_attn.o_proj.base_layer.weight.absmax": "model-00002-of-00002.safetensors",
     "model.layers.25.self_attn.o_proj.base_layer.weight.nested_absmax": "model-00002-of-00002.safetensors",
@@ -1093,22 +1093,22 @@
     "model.layers.25.self_attn.o_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00002-of-00002.safetensors",
     "model.layers.25.self_attn.o_proj.lora_A.default.weight": "model-00002-of-00002.safetensors",
     "model.layers.25.self_attn.o_proj.lora_B.default.weight": "model-00002-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.base_layer.weight": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.base_layer.weight.absmax": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.base_layer.weight.nested_absmax": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.base_layer.weight.nested_quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.base_layer.weight.quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.lora_A.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.q_proj.lora_B.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.base_layer.weight": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.base_layer.weight.absmax": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.base_layer.weight.nested_absmax": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.base_layer.weight.nested_quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.base_layer.weight.quant_map": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.lora_A.default.weight": "model-00001-of-00002.safetensors",
-    "model.layers.25.self_attn.v_proj.lora_B.default.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.base_layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.base_layer.weight.absmax": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.base_layer.weight.nested_absmax": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.base_layer.weight.nested_quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.base_layer.weight.quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.lora_A.default.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.lora_B.default.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.base_layer.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.base_layer.weight.absmax": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.base_layer.weight.nested_absmax": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.base_layer.weight.nested_quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.base_layer.weight.quant_map": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.base_layer.weight.quant_state.bitsandbytes__nf4": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.lora_A.default.weight": "model-00002-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.lora_B.default.weight": "model-00002-of-00002.safetensors",
     "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
     "model.layers.26.mlp.down_proj.base_layer.weight": "model-00002-of-00002.safetensors",
     "model.layers.26.mlp.down_proj.base_layer.weight.absmax": "model-00002-of-00002.safetensors",