NeMo
okuchaiev committed
Commit 365b81f (1 parent: e3d6f77)

Add files using large-upload tool

Files changed (1)
  1. model_config.yaml +261 -0
model_config.yaml ADDED
@@ -0,0 +1,261 @@
+ mcore_gpt: true
+ micro_batch_size: 1
+ global_batch_size: 256
+ tensor_model_parallel_size: 8
+ pipeline_model_parallel_size: 4
+ virtual_pipeline_model_parallel_size: null
+ encoder_seq_length: 4096
+ max_position_embeddings: 4096
+ num_layers: 96
+ hidden_size: 18432
+ ffn_hidden_size: 73728
+ num_attention_heads: 96
+ init_method_std: 0.0063
+ use_scaled_init_method: true
+ hidden_dropout: 0.0
+ attention_dropout: 0.0
+ ffn_dropout: 0.0
+ kv_channels: null
+ apply_query_key_layer_scaling: true
+ normalization: layernorm1p
+ layernorm_epsilon: 1.0e-05
+ do_layer_norm_weight_decay: false
+ make_vocab_size_divisible_by: 128
+ pre_process: true
+ post_process: true
+ persist_layer_norm: true
+ bias: false
+ activation: squared-relu
+ headscale: false
+ transformer_block_type: pre_ln
+ openai_gelu: false
+ normalize_attention_scores: true
+ position_embedding_type: rope
+ rotary_percentage: 0.5
+ attention_type: multihead
+ share_embeddings_and_output_weights: false
+ num_query_groups: 8
+ tokenizer:
+   library: sentencepiece
+   type: null
+   model: nemo:8223bf8eaa194eb8920af568bb52e2d0_megatron_2.model
+   vocab_file: null
+   merge_file: null
+   tokenizer_model: nemo:eb5528fdec5c4083affa2c97958eeef7_megatron_2.model
+   sentencepiece_legacy: false
+ native_amp_init_scale: 4294967296
+ native_amp_growth_interval: 1000
+ hysteresis: 2
+ fp32_residual_connection: false
+ fp16_lm_cross_entropy: false
+ megatron_amp_O2: true
+ grad_allreduce_chunk_size_mb: 125
+ grad_div_ar_fusion: true
+ gradient_accumulation_fusion: false
+ bias_activation_fusion: false
+ bias_dropout_add_fusion: false
+ masked_softmax_fusion: true
+ seed: 1234
+ resume_from_checkpoint: null
+ use_cpu_initialization: false
+ onnx_safe: false
+ apex_transformer_log_level: 30
+ gradient_as_bucket_view: false
+ sync_batch_comm: false
+ activations_checkpoint_granularity: null
+ activations_checkpoint_method: null
+ activations_checkpoint_num_layers: 1
+ num_micro_batches_with_partial_activation_checkpoints: null
+ activations_checkpoint_layers_per_pipeline: null
+ sequence_parallel: false
+ transformer_engine: false
+ fp8: false
+ fp8_e4m3: false
+ fp8_hybrid: false
+ fp8_margin: 0
+ fp8_interval: 1
+ fp8_amax_history_len: 1
+ fp8_amax_compute_algo: most_recent
+ reduce_amax: true
+ use_emha: false
+ optim:
+   name: distributed_fused_adam
+   lr: 3.002e-07
+   weight_decay: 0.1
+   betas:
+   - 0.9
+   - 0.98
+   sched:
+     name: CosineAnnealing
+     warmup_steps: 10
+     constant_steps: 400
+     min_lr: 3.0e-07
+   bucket_cap_mb: 200
+   overlap_grad_sync: false
+   contiguous_grad_buffer: true
+ precision: bf16-mixed
+ data:
+   chat: true
+   chat_prompt_tokens:
+     system_turn_start: <extra_id_0>
+     turn_start: <extra_id_1>
+     label_start: <extra_id_2>
+     end_of_turn: '
+
+       '
+     end_of_name: '
+
+       '
+   sample: true
+   num_workers: 2
+   dataloader_type: single
+   train_ds:
+     file_path: /dataset/daring-anteater_commercial.shuf.removelong.jsonl
+     global_batch_size: 128
+     micro_batch_size: 1
+     shuffle: true
+     memmap_workers: null
+     max_seq_length: 4096
+     min_seq_length: 1
+     drop_last: true
+     concat_sampling_probabilities: null
+     label_key: output
+     add_eos: false
+     add_sep: false
+     add_bos: false
+     truncation_field: input
+     index_mapping_dir: /indexmap_dir
+     prompt_template: '<extra_id_0>System
+
+       {system message}
+
+       <extra_id_1>User
+
+       {turn 1 user message}
+
+       <extra_id_1>Assistant
+
+       <extra_id_2>{turn 1 assistant label}
+
+       {turn 1 assistant message}
+
+       <extra_id_1>User
+
+       {turn 2 user message}
+
+       <extra_id_1>Assistant
+
+       <extra_id_2>{turn 2 assistant label}
+
+       {turn 2 assistant message}
+
+       <extra_id_1>'
+     hf_dataset: true
+     truncation_method: right
+   validation_ds:
+     file_path: /dataset/daring-anteater_commercial.shuf.removelong.jsonl
+     names: null
+     global_batch_size: 128
+     micro_batch_size: 1
+     shuffle: false
+     memmap_workers: null
+     max_seq_length: 4096
+     min_seq_length: 1
+     drop_last: false
+     label_key: output
+     add_eos: false
+     add_sep: false
+     add_bos: false
+     write_predictions_to_file: false
+     output_file_path_prefix: null
+     truncation_field: input
+     index_mapping_dir: /indexmap_dir
+     prompt_template: '<extra_id_0>System
+
+       {system message}
+
+       <extra_id_1>User
+
+       {turn 1 user message}
+
+       <extra_id_1>Assistant
+
+       <extra_id_2>{turn 1 assistant label}
+
+       {turn 1 assistant message}
+
+       <extra_id_1>User
+
+       {turn 2 user message}
+
+       <extra_id_1>Assistant
+
+       <extra_id_2>{turn 2 assistant label}
+
+       {turn 2 assistant message}
+
+       <extra_id_1>'
+     tokens_to_generate: 32
+     hf_dataset: true
+     truncation_method: right
+     metric:
+       name: loss
+       average: null
+       num_classes: null
+   test_ds:
+     prompt_template: '<extra_id_0>System
+
+       {system message}
+
+       <extra_id_1>User
+
+       {turn 1 user message}
+
+       <extra_id_1>Assistant
+
+       <extra_id_2>{turn 1 assistant label}
+
+       {turn 1 assistant message}
+
+       <extra_id_1>User
+
+       {turn 2 user message}
+
+       <extra_id_1>Assistant
+
+       <extra_id_2>{turn 2 assistant label}
+
+       {turn 2 assistant message}
+
+       <extra_id_1>'
+   data_impl: jsonl
+   splits_string: null
+   seq_length: 4096
+   skip_warmup: true
+   reset_position_ids: false
+   reset_attention_mask: false
+   eod_mask_loss: false
+   index_mapping_dir: /indexmap_dir
+   data_prefix:
+     train:
+     - /datasets/v30_benign-walrus_clip153600.jsonl
+     validation:
+     - /datasets/v30_benign-walrus_clip153600.jsonl
+     test:
+     - /datasets/v30_benign-walrus_clip153600.jsonl
+ answer_only_loss: true
+ restore_from_path: /models/340B_100p_CT_100B
+ save_nemo_on_validation_end: true
+ use_flash_attention: null
+ pipeline_model_parallel_split_rank: 0
+ dpo:
+   log_prob_forward_micro_batch_size: 2
+   ref_policy_kl_penalty: 0.3
+   average_log_probs: false
+   sft_loss_coeff: 1.0e-05
+   optimize_ref_policy_kl_penalty: false
+   preference_loss: reward_rev_dpo
+   gt_reward_scale: 1.0
+ apply_rope_fusion: false
+ target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel
+ nemo_version: 1.22.0