train_input:
  data_processor: "GptHDF5MapDataProcessor"
  data_dir: "./hdf5/train"
  batch_size: 32
  repeat: True
  shuffle: True
  shuffle_seed: 1337
  num_workers: 8
  prefetch_factor: 10
  persistent_workers: True  # Important to avoid re-seeding at each epoch
  use_vsl: False  # IMPORTANT!

eval_input:
  data_processor: "GptHDF5MapDataProcessor"
  data_dir: "./hdf5/val"
  batch_size: 32
  shuffle: False
  persistent_workers: True

model:
  attention_dropout_rate: 0.0
  attention_module: multiquery_attention
  attention_type: scaled_dot_product
  dropout_rate: 0.0
  embedding_dropout_rate: 0.0
  embedding_layer_norm: false
  extra_attention_params:
    num_kv_groups: 4  # grouped-query attention: 32 query heads / 4 KV groups = 8 queries per KV head
  filter_size: 5632
  hidden_size: 2048
  initializer_range: 0.02
  layer_norm_epsilon: 1e-05
  max_position_embeddings: 2048
  mixed_precision: true
  nonlinearity: swiglu
  norm_first: true
  norm_type: rmsnorm
  num_heads: 32
  num_hidden_layers: 22
  pos_scaling_factor: 1.0
  position_embedding_type: rotary
  rotary_dim: 64
  share_embedding_weights: false
  use_bias_in_output: false
  use_ff_layer1_dropout: false
  use_ffn_bias: false
  use_ffn_bias_in_attention: false
  use_projection_bias_in_attention: false
  vocab_size: 32000
  # Task-specific
  loss_scaling: "num_tokens"
  loss_weight: 1.0
  # Cerebras parameters
  fp16_type: "cbfloat16"

optimizer:
  optimizer_type: "AdamW"
  betas: [0.9, 0.95]
  correct_bias: True
  weight_decay: 0.0
  learning_rate:
    - scheduler: "Linear"
      initial_learning_rate: 3.0e-07
      end_learning_rate: 2.0e-5
      total_iters: 270  # warmup: 5-10% of the total number of steps
    - scheduler: "CosineDecay"
      initial_learning_rate: 2.0e-5
      end_learning_rate: 1.0e-6
      total_iters: 2430  # decay: remaining 90-95% of the total number of steps
  max_gradient_norm: 1.0
  log_summaries: True
  loss_scaling_factor: "dynamic"

runconfig:
  max_steps: 2700  # num_epochs * (data_size / batch_size)
  eval_frequency: 450
  eval_steps: 54  # eval_data_size / eval_batch_size
  checkpoint_steps: 900  # save every epoch
  log_steps: 1
  save_initial_checkpoint: False
  precision_opt_level: 1
  seed: 1
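The step counts above must stay mutually consistent if you change the dataset or batch size: the two schedulers' `total_iters` (270 + 2430) must sum to `runconfig.max_steps` (2700), and the Linear warmup phase should remain 5-10% of the run. Below is a minimal sanity-check sketch; it assumes the config is saved as `params.yaml` (a hypothetical path) and uses only PyYAML:

```python
import yaml

# Load the training configuration (the filename is an assumption for this sketch).
with open("params.yaml") as f:
    params = yaml.safe_load(f)

schedule = params["optimizer"]["learning_rate"]
max_steps = params["runconfig"]["max_steps"]

# The warmup + decay phases should cover exactly max_steps.
total_scheduled = sum(phase["total_iters"] for phase in schedule)
assert total_scheduled == max_steps, (
    f"LR schedule covers {total_scheduled} steps but max_steps is {max_steps}"
)

# The Linear warmup phase is meant to be 5-10% of the run.
warmup_fraction = schedule[0]["total_iters"] / max_steps
assert 0.05 <= warmup_fraction <= 0.10, (
    f"warmup is {warmup_fraction:.1%} of the run"
)

print(f"Schedule OK: {warmup_fraction:.0%} warmup over {max_steps} total steps")
```

Note how the other step counts fall out of the same arithmetic: the comments imply roughly 900 steps per epoch, so `max_steps: 2700` corresponds to three epochs and `checkpoint_steps: 900` saves a checkpoint once per epoch, as the inline comment states.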