train_input:
  data_processor: "GptHDF5MapDataProcessor"
  data_dir: "./hdf5/train"
  batch_size: 32
  repeat: True
  shuffle: True
  shuffle_seed: 1337
  num_workers: 8
  prefetch_factor: 10
  persistent_workers: True  # Important to avoid re-seeding at each epoch
  use_vsl: False  # IMPORTANT!

eval_input:
  data_processor: "GptHDF5MapDataProcessor"
  data_dir: "./hdf5/val"
  batch_size: 32
  shuffle: False
  persistent_workers: True

model:
  attention_dropout_rate: 0.0
  attention_module: multiquery_attention
  attention_type: scaled_dot_product
  dropout_rate: 0.0
  embedding_dropout_rate: 0.0
  embedding_layer_norm: false
  extra_attention_params:
    num_kv_groups: 4  # grouped-query attention: 32 query heads / 4 KV groups = 8 queries per KV head
  filter_size: 5632
  hidden_size: 2048
  initializer_range: 0.02
  layer_norm_epsilon: 1e-05
  max_position_embeddings: 2048
  mixed_precision: true
  nonlinearity: swiglu
  norm_first: true
  norm_type: rmsnorm
  num_heads: 32
  num_hidden_layers: 22
  pos_scaling_factor: 1.0
  position_embedding_type: rotary
  rotary_dim: 64
  share_embedding_weights: false
  use_bias_in_output: false
  use_ff_layer1_dropout: false
  use_ffn_bias: false
  use_ffn_bias_in_attention: false
  use_projection_bias_in_attention: false
  vocab_size: 32000
  # Task-specific
  loss_scaling: "num_tokens"
  loss_weight: 1.0
  # Cerebras parameters
  fp16_type: "cbfloat16"

optimizer:
  optimizer_type: "AdamW"
  betas: [0.9, 0.95]
  correct_bias: True
  weight_decay: 0.0
  learning_rate:
    - scheduler: "Linear"
      initial_learning_rate: 3.0e-07
      end_learning_rate: 2.0e-5
      total_iters: 270  # warmup: 5-10% of the total number of steps
    - scheduler: "CosineDecay"
      initial_learning_rate: 2.0e-5
      end_learning_rate: 1.0e-6
      total_iters: 2430  # decay: remaining 90-95% of the total number of steps
  max_gradient_norm: 1.0
  log_summaries: True
  loss_scaling_factor: "dynamic"

runconfig:
  max_steps: 2700  # num_epochs * (data_size / batch_size)
  eval_frequency: 450
  eval_steps: 54  # eval_data_size / eval_batch_size
  checkpoint_steps: 900  # save every epoch
  log_steps: 1
  save_initial_checkpoint: False
  precision_opt_level: 1
  seed: 1
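The step counts above must stay mutually consistent if you change the dataset or batch size: the two schedulers' `total_iters` (270 + 2430) must sum to `runconfig.max_steps` (2700), and the Linear warmup phase should remain 5-10% of the run. Below is a minimal sanity-check sketch; it assumes the config is saved as `params.yaml` (a hypothetical path) and uses only PyYAML:

```python
import yaml

# Load the training configuration (the filename is an assumption for this sketch).
with open("params.yaml") as f:
    params = yaml.safe_load(f)

schedule = params["optimizer"]["learning_rate"]
max_steps = params["runconfig"]["max_steps"]

# The warmup + decay phases should cover exactly max_steps.
total_scheduled = sum(phase["total_iters"] for phase in schedule)
assert total_scheduled == max_steps, (
    f"LR schedule covers {total_scheduled} steps but max_steps is {max_steps}"
)

# The Linear warmup phase is meant to be 5-10% of the run.
warmup_fraction = schedule[0]["total_iters"] / max_steps
assert 0.05 <= warmup_fraction <= 0.10, (
    f"warmup is {warmup_fraction:.1%} of the run"
)

print(f"Schedule OK: {warmup_fraction:.0%} warmup over {max_steps} total steps")
```

Note how the other step counts fall out of the same arithmetic: the comments imply roughly 900 steps per epoch, so `max_steps: 2700` corresponds to three epochs and `checkpoint_steps: 900` saves a checkpoint once per epoch, as the inline comment states.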