File size: 3,510 Bytes
4c44549 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
accum_freq: 1 attn_activation: None attn_name: auto attn_seq_scalar: None attn_seq_scalar_alpha: None average: None average_coefficients: None beta1: 0.9 beta2: 0.95 checkpoint_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/checkpoints copy_codebase: False data_key: txt dataset_manifest: None dataset_resampled: False dataset_type: auto ddp_static_graph: False debug: False delete_previous_checkpoint: True device: cuda:0 disable_buffer: False dist_backend: nccl dist_url: env:// distill_model: None distill_pretrained: None distributed: False epochs: 5 epochs_cooldown: None eps: 1e-08 experimental_meta_device: False ffn_type: swiglu force_distributed: False force_min_lr: 0.0 fsdp: False fsdp_amp: False fsdp_backward_prefetch: False fsdp_checkpoint: False fsdp_cpu_offload: False fsdp_hybrid: False fsdp_hybrid_o2: False fsdp_limit_all_gathers: False fsdp_pure_bf16: False fsdp_use_orig_params: False global_batch_size: 1 global_val_batch_size: 1 grad_checkpointing: False grad_clip_norm: 1.0 hf_fsdp_block: None hf_model: None hf_seq_len: None ignore_parse_errors: False load_pretrained_state: False local_rank: 0 log_every_n_steps: 20 log_level: 20 log_local: False log_logit_mean: False log_path: /admin/home-sy/dcnlp_logs/c4_original-open_lm_7b-1.0/out.log logs: /admin/home-sy/dcnlp_logs lr: 0.0003 lr_cooldown_end: 3e-05 lr_cooldown_power: 1.0 lr_scheduler: cosine model: open_lm_7b model_norm: lp_layer_norm moe_capacity_factor: 1.25 moe_expert_model_parallelism: False moe_freq: 0 moe_loss_weight: 0.1 moe_num_experts: None moe_top_k: 2 moe_weight_parallelism: False multiple_data_passes: False name: c4_original-open_lm_7b-1.0 no_set_device_rank: False optimizer: adamw per_gpu_batch_size: 1 per_gpu_val_batch_size: 1 positional_embedding_type: rotary precision: amp_bfloat16 pretrained: None qk_norm: True rank: 0 remote_sync: s3://dcnlp-west/dcnlp_experiments_v3 remote_sync_frequency: 300 remote_sync_protocol: s3 report_to: resume: s3://dcnlp-west/dcnlp_experiments_v3/c4_original-open_lm_7b-1.0/checkpoints/epoch_17.pt save_frequency: 1 save_most_recent: False seed: 124 seq_len: 2048 skip_scheduler: False squash_mask_left: True target_mask_individual: 50400 target_mask_left: 50300 tensorboard: False tensorboard_path: torchcompile: False torchscript: False trace: False train_data: None train_data_mix_weights: None train_data_upsampling_factors: None train_num_samples: None use_bn_sync: False use_bnb_linear: None val_data: ['training/eval_data/val_tok_mult/de-en/val_de-en_000.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_010.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_020.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_030.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_040.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_050.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_060.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_070.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_080.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_090.tar', 'training/eval_data/val_tok_mult/de-en/val_de-en_100.tar'] val_data_key: ['json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz', 'json.gz'] val_frequency: 5 val_iter_ci: 10000 val_max_pop_ci: 300000 val_num_samples: None val_seq_ci: True val_tok_ci: True vocab_size: 50432 wandb: False wandb_notes: wandb_project_name: open-lm warmup: 5000 wd: 0.33 workers: 2 world_size: 1 z_loss_coefficient: 0.0001 |