mtasic85 committed
Commit 33265a4
1 Parent(s): e03ba1c

pretrain model

Files changed (1): scripts/pretrain-model.yaml (+5 -4)
scripts/pretrain-model.yaml
@@ -52,7 +52,7 @@ data:
 
   init_args:
     data_path: "../pretrain-data/"
-    num_workers: 16
+    num_workers: 32
 
 # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
 train:
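Note: num_workers here is forwarded to the data module's loader and sets the number of parallel data-loading worker processes; doubling it from 16 to 32 presumably assumes the host has enough CPU cores to keep the GPUs fed.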
@@ -77,7 +77,8 @@ train:
 
   # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
   # max_tokens: 3000000000000
-  max_tokens: 36852166560 # 3597088 * 2049 * 5
+  # max_tokens: 36852166560 # 3597088 * 2049 * 5
+  max_tokens: 22111299936 # 3597088 * 2049 * 3
 
   # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
   max_steps:
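Note: the max_tokens budget is dataset size × sequence length × passes. Assuming the dataset holds 3,597,088 sequences of 2,049 tokens each (2,048 plus one shifted-label token, the usual litgpt convention), one pass is 3,597,088 × 2,049 = 7,370,433,312 tokens; the old value 36,852,166,560 is five passes and the new value 22,111,299,936 is three, so this commit shortens pretraining from roughly five epochs over the data to three.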
@@ -114,8 +115,8 @@ eval:
 # Optimizer-related arguments
 optimizer:
   # class_path: torch.optim.AdamW
-  # class_path: grokadamw.GrokAdamW
-  class_path: bitsandbytes.optim.AdamW8bit
+  class_path: grokadamw.GrokAdamW
+  # class_path: bitsandbytes.optim.AdamW8bit
   # class_path: bitsandbytes.optim.PagedAdamW8bit
 
   init_args:
 
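The optimizer swap (bitsandbytes.optim.AdamW8bit back to grokadamw.GrokAdamW) works because litgpt configs follow the jsonargparse convention of a dotted class_path plus init_args keyword arguments. Below is a minimal sketch of how such a pair can be resolved into an optimizer instance; instantiate_optimizer and the lr value are illustrative assumptions, not litgpt's actual loader.

# Minimal sketch (not litgpt's actual loader) of resolving a
# ``class_path`` / ``init_args`` pair like the one in the diff above.
import importlib

def instantiate_optimizer(config, model_params):
    # Split "grokadamw.GrokAdamW" into module and class name, import
    # the module, and look the class up on it.
    module_name, class_name = config["class_path"].rsplit(".", 1)
    optimizer_cls = getattr(importlib.import_module(module_name), class_name)
    # PyTorch-style optimizers take the parameters first; the YAML's
    # init_args become keyword arguments.
    return optimizer_cls(model_params, **config.get("init_args", {}))

# Hypothetical usage matching the new config (the lr value is illustrative):
# optimizer = instantiate_optimizer(
#     {"class_path": "grokadamw.GrokAdamW", "init_args": {"lr": 1e-3}},
#     model.parameters(),
# )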