nroggendorff committed on
Commit c62bc4a
Parent: 240511e

Update train.py

Files changed (1)
  1. train.py +4 -4
train.py CHANGED
@@ -36,7 +36,7 @@ class Config:
         self.INSTRUCT_FINETUNE_BOOL = False
 
         # Training steps and warmup
-        self.FACTOR = 12 ** 3 // 3
+        self.FACTOR = 12 ** 3 // 2
         self.TOTAL_STEPS = (self.SHARD_SIZE * self.EPOCHS) // (self.BATCH_SIZE * self.GRADIENT_ACCUMULATION_STEPS)
         self.WARMUP_STEPS = int(self.TOTAL_STEPS * 0.1)
 
@@ -160,11 +160,11 @@ def create_model(tokenizer):
         vocab_size=tokenizer.vocab_size,
         hidden_size=config.FACTOR,
         intermediate_size=config.FACTOR * 4,
-        num_hidden_layers=12,
-        num_attention_heads=12,
+        num_hidden_layers=config.FACTOR // 2 ** 4,
+        num_attention_heads=config.FACTOR // 2 ** 5,
         max_position_embeddings=config.MAX_SEQ_LENGTH,
         rms_norm_eps=1e-5,
-        initializer_range=0.02,
+        initializer_range=2e-2,
         use_cache=True,
         pad_token_id=tokenizer.pad_token_id,
         bos_token_id=tokenizer.bos_token_id,
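For reference, here is what the new arithmetic works out to. The sketch below is illustrative, not code from the repo (the derived_shape helper and the printed values are assumptions of mine): FACTOR rises from 12 ** 3 // 3 = 576 to 12 ** 3 // 2 = 864, and the layer and head counts are now derived from FACTOR rather than hardcoded at 12. Note that initializer_range=2e-2 is the same value as 0.02, so that line is a notational change only.

# Sketch (not from the repo): how FACTOR propagates into the model shape
# before and after this commit.

def derived_shape(factor, num_layers, num_heads):
    return {
        "hidden_size": factor,
        "intermediate_size": factor * 4,
        "num_hidden_layers": num_layers,
        "num_attention_heads": num_heads,
        "head_dim": factor // num_heads,  # must divide evenly
    }

old_factor = 12 ** 3 // 3  # 576
new_factor = 12 ** 3 // 2  # 864

before = derived_shape(old_factor, 12, 12)
after = derived_shape(new_factor, new_factor // 2 ** 4, new_factor // 2 ** 5)

print(before)  # hidden 576, layers 12, heads 12, head_dim 48
print(after)   # hidden 864, layers 54, heads 27, head_dim 32

Net effect: the model grows from 12 layers of hidden size 576 (head dimension 48) to 54 layers of hidden size 864 (head dimension 32), a substantially larger network, while the remaining config kwargs (vocab_size, max_position_embeddings, rms_norm_eps, and the token ids) are untouched.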