Files changed (1)
  1. train.py +63 -53
train.py CHANGED
@@ -21,33 +21,56 @@ handler = StreamHandler()
 logger.addHandler(handler)
 
 class Config:
-    # Model and training hyperparameters
-    BATCH_SIZE = 16
-    EPOCHS = 3
-    LEARNING_RATE = 2e-4
-    MAX_SEQ_LENGTH = 512
-    VOCAB_SIZE = 32000
-    FP16 = True
-    WEIGHT_DECAY = 1e-3
-    GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // 4
-
-    # Dataset configurations
-    INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
-    INSTRUCT_DATASET = "nroggendorff/elephant"
-    SHARD_SIZE = int(2e+5)
-
-    # Output and repo settings
-    OUTPUT_REPO = "nroggendorff/smallama"
-    PUSH_TO_HUB = True
-    INSTRUCT_FINETUNE_BOOL = False
-
-    # Training steps and warmup
-    FACTOR = 12 ** 3 // 3
-    TOTAL_STEPS = (SHARD_SIZE * EPOCHS) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)
-    WARMUP_STEPS = int(TOTAL_STEPS * 0.1)
-
-    # Initial state for shard offset
-    INIT = 0
+    def __init__(self):
+        # Model and training hyperparameters
+        self.BATCH_SIZE = 16
+        self.EPOCHS = 3
+        self.LEARNING_RATE = 2e-4
+        self.MAX_SEQ_LENGTH = 512
+        self.VOCAB_SIZE = 32000
+        self.FP16 = True
+        self.WEIGHT_DECAY = 1e-3
+        self.GRADIENT_ACCUMULATION_STEPS = self.BATCH_SIZE // 4
+
+        # Dataset configurations
+        self.INPUT_DATASET = "HuggingFaceTB/smollm-corpus"
+        self.INSTRUCT_DATASET = "nroggendorff/elephant"
+        self.SHARD_SIZE = int(2e+5)
+
+        # Output and repo settings
+        self.OUTPUT_REPO = "nroggendorff/smallama"
+        self.PUSH_TO_HUB = True
+        self.INSTRUCT_FINETUNE_BOOL = False
+
+        # Training steps and warmup
+        self.FACTOR = 12 ** 3 // 3
+        self.TOTAL_STEPS = (self.SHARD_SIZE * self.EPOCHS) // (self.BATCH_SIZE * self.GRADIENT_ACCUMULATION_STEPS)
+        self.WARMUP_STEPS = int(self.TOTAL_STEPS * 0.1)
+
+        # Initial state for shard offset
+        self.INIT = 0
+
+        # ignore
+        self.getConfig = lambda: self._args()
+
+    # @staticmethod
+    def _args(self):
+        return SFTConfig(
+            output_dir="model",
+            num_train_epochs=self.EPOCHS,
+            per_device_train_batch_size=self.BATCH_SIZE,
+            learning_rate=self.LEARNING_RATE,
+            warmup_steps=self.WARMUP_STEPS,
+            weight_decay=self.WEIGHT_DECAY,
+            gradient_accumulation_steps=self.GRADIENT_ACCUMULATION_STEPS,
+            fp16=self.FP16,
+            save_steps=int(self.WARMUP_STEPS * 5),
+            logging_steps=int(self.WARMUP_STEPS),
+            save_total_limit=2,
+            report_to="none",
+        )
+
+config = Config().getConfig()
 
 class Space:
     def __init__(self):
@@ -71,14 +94,14 @@ def encode_decode(texts, tokenizer):
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
     tokenized_texts = tokenizer(
-        texts, padding="max_length", truncation=True, max_length=Config.MAX_SEQ_LENGTH, return_tensors="pt"
+        texts, padding="max_length", truncation=True, max_length=config.MAX_SEQ_LENGTH, return_tensors="pt"
     ).input_ids
-    return tokenizer.batch_decode(tokenized_texts) if tokenized_texts.dim() >= 1 else [tokenizer.pad_token * Config.MAX_SEQ_LENGTH]
+    return tokenizer.batch_decode(tokenized_texts) if tokenized_texts.dim() >= 1 else [tokenizer.pad_token * config.MAX_SEQ_LENGTH]
 
 def create_tokenizer(training_corpus):
     tokenizer = ByteLevelBPETokenizer()
     special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
-    tokenizer.train_from_iterator(training_corpus, vocab_size=Config.VOCAB_SIZE, min_frequency=2, special_tokens=special_tokens)
+    tokenizer.train_from_iterator(training_corpus, vocab_size=config.VOCAB_SIZE, min_frequency=2, special_tokens=special_tokens)
     return PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)
 
 def load_tokenizer(repo: str):
@@ -111,11 +134,11 @@ def format_prompts(examples, tokenizer, is_instructional):
 def create_model(tokenizer):
     config = LlamaConfig(
         vocab_size=tokenizer.vocab_size,
-        hidden_size=Config.FACTOR,
-        intermediate_size=Config.FACTOR * 4,
+        hidden_size=config.FACTOR,
+        intermediate_size=config.FACTOR * 4,
         num_hidden_layers=12,
         num_attention_heads=12,
-        max_position_embeddings=Config.MAX_SEQ_LENGTH,
+        max_position_embeddings=config.MAX_SEQ_LENGTH,
         rms_norm_eps=1e-5,
         initializer_range=0.02,
         use_cache=True,
@@ -127,20 +150,7 @@ def create_model(tokenizer):
     return LlamaForCausalLM(config)
 
 def train_model(model, tokenizer, dataset, push_to_hub, is_instructional):
-    config = SFTConfig(
-        output_dir="model",
-        num_train_epochs=Config.EPOCHS,
-        per_device_train_batch_size=Config.BATCH_SIZE,
-        learning_rate=Config.LEARNING_RATE,
-        warmup_steps=Config.WARMUP_STEPS,
-        weight_decay=Config.WEIGHT_DECAY,
-        gradient_accumulation_steps=Config.GRADIENT_ACCUMULATION_STEPS,
-        fp16=Config.FP16,
-        save_steps=int(Config.WARMUP_STEPS * 5),
-        logging_steps=int(Config.WARMUP_STEPS),
-        save_total_limit=2,
-        report_to="none",
-    )
+    config =
     dataset = dataset.map(
         lambda examples: format_prompts(examples, tokenizer, is_instructional),
         batched=True,
@@ -155,7 +165,7 @@ def train_model(model, tokenizer, dataset, push_to_hub, is_instructional):
     train_result = trainer.train()
 
     if push_to_hub:
-        repo_id = Config.OUTPUT_REPO + "-it" if Config.INSTRUCT_FINETUNE_BOOL else Config.OUTPUT_REPO
+        repo_id = config.OUTPUT_REPO + "-it" if config.INSTRUCT_FINETUNE_BOOL else config.OUTPUT_REPO
         trainer.model.push_to_hub(repo_id, commit_message=f"Training loss: {train_result.training_loss:.4f}", force=True)
         trainer.tokenizer.push_to_hub(repo_id, commit_message=f"Training loss: {train_result.training_loss:.4f}", force=True)
     else:
@@ -163,18 +173,18 @@ def train_model(model, tokenizer, dataset, push_to_hub, is_instructional):
         trainer.tokenizer.save_pretrained("tokenizer")
 
 def main():
-    dataset = load_data(Config.INPUT_DATASET, "train", Config.SHARD_SIZE, Config.INIT)
+    dataset = load_data(config.INPUT_DATASET, "train", config.SHARD_SIZE, config.INIT)
     tokenizer = (
-        load_tokenizer(Config.OUTPUT_REPO)
-        if Config.INSTRUCT_FINETUNE_BOOL and Config.INIT > 0
+        load_tokenizer(config.OUTPUT_REPO)
+        if config.INSTRUCT_FINETUNE_BOOL and config.INIT > 0
         else create_tokenizer(get_training_corpus(dataset))
     )
     model = (
         load_model()
-        if Config.INSTRUCT_FINETUNE_BOOL or Config.INIT > 0
+        if config.INSTRUCT_FINETUNE_BOOL or config.INIT > 0
         else create_model(tokenizer)
     )
-    train_model(model, tokenizer, dataset, Config.PUSH_TO_HUB, Config.INSTRUCT_FINETUNE_BOOL)
+    train_model(model, tokenizer, dataset, config.PUSH_TO_HUB, config.INSTRUCT_FINETUNE_BOOL)
 
 if __name__ == "__main__":
     try:
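
A quick arithmetic sketch of the derived values the Config expressions above produce (computed directly from the constants in the diff; these lines are illustrative and not part of train.py):

    BATCH_SIZE = 16
    EPOCHS = 3
    SHARD_SIZE = int(2e+5)                         # 200000
    GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // 4  # 4
    FACTOR = 12 ** 3 // 3                          # 576, used as hidden_size; intermediate_size = 4 * 576 = 2304
    TOTAL_STEPS = (SHARD_SIZE * EPOCHS) // (BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)  # 600000 // 64 = 9375
    WARMUP_STEPS = int(TOTAL_STEPS * 0.1)          # 937
    SAVE_STEPS = int(WARMUP_STEPS * 5)             # 4685, the save_steps passed to SFTConfig
    LOGGING_STEPS = int(WARMUP_STEPS)              # 937, the logging_steps passed to SFTConfig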
 
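For context on how the refactor is consumed: the SFTConfig that train_model() previously built inline now comes from Config._args(), exposed through the getConfig accessor and a module-level config object. Below is a minimal sketch of that pattern, trimmed to two fields, assuming trl's SFTConfig (already used by train.py); the SFTTrainer wiring in the final comment is presumed, since the trainer construction is unchanged context that this diff does not show.

    from trl import SFTConfig

    class Config:
        def __init__(self):
            self.EPOCHS = 3
            self.BATCH_SIZE = 16
            # remaining hyperparameters from the diff omitted for brevity
            self.getConfig = lambda: self._args()

        def _args(self):
            # SFTConfig construction now lives on the class instead of inside train_model()
            return SFTConfig(
                output_dir="model",
                num_train_epochs=self.EPOCHS,
                per_device_train_batch_size=self.BATCH_SIZE,
                report_to="none",
            )

    config = Config().getConfig()  # module-level training arguments, as in the new train.py

    # Presumed wiring (unchanged and not shown in this diff):
    # trainer = SFTTrainer(model=model, args=config, train_dataset=dataset, ...)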