import os
import torch
import trl
from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, TrainingArguments, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
# --- Data source and output target ---------------------------------------
INPUT_DATASET = "nroggendorff/oak"  # dataset id handed to load_dataset()
OUTPUT_REPO = "smallama"  # repo id used when pushing model and tokenizer
PUSH_TO_HUB = True  # forwarded to main() by the script entry point

# --- Tokenizer and model shape -------------------------------------------
VOCAB_SIZE = 32000  # target vocabulary size for BPE training
MAX_SEQ_LENGTH = 512  # position-embedding limit and SFT max sequence length
FACTOR = 4  # base width: hidden size, and divisor basis for layers/heads

# --- Optimisation ---------------------------------------------------------
EPOCHS = 4  # num_train_epochs
BATCH_SIZE = 64  # per-device train batch size
GRADIENT_ACCUMULATION_STEPS = 4  # micro-batches accumulated per update
LEARNING_RATE = 2e-4  # peak learning rate
WARMUP_STEPS = 500  # warmup_steps
DECAY = 0.01  # weight_decay
CLIPPING = 1.0  # max_grad_norm
FP16 = True  # fp16 flag passed to TrainingArguments
def load_data():
    """Return the "train" split of the dataset named by INPUT_DATASET."""
    return load_dataset(INPUT_DATASET, split="train")
# Reserved control tokens. Tokenizer training (create_tokenizer) and the
# special-token map (configure_tokenizer) must name the SAME non-empty
# strings: with empty strings, BOS/EOS/PAD/UNK/MASK all collapse into a single
# empty token and the chat template's bos/eos markers render as nothing.
_BOS_TOKEN = "<s>"
_PAD_TOKEN = "<pad>"
_EOS_TOKEN = "</s>"
_UNK_TOKEN = "<unk>"
_MASK_TOKEN = "<mask>"
_CHAT_TOKENS = ["<|user|>", "<|bot|>", "<|end|>"]


def create_tokenizer(training_corpus):
    """Train a byte-level BPE tokenizer and wrap it as a fast tokenizer.

    Args:
        training_corpus: Iterable of text batches (lists of strings), e.g. the
            generator returned by ``get_training_corpus``.

    Returns:
        A ``PreTrainedTokenizerFast`` backed by the freshly trained tokenizer.
        The control tokens are reserved in the vocabulary during training, so
        ``configure_tokenizer`` can register them without adding new ids.
    """
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(
        training_corpus,
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        special_tokens=[
            _BOS_TOKEN,
            _PAD_TOKEN,
            _EOS_TOKEN,
            _UNK_TOKEN,
            _MASK_TOKEN,
            *_CHAT_TOKENS,
        ],
    )
    return PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer)


def get_training_corpus(dataset, batch_size=1000):
    """Yield the dataset's "text" column in consecutive batches.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a "text" entry.
        batch_size: Number of rows per yielded batch (default 1000).

    Yields:
        The "text" entry of each consecutive slice; the last batch may be
        shorter. Nothing is yielded for an empty dataset.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["text"]


def format_prompts(examples, tokenizer):
    """Re-render raw conversation strings through the tokenizer's chat template.

    Each text is split on "<|end|>"; consecutive segments are paired as
    (user, assistant) turns after removing the "<|user|>" / "<|bot|>" markers.
    A final segment without a partner is ignored.

    Args:
        examples: Batch mapping with a "text" list of raw conversations.
        tokenizer: Object exposing ``apply_chat_template(conversation,
            tokenize=False)``.

    Returns:
        ``{"text": [...]}`` with one templated string per input text.
    """
    texts = []
    for text in examples["text"]:
        segments = text.split("<|end|>")
        conversation = []
        # Even-indexed segments are prompts, odd-indexed ones their replies;
        # zip() drops a trailing segment that has no partner.
        for prompt_part, response_part in zip(segments[::2], segments[1::2]):
            conversation.append(
                {"role": "user", "content": prompt_part.replace("<|user|>", "")}
            )
            conversation.append(
                {"role": "assistant", "content": response_part.replace("<|bot|>", "")}
            )
        texts.append(tokenizer.apply_chat_template(conversation, tokenize=False))
    return {"text": texts}


def create_model(tokenizer):
    """Build an untrained Llama causal LM sized from FACTOR and the tokenizer.

    Args:
        tokenizer: Configured tokenizer supplying the vocabulary size and the
            pad/bos/eos token ids.

    Returns:
        A randomly initialised ``LlamaForCausalLM``.
    """
    config = LlamaConfig(
        # len() counts added tokens as well, whereas ``vocab_size`` reports
        # only the base vocabulary; sizing from len() keeps every token id
        # inside the embedding matrix.
        vocab_size=len(tokenizer),
        hidden_size=FACTOR,
        intermediate_size=FACTOR * 4,
        num_hidden_layers=max(1, FACTOR // 32),
        num_attention_heads=max(1, FACTOR // 64),
        max_position_embeddings=MAX_SEQ_LENGTH,
        rms_norm_eps=1e-6,
        initializer_range=0.02,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        tie_word_embeddings=False,
    )
    return LlamaForCausalLM(config)


def configure_tokenizer(tokenizer):
    """Register special tokens and the chat template on ``tokenizer`` in place.

    Also stores the ids of "<|user|>" and "<|bot|>" on the tokenizer as
    ``user_token_id`` and ``assistant_token_id``.

    Args:
        tokenizer: Tokenizer produced by ``create_tokenizer``.
    """
    tokenizer.add_special_tokens(
        {
            "bos_token": _BOS_TOKEN,
            "eos_token": _EOS_TOKEN,
            "unk_token": _UNK_TOKEN,
            "pad_token": _PAD_TOKEN,
            "mask_token": _MASK_TOKEN,
            "additional_special_tokens": list(_CHAT_TOKENS),
        }
    )
    tokenizer.user_token_id = tokenizer.convert_tokens_to_ids("<|user|>")
    tokenizer.assistant_token_id = tokenizer.convert_tokens_to_ids("<|bot|>")

    # Jinja template: BOS, then strictly alternating user/assistant turns each
    # closed by "<|end|>", then EOS. Any other role or ordering raises.
    tokenizer.chat_template = (
        "{{ bos_token }}"
        "{% for message in messages %}"
        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
        "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
        "{% endif %}"
        "{% if message['role'] == 'user' %}"
        "{{ '<|user|>\n' + message['content'] + '<|end|>\n' }}"
        "{% elif message['role'] == 'assistant' %}"
        "{{ '<|bot|>\n' + message['content'] + '<|end|>\n' }}"
        "{% else %}"
        "{{ raise_exception('Only user and assistant roles are supported!') }}"
        "{% endif %}"
        "{% endfor %}"
        "{{ eos_token }}"
    )
def train_model(model, tokenizer, dataset, push):
    """Fine-tune ``model`` on ``dataset`` with TRL's SFTTrainer and persist it.

    The dataset's "text" column is first re-rendered through the tokenizer's
    chat template (via ``format_prompts``), then used for supervised
    fine-tuning with the module-level hyperparameters.

    Args:
        model: Causal LM to train.
        tokenizer: Tokenizer with a chat template configured.
        dataset: Training dataset exposing a "text" column and ``map``.
        push: When true, upload model and tokenizer to OUTPUT_REPO; otherwise
            save the model under the training output directory and the
            tokenizer under "tokenizer".
    """
    args = TrainingArguments(
        output_dir="model",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        optim="adamw_torch",
        warmup_steps=WARMUP_STEPS,
        weight_decay=DECAY,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        fp16=FP16,
        max_grad_norm=CLIPPING,
    )

    dataset = dataset.map(
        lambda examples: format_prompts(examples, tokenizer),
        batched=True,
    )

    # No explicit optimizer/scheduler is passed: the trainer builds AdamW and
    # a linear warmup/decay schedule from ``args``. The previous hand-built
    # pair sized the schedule as len(dataset) * epochs // accumulation, which
    # ignores the batch size (so the learning rate barely decayed), and
    # created AdamW without ``weight_decay`` (so DECAY was never applied).
    trainer = trl.SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=args,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=MAX_SEQ_LENGTH,
    )
    trainer.train()

    trained_model = trainer.model
    trained_tokenizer = trainer.tokenizer
    if push:
        repo_id = OUTPUT_REPO
        trained_model.push_to_hub(repo_id)
        trained_tokenizer.push_to_hub(repo_id)
    else:
        # Keep the trained weights too; previously only the tokenizer was
        # written and the model was lost when not pushing.
        trained_model.save_pretrained(args.output_dir)
        trained_tokenizer.save_pretrained("tokenizer")
def main(push_to_hub=True):
    """Run the pipeline: load data, build the tokenizer and model, then train.

    Args:
        push_to_hub: Forwarded to ``train_model`` as its ``push`` flag.
    """
    raw_dataset = load_data()

    # The corpus generator is consumed lazily while the tokenizer trains.
    tokenizer = create_tokenizer(get_training_corpus(raw_dataset))
    configure_tokenizer(tokenizer)

    # The model is built after configuration because it reads the
    # tokenizer's vocabulary size and special-token ids.
    llama = create_model(tokenizer)
    train_model(llama, tokenizer, raw_dataset, push_to_hub)
if __name__ == "__main__":
    main(push_to_hub=PUSH_TO_HUB)
    # NOTE(review): raising here makes a successful run end with a traceback
    # and a non-zero exit status. Presumably deliberate (e.g. to force the
    # hosting process to stop) - confirm before replacing with a clean exit.
    raise RuntimeError("The script is finished.")